def validate_input(input):
    """Check the job dict carries everything a load job needs.

    Especially validate metadata which is provided by the user.

    Raises JobError when 'metadata' is absent, when the metadata lacks
    'resource_id' or 'ckan_url', or when no 'api_key' is supplied.
    """
    if 'metadata' not in input:
        raise JobError('Metadata missing')

    metadata = input['metadata']

    # Required metadata fields, paired with the error reported when absent.
    required = (
        ('resource_id', 'No id provided.'),
        ('ckan_url', 'No ckan_url provided.'),
    )
    for field, error_message in required:
        if field not in metadata:
            raise JobError(error_message)

    if not input.get('api_key'):
        raise JobError('No CKAN API key provided')
# NOTE(review): this region is a garbled interleaving of at least two
# historical revisions of the same job function.  Statements repeat, two
# `except` clauses use Python 2-only `except X, e` syntax, one
# requests.get() call is cut off mid-argument-list, and the region ends in
# the middle of a statement.  It cannot be one valid function.  A complete,
# coherent definition of this same function appears later in the file;
# treat everything here as dead text to be reconciled against version
# control, not as live code.
def xloader_data_into_datastore_(input, job_dict):
    '''This function:
    * downloads the resource (metadata) from CKAN
    * downloads the data
    * calls the loader to load the data into DataStore
    * calls back to CKAN with the new status
    (datapusher called this function 'push_to_datastore')
    '''
    job_id = get_current_job().id
    db.init(config)

    # Store details of the job in the db
    try:
        db.add_pending_job(job_id, **input)
    except sa.exc.IntegrityError:
        raise JobError('job_id {} already exists'.format(job_id))

    # Set-up logging to the db
    handler = StoringHandler(job_id, input)
    level = logging.DEBUG
    handler.setLevel(level)
    logger = logging.getLogger(job_id)
    handler.setFormatter(logging.Formatter('%(message)s'))
    logger.addHandler(handler)
    # also show logs on stderr
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']
    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource, dataset = get_resource_and_dataset(resource_id)
    # NOTE(review): Python 2-only syntax; in Python 3 this must be
    # `except JobError as e:` (and `e` is unused anyway).
    except JobError, e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource, dataset = get_resource_and_dataset(resource_id)
    # NOTE(review): fragment boundary - the line below repeats the retry
    # statement above and then continues with a different revision of the
    # download logic.
    resource, dataset = get_resource_and_dataset(resource_id)
    resource_ckan_url = '/dataset/{}/resource/{}' \
        .format(dataset['name'], resource['id'])
    logger.info('Express Load starting: {}'.format(resource_ckan_url))

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Ignoring resource - url_type=datastore - dump files are '
                    'managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlparse.urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise JobError('Only http, https, and ftp resources may be fetched.')

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    tmp_file = get_tmp_file(url)
    length = 0
    m = hashlib.md5()
    cl = None
    try:
        headers = {}
        if resource.get('url_type') == 'upload':
            # If this is an uploaded file to CKAN, authenticate the request,
            # otherwise we won't get file from private resources
            headers['Authorization'] = api_key
        response = get_response(url, headers)
    # NOTE(review): fragment boundary - the `except` below belongs to an
    # older revision that fetched the resource with get_resource() and
    # downloaded it with requests.get() directly; it repeats the
    # url_type/scheme checks already performed above, and again uses the
    # Python 2-only `except X, e` syntax.
    except JobError, e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Ignoring resource - url_type=datastore - dump files are '
                    'managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlparse.urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise JobError('Only http, https, and ftp resources may be fetched.')

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    try:
        headers = {}
        if resource.get('url_type') == 'upload':
            # If this is an uploaded file to CKAN, authenticate the request,
            # otherwise we won't get file from private resources
            headers['Authorization'] = api_key
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            verify=SSL_VERIFY,
        # NOTE(review): the requests.get() call above is cut off
        # mid-argument-list; the line below restarts the same call in yet
        # another variant (no SSL verify argument).
        response = requests.get(resource.get('url'), headers=headers,
                                timeout=DOWNLOAD_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.HTTPError as error:
        # status code error
        logger.error('HTTP error: {}'.format(error))
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file", status_code=error.response.status_code,
            request_url=resource.get('url'), response=error)
    except requests.exceptions.Timeout:
        logger.error('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
        raise JobError(
            'Connection timed out after {}s'.format(DOWNLOAD_TIMEOUT))
    except requests.exceptions.RequestException as e:
        try:
            err_message = str(e.reason)
        except AttributeError:
            err_message = str(e)
        logger.error('URL error: {}'.format(err_message))
        raise HTTPError(
            message=err_message, status_code=None,
            request_url=resource.get('url'), response=None)
    logger.info('Downloaded ok')
    cl = response.headers.get('content-length')
    if cl and int(cl) > MAX_CONTENT_LENGTH:
        # NOTE(review): this fragment ends mid-statement on a line
        # continuation; the rest of this revision is missing here.
        error_msg = 'Resource too large to download: {cl} > max ({max_cl}).'\
def _download_resource_data(resource, data, api_key, logger):
    '''Downloads the resource['url'] as a tempfile.

    :param resource: resource (i.e. metadata) dict (from the job dict)
    :param data: job dict - may be written to during this function
    :param api_key: CKAN api key - needed to obtain resources that are
        private
    :param logger: logger the download progress/warnings are written to

    :returns: (tmp_file, file_hash) - the open tempfile, seeked back to the
        start, and the md5 hex digest of the bytes written to it

    If the download is bigger than MAX_CONTENT_LENGTH then it just
    downloads a excerpt (of MAX_EXCERPT_LINES) for preview, and flags it by
    setting data['datastore_contains_all_records_of_source_file'] = False
    which will be saved to the resource later on.
    '''
    # check scheme
    url = resource.get('url')
    scheme = urlparse.urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise JobError(
            'Only http, https, and ftp resources may be fetched.'
        )

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    tmp_file = get_tmp_file(url)
    length = 0          # bytes downloaded so far
    m = hashlib.md5()   # running hash of the downloaded bytes
    cl = None           # Content-Length header value, if the server sent one
    try:
        headers = {}
        if resource.get('url_type') == 'upload':
            # If this is an uploaded file to CKAN, authenticate the request,
            # otherwise we won't get file from private resources
            headers['Authorization'] = api_key
        response = get_response(url, headers)

        cl = response.headers.get('content-length')
        if cl and int(cl) > MAX_CONTENT_LENGTH:
            # declared size is already over the limit - bail out before
            # downloading anything
            raise DataTooBigError()

        # download the file to a tempfile on disk
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                # catches servers that omit (or lie about) Content-Length
                raise DataTooBigError
            tmp_file.write(chunk)
            m.update(chunk)
        data['datastore_contains_all_records_of_source_file'] = True

    except DataTooBigError:
        tmp_file.close()
        message = 'Data too large to load into Datastore: ' \
            '{cl} bytes > max {max_cl} bytes.' \
            .format(cl=cl or length, max_cl=MAX_CONTENT_LENGTH)
        logger.warning(message)
        if MAX_EXCERPT_LINES <= 0:
            raise JobError(message)
        logger.info('Loading excerpt of ~{max_lines} lines to '
                    'DataStore.'
                    .format(max_lines=MAX_EXCERPT_LINES))

        # re-download, keeping only the first MAX_EXCERPT_LINES lines (or
        # MAX_CONTENT_LENGTH bytes, whichever limit is hit first); hash and
        # length are restarted for the excerpt
        tmp_file = get_tmp_file(url)
        response = get_response(url, headers)
        length = 0
        line_count = 0
        m = hashlib.md5()
        for line in response.iter_lines(CHUNK_SIZE):
            # NOTE(review): on Python 3 iter_lines() yields bytes, so
            # `line + '\n'` would raise TypeError - confirm this file runs
            # under Python 2 (it contains `except X, e` syntax elsewhere).
            tmp_file.write(line + '\n')
            m.update(line)
            length += len(line)
            line_count += 1
            if length > MAX_CONTENT_LENGTH or line_count >= MAX_EXCERPT_LINES:
                break
        data['datastore_contains_all_records_of_source_file'] = False
    except requests.exceptions.HTTPError as error:
        # status code error
        logger.debug('HTTP error: {}'.format(error))
        raise HTTPError(
            "Xloader received a bad HTTP response when trying to download "
            "the data file", status_code=error.response.status_code,
            request_url=url, response=error)
    except requests.exceptions.Timeout:
        logger.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
        raise JobError('Connection timed out after {}s'.format(
            DOWNLOAD_TIMEOUT))
    except requests.exceptions.RequestException as e:
        # catch-all for connection-level errors; some have a .reason with a
        # more specific message, others do not
        try:
            err_message = str(e.reason)
        except AttributeError:
            err_message = str(e)
        logger.warning('URL error: {}'.format(err_message))
        raise HTTPError(
            message=err_message, status_code=None,
            request_url=url, response=None)

    logger.info('Downloaded ok - %s', printable_file_size(length))
    file_hash = m.hexdigest()
    # rewind so the caller can read the file from the beginning
    tmp_file.seek(0)

    return tmp_file, file_hash
def xloader_data_into_datastore_(input, job_dict):
    '''This function:
    * downloads the resource (metadata) from CKAN
    * downloads the data
    * calls the loader to load the data into DataStore
    * calls back to CKAN with the new status
    (datapusher called this function 'push_to_datastore')

    :param input: job dict containing 'metadata' (with 'resource_id' and
        'ckan_url'), 'api_key' and 'result_url'
    :param job_dict: status dict reported back to CKAN via
        callback_xloader_hook
    :raises JobError: if the job_id already exists, the input is invalid,
        or the data could not be loaded
    '''
    job_id = get_current_job().id
    db.init(config)

    # Store details of the job in the db
    try:
        db.add_pending_job(job_id, **input)
    except sa.exc.IntegrityError:
        raise JobError('job_id {} already exists'.format(job_id))

    # Set-up logging to the db
    handler = StoringHandler(job_id, input)
    level = logging.DEBUG
    handler.setLevel(level)
    logger = logging.getLogger(job_id)
    handler.setFormatter(logging.Formatter('%(message)s'))
    logger.addHandler(handler)
    # also show logs on stderr
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']
    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource, dataset = get_resource_and_dataset(resource_id)
    except (JobError, ObjectNotFound):
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource, dataset = get_resource_and_dataset(resource_id)
    resource_ckan_url = '/dataset/{}/resource/{}' \
        .format(dataset['name'], resource['id'])
    logger.info('Express Load starting: {}'.format(resource_ckan_url))

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Ignoring resource - url_type=datastore - dump files are '
                    'managed with the Datastore API')
        return

    # download resource
    tmp_file, file_hash = _download_resource_data(resource, data, api_key,
                                                  logger)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        # FIX: close the downloaded tempfile on this early-return path too,
        # rather than leaking the open file handle
        tmp_file.close()
        logger.info('Ignoring resource - the file hash hasn\'t changed: '
                    '{hash}.'.format(hash=file_hash))
        return
    logger.info('File hash: {}'.format(file_hash))
    resource['hash'] = file_hash

    def direct_load():
        # Fast path: COPY the CSV straight into the DataStore, make the data
        # visible to users as soon as it is loaded, then build indexes.
        fields = loader.load_csv(
            tmp_file.name,
            resource_id=resource['id'],
            resource_alias=resource['name'],
            mimetype=resource.get('format'),
            logger=logger)
        loader.calculate_record_count(
            resource_id=resource['id'], logger=logger)
        set_datastore_active(data, resource, logger)
        # report the intermediate state back to CKAN so users can already
        # browse the data while indexes are still being created
        job_dict['status'] = 'running_but_viewable'
        callback_xloader_hook(result_url=input['result_url'],
                              api_key=api_key,
                              job_dict=job_dict)
        logger.info('Data now available to users: {}'.format(
            resource_ckan_url))
        loader.create_column_indexes(
            fields=fields,
            resource_id=resource['id'],
            logger=logger)
        update_resource(resource={'id': resource['id'],
                                  'hash': resource['hash']},
                        patch_only=True)
        logger.info('File Hash updated for resource: {}'.format(
            resource['hash']))

    def messytables_load():
        # Slow path: parse the file with messytables (handles formats and
        # malformed files that the COPY path cannot).
        try:
            loader.load_table(tmp_file.name,
                              resource_id=resource['id'],
                              resource_alias=resource['name'],
                              mimetype=resource.get('format'),
                              logger=logger)
        except JobError as e:
            logger.error('Error during messytables load: {}'.format(e))
            raise
        loader.calculate_record_count(
            resource_id=resource['id'], logger=logger)
        set_datastore_active(data, resource, logger)
        logger.info('Finished loading with messytables')
        update_resource(resource={'id': resource['id'],
                                  'hash': resource['hash']},
                        patch_only=True)
        logger.info('File Hash updated for resource: {}'.format(
            resource['hash']))

    # Load it
    logger.info('Loading CSV')
    just_load_with_messytables = asbool(config.get(
        'ckanext.xloader.just_load_with_messytables', False))
    logger.info("'Just load with messytables' mode is: {}".format(
        just_load_with_messytables))
    try:
        if just_load_with_messytables:
            messytables_load()
        else:
            try:
                direct_load()
            except JobError as e:
                logger.warning('Load using COPY failed: {}'.format(e))
                logger.info('Trying again with messytables')
                messytables_load()
    except FileCouldNotBeLoadedError as e:
        logger.warning('Loading excerpt for this format not supported.')
        logger.error('Loading file raised an error: {}'.format(e))
        raise JobError('Loading file raised an error: {}'.format(e))
    finally:
        # FIX: close the tempfile even when loading fails; previously it was
        # only closed after a fully successful load, leaking the file handle
        # (and the on-disk tempfile) on every error path
        tmp_file.close()

    logger.info('Express Load completed')