def fetch_resource(url, tmpfile, api_key):
    response = requests.get(
        url,
        headers={"Authorization": api_key},
        timeout=DOWNLOAD_TIMEOUT,
        verify=SSL_VERIFY,
        stream=True,  # just gets the headers for now
    )
    response.raise_for_status()

    cl = response.headers.get("content-length")
    try:
        if cl and int(cl) > MAX_CONTENT_LENGTH:
            raise util.JobError(
                "Resource too large to download: {cl} > max ({max_cl}).".format(
                    cl=cl, max_cl=MAX_CONTENT_LENGTH))
    except ValueError:
        pass

    length = 0
    for chunk in response.iter_content(CHUNK_SIZE):
        length += len(chunk)
        if length > MAX_CONTENT_LENGTH:
            raise util.JobError(
                "Resource too large to process: {cl} > max ({max_cl}).".format(
                    cl=length, max_cl=MAX_CONTENT_LENGTH))
        tmpfile.write(chunk)
    tmpfile.seek(0)
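
# Hypothetical usage sketch for fetch_resource() above (not part of the
# original source). It relies on the module constants already referenced
# (DOWNLOAD_TIMEOUT, SSL_VERIFY, MAX_CONTENT_LENGTH, CHUNK_SIZE); the URL and
# API key are placeholders.
def _example_fetch_resource():
    with tempfile.NamedTemporaryFile() as tmp:
        fetch_resource(
            "https://ckan.example.org/dataset/d/resource/r/download/data.csv",
            tmp,
            api_key="my-api-key",  # placeholder
        )
        # fetch_resource() rewinds the file, so the content is readable here.
        return tmp.read(CHUNK_SIZE)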
def scan(task_id, payload):
    logger = init_logger(task_id, payload)
    logger.info(f"Starting job {task_id}")

    validate_payload(payload)

    data = payload["metadata"]
    ckan_url = data["ckan_url"]
    resource_id = data["resource_id"]
    api_key = payload.get("api_key")

    scan_result = scan_resource(logger, ckan_url, api_key, resource_id)
    response = {
        "status_code": scan_result.returncode,
        "description": scan_result.stdout.decode("utf-8"),
    }
    if scan_result.returncode not in STATUSES:
        raise util.JobError(json.dumps(response))
    response["status_text"] = STATUSES[scan_result.returncode]
    if scan_result.returncode == 2:
        raise util.JobError(json.dumps(response))

    logger.info(f"Completed scanning resource {resource_id}. Submitting result")
    return response
def convert(file, logger):
    file.seek(0)
    outfile = tempfile.TemporaryFile()
    wrapper_file = codecs.getwriter('utf-8')(outfile)
    events = map(floaten, ijson.parse(file))
    features = ijson.common.items(events, 'features.item')
    writer = None
    for feature in features:
        try:
            if not writer:
                fieldnames = list(feature['properties'].keys())
                writer = csv.DictWriter(wrapper_file, fieldnames=fieldnames,
                                        lineterminator=os.linesep)
                writer.writeheader()
            row = feature['properties']
            writer.writerow(row)
        except KeyError as e:
            logger.exception(e)
            raise util.JobError(
                "GeoJSON feature must have a 'properties' field.")
        except ValueError as e:
            logger.exception(e)
            raise util.JobError(
                "Each GeoJSON feature must have the same properties "
                "in order to convert to a table.")
    if not outfile.tell():
        raise util.JobError("No valid features found in the GeoJSON")
    outfile.seek(0)
    return outfile
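
# Hypothetical usage sketch for convert() above (not part of the original
# source): turn a GeoJSON FeatureCollection into a CSV temp file. The input
# path is a placeholder; a stdlib logger stands in for the job logger.
def _example_convert_geojson():
    logger = logging.getLogger(__name__)
    with open("points.geojson", "rb") as f:  # placeholder path
        csv_file = convert(f, logger)
    # convert() returns a rewound binary temp file containing the CSV.
    return csv_file.read().decode("utf-8")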
def scan_resource(logger, ckan_url, api_key, resource_id):
    try:
        resource = ckan_action("resource_show", ckan_url, api_key,
                               {"id": resource_id})
    except util.JobError:
        # Try again in 5 seconds, just in case CKAN is slow at adding the resource.
        time.sleep(5)
        resource = ckan_action("resource_show", ckan_url, api_key,
                               {"id": resource_id})

    url_type = resource.get("url_type")
    if url_type != "upload":
        raise util.JobError(
            f"Only resources of type 'upload' can be scanned. "
            f"Received '{str(url_type)}'")

    url = resource.get("url")
    scheme = urlsplit(url).scheme
    if scheme not in ("http", "https", "ftp"):
        raise util.JobError(
            "Only http, https, and ftp resources may be fetched.")

    logger.info(f"Fetching from {url}")
    with tempfile.NamedTemporaryFile() as tmp:
        try:
            fetch_resource(url, tmp, api_key)
        except RequestException as e:
            raise util.JobError(str(e))

        logger.info(f"Scanning {tmp.name}")
        try:
            scan_result = scan_file(tmp.name)
        except (subprocess.SubprocessError, subprocess.TimeoutExpired) as e:
            raise util.JobError(str(e))
    return scan_result
def validate_payload(payload):
    if "metadata" not in payload:
        raise util.JobError("Metadata missing")

    metadata = payload["metadata"]
    if "resource_id" not in metadata:
        raise util.JobError("No id provided.")
    if "ckan_url" not in metadata:
        raise util.JobError("No ckan_url provided.")
    if not payload.get("api_key"):
        raise util.JobError("No CKAN API key provided")
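
# A sketch (not in the original source) of the minimal payload shape that
# validate_payload() above accepts, based on the keys it checks; all values
# are placeholders.
_EXAMPLE_PAYLOAD = {
    "api_key": "my-api-key",              # placeholder
    "metadata": {
        "resource_id": "aaaa-bbbb-cccc",  # placeholder
        "ckan_url": "https://ckan.example.org",
    },
}
# validate_payload(_EXAMPLE_PAYLOAD) returns None; any missing key raises
# util.JobError.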
def validate_input(input):
    # Especially validate metadata, which is provided by the user
    if 'metadata' not in input:
        raise util.JobError('Metadata missing')

    data = input['metadata']
    if 'resource_id' not in data:
        raise util.JobError('No id provided.')
    if 'ckan_url' not in data:
        raise util.JobError('No ckan_url provided.')
    if not input.get('api_key'):
        raise util.JobError('No CKAN API key provided')
def datastore_resource_exists(resource_id, api_key, ckan_url):
    try:
        search_url = get_url('datastore_search', ckan_url)
        response = requests.post(search_url,
                                 verify=SSL_VERIFY,
                                 params={'id': resource_id, 'limit': 0},
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': api_key})
        if response.status_code == 404:
            return False
        elif response.status_code == 200:
            return response.json().get('result', {'fields': []})
        else:
            raise HTTPError(
                'Error getting datastore resource.',
                response.status_code,
                search_url,
                response,
            )
    except requests.exceptions.RequestException as e:
        raise util.JobError(
            'Error getting datastore resource ({!s}).'.format(e))
def datastore_resource_exists(resource_id, api_key, ckan_url):
    try:
        search_url = get_url('datastore_search', ckan_url)
        response = requests.post(search_url,
                                 params={'id': resource_id, 'limit': 0},
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': api_key})
        if response.status_code == 404:
            return False
        elif response.status_code == 200:
            return True
        else:
            raise util.JobError('Error getting datastore resource.')
    except requests.exceptions.RequestException:
        raise util.JobError('Error getting datastore resource.')
def echo_raw(task_id, input_):
    if input_['data'].startswith('>'):
        raise util.JobError('Do not start message with >')

    def raw():
        for x in sorted(input_['data']):
            yield x

    return raw
def datastore_resource_exists(resource_id, api_key, ckan_url):
    try:
        search_url = get_url('datastore_search', ckan_url)
        response = requests.post(search_url,
                                 params={'id': resource_id, 'limit': 0},
                                 headers={'Content-Type':
                                          'application/x-www-form-urlencoded',
                                          'Authorization': api_key})
        if response.status_code == 404:
            logging.debug('Resource not found in db, creating')
            return False
        elif response.status_code == 200:
            logging.debug('Resource exists in db')
            return True
        else:
            raise util.JobError('Error getting datastore resource.')
    except requests.exceptions.RequestException:
        raise util.JobError('Error getting datastore resource.')
def download_file(resource, file_format):
    tmpname = None
    if 'SHP' == file_format:
        tmpname = '{0}.{1}'.format(uuid.uuid1(), 'shp.zip')
    elif 'KML' == file_format:
        tmpname = '{0}.{1}'.format(uuid.uuid1(), 'kml')
    elif 'KMZ' == file_format:
        tmpname = '{0}.{1}'.format(uuid.uuid1(), 'kml.zip')
    elif 'GRID' == file_format:
        tmpname = '{0}.{1}'.format(uuid.uuid1(), 'zip')

    if tmpname is None:
        raise util.JobError(
            "Failed to recognize file format extension {0}".format(file_format))

    logger.info('Fetching from: {0}'.format(resource.get('url')))
    try:
        request = urllib2.Request(resource.get('url'))

        if resource.get('url_type') == 'upload':
            request.add_header('Authorization', data['api_key'])

        response = urllib2.urlopen(request, timeout=DOWNLOAD_TIMEOUT)
    except urllib2.HTTPError as e:
        raise HTTPError(
            "SpatialIngestor received a bad HTTP response when trying to "
            "download the data file",
            status_code=e.code,
            request_url=resource.get('url'),
            response=e.read())
    except urllib2.URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise util.JobError(
                'Connection timed out after %ss' % DOWNLOAD_TIMEOUT)
        else:
            raise HTTPError(
                message=str(e.reason),
                status_code=None,
                request_url=resource.get('url'),
                response=None)

    try:
        with open(os.path.join(tempdir, tmpname), 'wb') as out_file:
            out_file.write(response.read())
    except Exception as e:
        raise util.JobError(
            "Failed to copy file to {0} with exception {1}".format(
                os.path.join(tempdir, tmpname), str(e)))

    # The ingest snippet below expects the downloaded file's path back.
    return os.path.join(tempdir, tmpname)
def delete_datastore_resource(resource_id, api_key, ckan_url):
    try:
        delete_url = get_url('datastore_delete', ckan_url)
        response = requests.post(delete_url,
                                 data=json.dumps({'id': resource_id,
                                                  'force': True}),
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': api_key})
        check_response(response, delete_url, 'CKAN',
                       good_status=(201, 200, 404),
                       ignore_no_success=True)
    except requests.exceptions.RequestException:
        raise util.JobError('Deleting existing datastore failed.')
def get_spatial_input_format(resource):
    check_string = resource.get('__extras', {}).get(
        'format',
        resource.get('format', resource.get('url', ''))).upper()

    if any(check_string.endswith(x) for x in ["SHP", "SHAPEFILE"]):
        return 'SHP'
    elif check_string.endswith("KML"):
        return 'KML'
    elif check_string.endswith("KMZ"):
        return 'KMZ'
    elif check_string.endswith("GRID"):
        return 'GRID'
    else:
        raise util.JobError(
            "Failed to determine spatial file type for {0}".format(
                resource.get('url', '')))
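
# Hypothetical check (not in the original source) of the suffix matching done
# by get_spatial_input_format() above, using minimal stand-in resource dicts.
def _example_spatial_format_detection():
    assert get_spatial_input_format({'format': 'shapefile'}) == 'SHP'
    assert get_spatial_input_format({'format': 'KMZ'}) == 'KMZ'
    # With no usable 'format' key, the check falls back to the resource URL:
    assert get_spatial_input_format(
        {'url': 'http://example.org/x.kml'}) == 'KML'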
def validate_input(input):
    # Especially validate metadata, which is provided by the user
    if 'metadata' not in input:
        raise util.JobError('Metadata missing')
    if 'api_key' not in input:
        raise util.JobError('CKAN API key missing')

    required_metadata_keys = {
        'resource_id',
        'ckan_url',
        'postgis',
        'geoserver',
        'geoserver_public_url',
        'target_spatial_formats',
    }
    missing_metadata_keys = required_metadata_keys - set(input['metadata'].keys())
    if missing_metadata_keys:
        raise util.JobError(
            'Missing metadata keys: {0}'.format(missing_metadata_keys))

    required_db_metadata_keys = {
        'db_host',
        'db_name',
        'db_user',
        'db_pass',
    }
    missing_db_metadata_keys = (
        required_db_metadata_keys - set(input['metadata']['postgis'].keys()))
    if missing_db_metadata_keys:
        raise util.JobError(
            'Missing DB metadata keys: {0}'.format(missing_db_metadata_keys))

    required_geoserver_metadata_keys = required_db_metadata_keys
    missing_geoserver_metadata_keys = (
        required_geoserver_metadata_keys
        - set(input['metadata']['geoserver'].keys()))
    if missing_geoserver_metadata_keys:
        raise util.JobError(
            'Missing Geoserver metadata keys: {0}'.format(
                missing_geoserver_metadata_keys))
def check_response(response, request_url, who, good_status=(201, 200),
                   ignore_no_success=False):
    """
    Checks the response and raises exceptions if something went terribly wrong

    :param who: A short name that indicates where the error occurred
        (for example "CKAN")
    :param good_status: Status codes that should not raise an exception
    """
    if not response.status_code:
        raise util.JobError(
            '{who} bad response with no status code at: {url}'.format(
                who=who, url=request_url))

    message = ('{who} bad response. Status code: {code} {reason}. At: {url}. '
               'Response: {resp}')
    try:
        if response.status_code not in good_status:
            json_response = response.json()
            if not ignore_no_success or json_response.get('success'):
                raise util.JobError(
                    message.format(who=who, code=response.status_code,
                                   reason=response.reason, url=request_url,
                                   resp=pprint.pformat(json_response)))
    except ValueError:
        raise util.JobError(
            message.format(who=who, code=response.status_code,
                           reason=response.reason, url=request_url,
                           resp=response.text[:200]))
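
# Hypothetical usage sketch (not in the original source): validate a CKAN API
# reply with check_response() above, tolerating 404 and suppressing errors for
# unsuccessful-but-expected replies, mirroring delete_datastore_resource().
def _example_check_response(response, request_url):
    check_response(response, request_url, 'CKAN',
                   good_status=(200, 404),
                   ignore_no_success=True)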
def get_db_cursor(data):
    db_port = None
    if data['postgis'].get('db_port', '') != '':
        db_port = data['postgis']['db_port']

    try:
        connection = psycopg2.connect(dbname=data['postgis']['db_name'],
                                      user=data['postgis']['db_user'],
                                      password=data['postgis']['db_pass'],
                                      host=data['postgis']['db_host'],
                                      port=db_port)
        connection.set_isolation_level(
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        return connection.cursor(), connection
    except Exception as e:
        raise util.JobError(
            "Failed to connect with PostGIS with error {0}".format(str(e)))
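
# A sketch (not in the original source) of the 'postgis' metadata shape that
# get_db_cursor() above reads; all values are placeholders.
_EXAMPLE_POSTGIS_DATA = {
    'postgis': {
        'db_host': 'localhost',
        'db_port': '5432',   # optional; an empty string means the default port
        'db_name': 'spatial_db',
        'db_user': 'ckan',
        'db_pass': 'secret',
    },
}
# cursor, connection = get_db_cursor(_EXAMPLE_POSTGIS_DATA)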
def ckan_action(action, ckan_url, api_key, payload):
    url = get_url(action, ckan_url)
    try:
        r = requests.post(
            url,
            verify=SSL_VERIFY,
            data=json.dumps(payload),
            headers={
                "Content-Type": "application/json",
                "Authorization": api_key,
            },
        )
        r.raise_for_status()
    except RequestException as e:
        raise util.JobError(f"{str(e)} with payload {json.dumps(payload)}")
    return r.json()["result"]
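
# Hypothetical call sketch (not in the original source): ckan_action() above
# wraps CKAN's action API, so resource_show looks like this. The URL, key, and
# id are placeholders.
def _example_ckan_action():
    resource = ckan_action(
        "resource_show",
        "https://ckan.example.org",
        "my-api-key",
        {"id": "aaaa-bbbb-cccc"},
    )
    return resource.get("url")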
def push_to_datastore(task_id, input, dry_run=False):
    '''Download and parse a resource, pushing its data into CKAN's DataStore.

    An asynchronous job that gets a resource from CKAN, downloads the
    resource's data file and, if the data file has changed since last time,
    parses the data and posts it into CKAN's DataStore.

    :param dry_run: Fetch and parse the data file but don't actually post the
        data to the DataStore, instead return the data headers and rows that
        would have been posted.
    :type dry_run: boolean
    '''
    handler = util.StoringHandler(task_id, input)
    logger = logging.getLogger(task_id)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']
    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource = get_resource(resource_id, ckan_url, api_key)
    except util.JobError:
        # Try again in 5 seconds, just in case CKAN is slow at adding the resource.
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Dump files are managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise util.JobError(
            'Only http, https, and ftp resources may be fetched.')

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    headers = {}
    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get file from private resources
        headers['Authorization'] = api_key
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            verify=SSL_VERIFY,
            stream=True,  # just gets the headers for now
        )
        response.raise_for_status()

        cl = response.headers.get('content-length')
        try:
            if cl and int(cl) > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to download: {cl} > max ({max_cl}).'
                    .format(cl=cl, max_cl=MAX_CONTENT_LENGTH))
        except ValueError:
            pass

        tmp = tempfile.TemporaryFile()
        length = 0
        m = hashlib.md5()
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to process: {cl} > max ({max_cl}).'
                    .format(cl=length, max_cl=MAX_CONTENT_LENGTH))
            tmp.write(chunk)
            m.update(chunk)

        ct = response.headers.get('content-type', '').split(';', 1)[0]
    except requests.HTTPError as e:
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file",
            status_code=e.response.status_code,
            request_url=url,
            response=e.response.content)
    except requests.RequestException as e:
        raise HTTPError(
            message=str(e),
            status_code=None,
            request_url=url,
            response=None)

    file_hash = m.hexdigest()
    tmp.seek(0)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info("The file hash hasn't changed: {hash}.".format(
            hash=file_hash))
        return

    resource['hash'] = file_hash

    try:
        table_set = messytables.any_tableset(tmp, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        # try again with format
        tmp.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(tmp, mimetype=format,
                                                 extension=format)
        except Exception:
            raise util.JobError(e)

    get_row_set = web.app.config.get('GET_ROW_SET',
                                     lambda table_set: table_set.tables.pop())
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)

    existing = datastore_resource_exists(resource_id, api_key, ckan_url)
    existing_info = None
    if existing:
        existing_info = dict((f['id'], f['info'])
                             for f in existing.get('fields', [])
                             if 'info' in f)

    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
        }.get(existing_info.get(h, {}).get('type_override'), t)
            for t, h in zip(types, headers)]

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                if isinstance(cell.value, str):
                    try:
                        data_row[column_name] = cell.value.encode(
                            'latin-1').decode('utf-8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        data_row[column_name] = cell.value
                else:
                    data_row[column_name] = cell.value
            yield data_row
    result = row_iterator()

    '''
    Delete existing datastore resource before proceeding. Otherwise
    'datastore_create' will append to the existing datastore. And if the
    fields have significantly changed, it may also fail.
    '''
    if existing:
        logger.info('Deleting "{res_id}" from datastore.'.format(
            res_id=resource_id))
        delete_datastore_resource(resource_id, api_key, ckan_url)

    headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
                     for field in zip(headers, types)]

    # Maintain data dictionaries from matching column names
    if existing_info:
        for h in headers_dicts:
            if h['id'] in existing_info:
                h['info'] = existing_info[h['id']]
                # create columns with types user requested
                type_override = existing_info[h['id']].get('type_override')
                if type_override in list(_TYPE_MAPPING.values()):
                    h['type'] = type_override

    logger.info('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    if dry_run:
        return headers_dicts, result

    count = 0
    for i, chunk in enumerate(chunky(result, 250)):
        records, is_it_the_last_chunk = chunk
        count += len(records)
        logger.info('Saving chunk {number} {is_last}'.format(
            number=i, is_last='(last)' if is_it_the_last_chunk else ''))
        send_resource_to_datastore(resource, headers_dicts, records,
                                   is_it_the_last_chunk, api_key, ckan_url)

    logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
        n=count, res_id=resource_id))

    if data.get('set_url_type', False):
        update_resource(resource, api_key, ckan_url)
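
# chunky() is called by push_to_datastore() above but not defined in this
# excerpt; here is a minimal sketch consistent with that call site (it yields
# (records, is_last_chunk) pairs of at most n records), not necessarily the
# original implementation.
import itertools

def chunky(iterable, n):
    it = iter(iterable)
    chunk = list(itertools.islice(it, n))
    while chunk:
        next_chunk = list(itertools.islice(it, n))
        # The current chunk is the last one exactly when nothing follows it.
        yield chunk, not next_chunk
        chunk = next_chunk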
def failing(task_id, input_):
    time.sleep(0.1)
    raise util.JobError('failed')
def example(task_id, input_):
    if 'time' not in input_['data']:
        raise util.JobError('time not in input')

    time.sleep(input_['data']['time'])
    return 'Slept for ' + str(input_['data']['time']) + ' seconds.'
def echo(task_id, input_):
    if input_['data'].startswith('>'):
        raise util.JobError('Do not start message with >')
    if input_['data'].startswith('#'):
        raise Exception('Something went totally wrong')
    return '>' + input_['data']
tempdir = tempfile.mkdtemp()
try:
    native_crs = "EPSG:4326"
    unzip_dir = None
    base_filepath = download_file(parent_resource, input_format)

    # Do we need to unzip?
    if input_format in ["KMZ", "SHP", "GRID"]:
        try:
            zpf = zipfile.ZipFile(base_filepath)
            unzip_dir = unzip_file(zpf, base_filepath)
        except Exception:
            raise util.JobError(
                "{0} is not a valid zip file".format(base_filepath))

        # Flatten the zip file
        for root, dirs, files in os.walk(unzip_dir):
            for sub_dir in dirs:
                from_dir = os.path.join(root, sub_dir)
                for f in getfiles(from_dir):
                    filename = f.split('/')[-1]
                    if os.path.isfile(os.path.join(unzip_dir, filename)):
                        filename = f.replace(from_dir, "", 1).replace("/", "_")
                    shutil.copy(f, os.path.join(unzip_dir, filename))
                shutil.rmtree(from_dir)

        for f in os.listdir(unzip_dir):
            if f.lower().endswith(".kml"):
                kml_file = os.path.join(unzip_dir, f)
    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get file from private resources
        request.add_header('Authorization', api_key)

    response = urllib2.urlopen(request, timeout=DOWNLOAD_TIMEOUT)
except urllib2.HTTPError as e:
    raise HTTPError(
        "DataPusher received a bad HTTP response when trying to download "
        "the data file",
        status_code=e.code,
        request_url=resource.get('url'),
        response=e.read())
except urllib2.URLError as e:
    if isinstance(e.reason, socket.timeout):
        raise util.JobError('Connection timed out after %ss' % DOWNLOAD_TIMEOUT)
    else:
        raise HTTPError(
            message=str(e.reason),
            status_code=None,
            request_url=resource.get('url'),
            response=None)

cl = response.info().getheader('content-length')
if cl and int(cl) > MAX_CONTENT_LENGTH:
    raise util.JobError(
        'Resource too large to download: {cl} > max ({max_cl}).'.format(
            cl=cl, max_cl=MAX_CONTENT_LENGTH))

ct = response.info().getheader('content-type').split(';', 1)[0]

file_content = response.read()
    resource = get_resource(resource_id, ckan_url, api_key)
except util.JobError:
    # Try again in 5 seconds, just in case CKAN is slow at adding the resource.
    time.sleep(5)
    resource = get_resource(resource_id, ckan_url, api_key)

# check if the resource url_type is a datastore
if resource.get('url_type') == 'datastore':
    logger.info('Dump files are managed with the Datastore API')
    return

# check scheme
url = resource.get('url')
scheme = urlparse.urlsplit(url).scheme
if scheme not in ('http', 'https', 'ftp'):
    raise util.JobError(
        'Only http, https, and ftp resources may be fetched.')

# fetch the resource data
logger.info('Fetching from: {0}'.format(url))
headers = {}
if resource.get('url_type') == 'upload':
    # If this is an uploaded file to CKAN, authenticate the request,
    # otherwise we won't get file from private resources
    headers['Authorization'] = api_key
try:
    response = requests.get(
        url,
        headers=headers,
        timeout=DOWNLOAD_TIMEOUT,
        verify=SSL_VERIFY,
        stream=True,  # just gets the headers for now
    )
    # Try again in 5 seconds, just in case CKAN is slow at adding the resource.
    time.sleep(5)
    resource = get_resource(resource_id, ckan_url, api_key)

# check if the resource url_type is a datastore
if resource.get('url_type') == 'datastore':
    logger.info('Dump files are managed with the Datastore API')
    return

# fetch the resource data
logger.info('Fetching from: {0}'.format(resource.get('url')))
try:
    request = urllib2.Request(resource.get('url'))

    if request.get_type().lower() not in ('http', 'https', 'ftp'):
        raise util.JobError(
            'Only http, https, and ftp resources may be fetched.')

    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get file from private resources
        request.add_header('Authorization', api_key)

    response = urllib2.urlopen(request, timeout=DOWNLOAD_TIMEOUT)
except urllib2.HTTPError as e:
    raise HTTPError(
        "DataPusher received a bad HTTP response when trying to download "
        "the data file",
        status_code=e.code,
        request_url=resource.get('url'),
        response=e.read())
except urllib2.URLError as e:
def echo(task_id, input):
    if input['data'].startswith('>'):
        raise util.JobError('do not start message with >')
    if input['data'].startswith('#'):
        raise Exception('serious exception')
    return '>' + input['data']
def spatial_ingest(task_id, input):
    handler = util.StoringHandler(task_id, input)
    logger = logging.getLogger(task_id)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']
    data['api_key'] = input['api_key']

    logger.info('Retrieving resource information')
    resource = ckan_command('resource_show', {'id': data['resource_id']}, data)

    logger.info('Retrieving package information')
    package = ckan_command('package_show', {'id': resource['package_id']}, data)

    logger.info('Purging any legacy spatial ingestor assets')
    # Make sure there are no legacy resources or artifacts
    purge_legacy_spatial(data, package, logger)

    # Get package data again in case another thread deleted some legacy resources
    package = ckan_command('package_show', {'id': resource['package_id']}, data)

    # We have an ingestible resource that has been updated, passing all
    # blacklist checks, and we have potential resources for creation.
    logger.info('Setting up PostGIS table for spatial assets')
    table_name = setup_spatial_table(data, resource)

    # Determine input format
    logger.info('Determining input format for resource')
    input_format = get_spatial_input_format(resource)

    # Ingest into DB and exit if this fails for whatever reason
    logger.info('Ingesting spatial asset into PostGIS DB')
    native_crs = db_upload(data, resource, input_format, table_name, logger)

    # Create Geoserver assets for PostGIS table
    logger.info('Creating Geoserver assets for PostGIS table')
    workspace, layer, bbox_obj = geoserver_transfer(data, package, input_format,
                                                    native_crs, table_name,
                                                    logger)

    # Figure out if any target formats are available to be expanded into.
    # I.e. if a resource of a target format already exists and is _not_
    # last modified by the spatial ingestor user, we do not add/update the
    # resource for that format.
    expansion_formats = get_spatial_upload_formats(data, package, input_format)
    if not expansion_formats:
        raise util.JobError(
            "Package {0} has no available formats to expand into".format(
                package['name']))

    logger.info("Creating CKAN resources for new Geoserver assets")
    num_update = create_or_update_resources(data, package, resource, bbox_obj,
                                            expansion_formats, layer,
                                            workspace, logger)

    logger.info("{0} resources successfully created/updated".format(num_update))