def main(**kwargs):
    selected_job_codes = kwargs.get('selected_job_codes', [])
    use_local_files = kwargs.get('use_local_files', False)
    clear_first = kwargs.get('clear_first', False)
    wipe_data = kwargs.get('wipe_data', False)
    migrate_schema = kwargs.get('migrate_schema', False)
    test_mode = kwargs.get('test_mode', False)
    if selected_job_codes == []:
        selected_jobs = [Job(job_dict) for job_dict in job_dicts]
    else:
        selected_jobs = select_jobs_by_code(selected_job_codes, job_dicts)
    # [ ] Add script-level post-processing here, allowing the data.json file of an ArcGIS
    # server to be searched for unharvested tables.
    for job in selected_jobs:
        kwparameters = dict(kwargs)
        package_id = get_package_id(job, test_mode)  # This still needs to be tested.
        resource_id = find_resource_id(package_id, job.resource_name)
        if migrate_schema and 'ckan' in job.destinations:
            # Delete the Data Table view to avoid new fields not getting added to an existing view.
            delete_datatable_views(resource_id)
            # Is this really necessary, though? In etl_util.py, migrate_schema being True already
            # forces clear_first to be True, which should delete all the views.
            # The scenario of concern is when the schema changes by eliminating a field: it's not
            # clear whether CKAN supports dropping a field from the schema and auto-dropping that
            # field from the table while preserving the other values.
            print("Note that setting migrate_schema = True is going to clear the associated datastore.")
        if (clear_first or migrate_schema) and 'ckan' in job.destinations:
            # The target is a CKAN resource whose datastore is being cleared.
            # [ ] Maybe add a check to see whether an integrated data dictionary exists.
            data_dictionary = get_data_dictionary(resource_id)  # If so, obtain it.
            # Save it to a local file as a backup.
            data_dictionary_filepath = save_to_waiting_room(data_dictionary, resource_id, job.resource_name)
            # wipe_data should preserve the data dictionary when the schema stays the same, and
            # migrate_schema should be used to change the schema while trying to preserve the data dictionary.
            # If migrate_schema == True: 1) back up the data dictionary, 2) delete the Data Table view,
            # 3) clear the datastore, 4) run the job, and 5) try to restore the data dictionary.
            # Or could we overload "wipe_data" to include schema migration?
            # [ ] Also, it really seems that always_clear_first should become always_wipe_data.
        locators_by_destination = job.process_job(**kwparameters)
        for destination, table_locator in locators_by_destination.items():
            if destination == 'ckan':
                post_process(locators_by_destination[destination])
                if clear_first or migrate_schema:
                    # [ ] Should the data dictionary definitely be restored if clear_first = True?
                    results = set_data_dictionary(resource_id, data_dictionary)
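
# --- Hedged sketch (not part of the original module): one plausible implementation of the
# get_data_dictionary()/set_data_dictionary() helpers used in main(), assuming they wrap
# CKAN's datastore API, where per-field data-dictionary entries live in each field's 'info'
# dict. The `site` and `API_key` names are assumed to match the module-level configuration;
# the `_sketch` suffix marks these as illustrations, not the project's actual helpers.
def get_data_dictionary_sketch(resource_id):
    import ckanapi
    ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
    # datastore_search with limit=0 returns the field definitions without fetching any rows.
    results = ckan.action.datastore_search(resource_id=resource_id, limit=0)
    return results['fields']  # Each field dict may carry an 'info' data-dictionary entry.

def set_data_dictionary_sketch(resource_id, old_fields):
    import ckanapi
    ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
    # Re-attach the saved 'info' entries to whichever fields still exist after the reload.
    old_info_by_id = {f['id']: f.get('info') for f in old_fields if f['id'] != '_id'}
    current_fields = ckan.action.datastore_search(resource_id=resource_id, limit=0)['fields']
    new_fields = []
    for f in current_fields:
        if f['id'] == '_id':
            continue
        field = {'id': f['id'], 'type': f['type']}
        if old_info_by_id.get(f['id']) is not None:
            field['info'] = old_info_by_id[f['id']]
        new_fields.append(field)
    # Calling datastore_create on an existing resource updates its field metadata in place.
    return ckan.action.datastore_create(resource_id=resource_id, fields=new_fields, force=True)
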
def express_load_then_delete_file(job, **kwparameters):
    """The basic idea is that the job processes with a 'file' destination, so the ETL job
    loads the file into destination_file_path. Then, as a custom post-processing step, that
    file is Express-Loaded. This is faster (particularly for large files) and avoids 504
    errors and unneeded API requests."""
    # Eventually this function should be moved either to etl_util.py or
    # more likely the pipeline framework. In either case, this approach
    # can be formalized, either as a destination or upload method and
    # possibly implemented as a loader (CKANExpressLoader).
    if kwparameters['use_local_output_file']:
        return
    if kwparameters['test_mode']:
        job.package_id = TEST_PACKAGE_ID
    ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
    csv_file_path = job.destination_file_path
    resource_id = find_resource_id(job.package_id, job.resource_name)
    if resource_id is None:
        # If the resource does not already exist, create it.
        print(f"Unable to find a resource with name '{job.resource_name}' in package with ID {job.package_id}.")
        print(f"Creating new resource, and uploading CSV file {csv_file_path} to resource with name '{job.resource_name}' in package with ID {job.package_id}.")
        resource_as_dict = ckan.action.resource_create(package_id=job.package_id,
                                                       name=job.resource_name,
                                                       upload=open(csv_file_path, 'r'))
    else:
        print(f"Uploading CSV file {csv_file_path} to resource with name '{job.resource_name}' in package with ID {job.package_id}.")
        resource_as_dict = ckan.action.resource_patch(id=resource_id,
                                                      upload=open(csv_file_path, 'r'))
        # Running resource_update once sets the file to the correct file and triggers some
        # datastore action and the Express Loader, but for some reason it seems to process the
        # old file. So instead, run resource_patch (which just sets the file) and then run
        # resource_update.
        #resource_as_dict = ckan.action.resource_update(id=resource_id)
        resource_as_dict = ckan.action.resource_update(id=resource_id,
                                                       upload=open(csv_file_path, 'r'))
    print(f"Removing temp file at {csv_file_path}")
    os.remove(csv_file_path)
    # Since launchpad.py doesn't update the last_etl_update metadata value in this case
    # (because this is a workaround), do it manually here:
    post_process(resource_id, job, **kwparameters)
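
# --- Hedged usage sketch (assumption, not the confirmed job format): how a job might route a
# large table through express_load_then_delete_file by writing the ETL output to a local file
# and then Express-Loading it as a custom post-processing step. The key names below
# ('destination', 'destination_file', 'custom_post_processing') are illustrative and have not
# been verified against the job_dict format that launchpad.py actually consumes.
example_express_load_job_dict = {
    'source_file': 'big_table.csv',
    'resource_name': 'Big Table',
    'destination': 'file',                                    # Let the ETL framework write a local CSV...
    'destination_file': 'big_table.csv',
    'custom_post_processing': express_load_then_delete_file,  # ...then Express-Load and delete that CSV.
}
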
def process_job(**kwparameters):
    job = kwparameters['job']
    use_local_files = kwparameters['use_local_files']
    clear_first = kwparameters['clear_first']
    test_mode = kwparameters['test_mode']
    job.default_setup(use_local_files)  # [ ] Check whether this process_job function can be put into standard form.
    job.loader_config_string = 'production'
    if OVERRIDE_GEOCODING:  # This part may not convert well to the Job class approach.
        job.target = '/Users/drw/WPRDC/etl/rocket-etl/archives/previously-geocoded-restaurants.csv'
        job.source_connector = pl.FileConnector
        job.source_type = 'local'
        job.connector_config_string = ''
        print("Using local archive file: {}".format(job.target))
    elif use_local_files:
        job.target = SOURCE_DIR + job.source_file
    else:
        job.target = job.source_dir + "/" + job.source_file
    package_id = job.package if not test_mode else TEST_PACKAGE_ID
    print("==============\n {} in package {}".format(job.resource_name, package_id))
    if clear_first:
        print("Clearing the datastore for {}".format(job.resource_name))
    # Upload data to the datastore.
    print('Uploading tabular data...')
    curr_pipeline = pl.Pipeline(job.resource_name + ' pipeline', job.resource_name + ' Pipeline',
                                log_status=False, chunk_size=1000, settings_file=SETTINGS_FILE) \
        .connect(job.source_connector, job.target, config_string=job.connector_config_string, encoding=job.encoding) \
        .extract(pl.CSVExtractor, firstline_headers=True) \
        .schema(job.schema) \
        .load(pl.CKANDatastoreLoader, job.loader_config_string,
              #fields=schema().serialize_to_ckan_fields(),
              fields=job.schema().serialize_to_ckan_fields(capitalize=False),
              key_fields=job.primary_key_fields,
              package_id=package_id,
              resource_name=job.resource_name,
              clear_first=clear_first,
              method=job.upload_method).run()

    resource_id = find_resource_id(package_id, job.resource_name)
    assert len(job.destinations) == 1
    locators_by_destination = {job.destinations[0]: resource_id}
    return locators_by_destination
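
# --- Hedged sketch (assumption): an example of the job parameters that process_job() consumes,
# inferred from the attribute accesses above. Key names mirror those accesses; the values are
# placeholders, and the schema is left as None here because the real schema classes (which must
# provide serialize_to_ckan_fields()) are defined elsewhere in this repo.
example_job_dict = {
    'source_dir': 'https://data.example.com/exports',  # Hypothetical remote source location.
    'source_file': 'restaurants.csv',
    'encoding': 'utf-8-sig',
    'schema': None,                      # A real job supplies a schema class here.
    'primary_key_fields': ['facility_id'],
    'upload_method': 'upsert',           # Passed through to the CKANDatastoreLoader as `method`.
    'destinations': ['ckan'],            # process_job() asserts that exactly one destination is given.
    'package': 'some-ckan-package-id',   # Replaced by TEST_PACKAGE_ID when test_mode is True.
    'resource_name': 'Restaurant Inspections',
}
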
def get_extant_time_range(job, **kwparameters):
    # if 'ckan' in job.destinations: # This is a strong argument for making each job_dict
    # have exactly one source and one destination and using job molecules or
    # chains to support multiple destinations (somehow).
    ## JOB CHAINS: 1) Support building up more complicated processes (often
    ## represented by "directed acyclic graphs") by chaining job atoms.
    ## 2) More coherently support multiple destinations at this level by
    ## calling the same job atom twice, with different destinations.
    ## I can imagine that a job atom could be the basis for a superjob
    ## which could replace multiple fields with lists, and some
    ## launchpad logic would iterate over these lists
    ## like
    ##     destination = ['file', 'ckan_filestore']
    ##     destination_filename = ['local_filename.csv', 'Official-Looking CKAN Filename']
    ## and apply the elements in sequence to make len(xs) jobs.
    ## 3) But try to make every parameter into a potential list this way
    ## (at least by allowing one parameter at a time to be changed).
    if job.destination == 'ckan':
        package = get_package_by_id(job.package_id)
        if 'extras' in package:
            extras_list = package['extras']
            # Keep definitions and uses of extras metadata updated here:
            # https://github.com/WPRDC/data-guide/blob/master/docs/metadata_extras.md
            # The format is like this:
            #   u'extras': [{u'key': u'dcat_issued', u'value': u'2014-01-07T15:27:45.000Z'}, ...
            # not a dict, but a list of dicts.
            extras = {d['key']: d['value'] for d in extras_list}
            resource_id = find_resource_id(job.package_id, job.resource_name)  # This adds a second call to get
            # the package when it's already been obtained a few lines above.
            if resource_id is None:  # The resource does not yet exist.
                return None, None
            if 'time_field' in extras and resource_id in json.loads(extras['time_field']):
                time_field_lookup = json.loads(extras['time_field'])
                first_date, last_date = find_extreme_dates(resource_id, time_field_lookup)
                return first_date, last_date
            else:
                try:
                    time_field = getattr(job, 'time_field', None)  # Try to grab time_field from the job.
                except AttributeError:
                    return None, None
                else:
                    time_field_lookup = {resource_id: time_field}
                    first_date, last_date = find_extreme_dates(resource_id, time_field_lookup)
                    return first_date, last_date
            return None, None
        else:
            return None, None
    else:
        # Find the time range of a non-datastore CSV file at the local destination
        # OR wipe the existing file and rewrite it from scratch.
        if job.destination == 'file':
            try:
                f = open(job.destination_file_path, 'r')
            except FileNotFoundError:
                return None, None
            else:
                reader = csv.DictReader(f)
                first_date = datetime.max
                last_date = datetime.min
                for row in reader:
                    timestamp = row.get(job.time_field, None)
                    if timestamp is not None:
                        timestamp = parser.parse(timestamp)
                        if timestamp < first_date:
                            first_date = timestamp
                        if timestamp > last_date:
                            last_date = timestamp
                f.close()
                if first_date <= last_date:
                    return first_date, last_date
                else:
                    return None, None
        else:
            raise ValueError(f"Unable to determine the extant time range for the following destination: {job.destination}")
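
# --- Hedged sketch (assumption): find_extreme_dates() is not defined in this section. For the
# 'ckan' branch above, one plausible implementation queries the datastore for the extremes of
# the time field via CKAN's datastore_search_sql action, as below. The `site` and `API_key`
# names are assumed to match the module-level configuration; the `_sketch` suffix marks this
# as an illustration, not the project's actual helper.
def find_extreme_dates_sketch(resource_id, time_field_lookup):
    import ckanapi
    from dateutil import parser
    time_field = time_field_lookup.get(resource_id)
    if time_field is None:
        return None, None
    ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
    # The datastore table name is the resource ID; quote identifiers to be safe.
    query = f'SELECT MIN("{time_field}") AS "first_date", MAX("{time_field}") AS "last_date" FROM "{resource_id}"'
    result = ckan.action.datastore_search_sql(sql=query)
    record = result['records'][0]
    if record['first_date'] is None:
        return None, None
    return parser.parse(record['first_date']), parser.parse(record['last_date'])
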