def process_job(**kwparameters):
    job = kwparameters['job']
    use_local_files = kwparameters['use_local_files']
    clear_first = kwparameters['clear_first']
    test_mode = kwparameters['test_mode']
    job.default_setup(use_local_files)
    # [ ] Check whether this process_job function can be put into standard form.
    job.loader_config_string = 'production'
    if OVERRIDE_GEOCODING: # This part may not convert well to the Job class approach.
        job.target = '/Users/drw/WPRDC/etl/rocket-etl/archives/previously-geocoded-restaurants.csv'
        job.source_connector = pl.FileConnector
        job.source_type = 'local'
        job.connector_config_string = ''
        print("Using local archive file: {}".format(job.target))
    elif use_local_files:
        job.target = SOURCE_DIR + job.source_file
    else:
        job.target = job.source_dir + "/" + job.source_file

    package_id = job.package if not test_mode else TEST_PACKAGE_ID
    print("==============\n {} in package {}".format(job.resource_name, package_id))

    if clear_first:
        print("Clearing the datastore for {}".format(job.resource_name))
    # Upload data to datastore
    print('Uploading tabular data...')
    curr_pipeline = pl.Pipeline(job.resource_name + ' pipeline', job.resource_name + ' Pipeline', log_status=False, chunk_size=1000, settings_file=SETTINGS_FILE) \
        .connect(job.source_connector, job.target, config_string=job.connector_config_string, encoding=job.encoding) \
        .extract(pl.CSVExtractor, firstline_headers=True) \
        .schema(job.schema) \
        .load(pl.CKANDatastoreLoader, job.loader_config_string,
              #fields=schema().serialize_to_ckan_fields(),
              fields=job.schema().serialize_to_ckan_fields(capitalize=False),
              key_fields=job.primary_key_fields,
              package_id=package_id,
              resource_name=job.resource_name,
              clear_first=clear_first,
              method=job.upload_method).run()

    resource_id = find_resource_id(package_id, job.resource_name)
    assert len(job.destinations) == 1
    locators_by_destination = {job.destinations[0]: resource_id}
    return locators_by_destination
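# Hedged usage sketch (not taken from a real job script): process_job takes all of its
# arguments as keywords and returns a one-entry dict mapping the job's single
# destination to the CKAN resource ID. The job object and flag values below are
# hypothetical placeholders.
#
#   locators_by_destination = process_job(job=some_job,
#                                         use_local_files=False,
#                                         clear_first=False,
#                                         test_mode=True)
#   resource_id = locators_by_destination[some_job.destinations[0]]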
def push_to_datastore(job, file_connector, target, config_string, encoding, loader_config_string, primary_key_fields, test_mode, clear_first, upload_method='upsert'):
    # This is becoming a legacy function because all the new features are going into run_pipeline,
    # but note that this is still used at present by a parking ETL job.
    # (wipe_data support is not being added to push_to_datastore.)
    package_id = job['package'] if not test_mode else TEST_PACKAGE_ID
    resource_name = job['resource_name']
    schema = job['schema']
    extractor = select_extractor(job)

    # Upload data to datastore
    if clear_first:
        print("Clearing the datastore for {}".format(job['resource_name']))
    print('Uploading tabular data...')
    curr_pipeline = pl.Pipeline(job['resource_name'] + ' pipeline', job['resource_name'] + ' Pipeline', log_status=False, chunk_size=1000, settings_file=SETTINGS_FILE) \
        .connect(file_connector, target, config_string=config_string, encoding=encoding) \
        .extract(extractor, firstline_headers=True) \
        .schema(schema) \
        .load(pl.CKANDatastoreLoader, loader_config_string,
              fields=schema().serialize_to_ckan_fields(),
              key_fields=primary_key_fields,
              package_id=package_id,
              resource_name=resource_name,
              clear_first=clear_first,
              method=upload_method).run()

    resource_id = find_resource_id(package_id, resource_name) # This IS determined in the pipeline, so it would be nice if the pipeline would return it.
    return resource_id
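# Hedged sketch of a legacy-style call to push_to_datastore (the dict keys mirror the
# ones read above; the schema class, paths, and flag values are placeholders, and the
# job dict would also need whatever keys select_extractor() consults):
#
#   legacy_job = {
#       'package': 'some-package-id',
#       'resource_name': 'Some Resource',
#       'schema': SomeSchema,  # a wprdc_etl schema class
#   }
#   resource_id = push_to_datastore(legacy_job, pl.FileConnector, '/path/to/source.csv',
#                                   config_string='', encoding='utf-8',
#                                   loader_config_string='production',
#                                   primary_key_fields=['id'], test_mode=True,
#                                   clear_first=False, upload_method='upsert')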
def run_pipeline(self, test_mode, clear_first, wipe_data, migrate_schema, use_local_files, retry_without_last_line=False, ignore_empty_rows=False):
    # This is a generalization of push_to_datastore() to optionally use
    # the new FileLoader (exporting data to a file rather than just CKAN).

    # target is a filepath which is actually the source filepath.

    # The retry_without_last_line option is a way of dealing with CSV files
    # that abruptly end mid-line.
    locators_by_destination = {}
    source_file_format = destination_file_format = self.source_file.split('.')[-1].lower()
    # 1) The downside to extracting the file format from the source file name
    # is that it couples the source and destination a bit too tightly.
    # One can imagine a scenario where tabular data is obtained from an API
    # and it's supposed to be uploaded as a CSV file somewhere. In this case
    # a separate "destination_file_format" would need to be specified.
    #
    # 2) While wprdc_etl uses 'CSV' as a
    # format that it sends to CKAN, I'm inclined to switch to 'csv',
    # and uniformly lowercasing all file formats.
    # Though given a format of 'geojson', the CKAN API resource_create
    # response lists format as 'GeoJSON', so CKAN is doing some kind
    # of correction.
    for destination in self.destinations:
        package_id = get_package_id(self, test_mode) # This is the effective package ID,
        # taking into account whether test mode is active.

        # [ ] Maybe the use_local_files and test_mode and any other parameters should be applied in a discrete stage between initialization and running.
        # This would allow the source and destination parameters to be prepared, leaving the pipeline running to just run the pipeline.
        # However, writing the CKANFilestoreLoader is a prerequisite for this.

        #if self.source_type == 'sftp' and destination == 'ckan_filestore':
        if destination == 'ckan_filestore' and self.source_type not in ['sftp', 'local', 'http']:
            # [ ] Test local file uploads to the CKAN Filestore before deleting all this logic.
            # Maybe a pipeline is not necessary to just upload a file to a CKAN resource, if the file is local!
            ua = 'rocket-etl/1.0 (+https://tools.wprdc.org/)'
            ckan = ckanapi.RemoteCKAN(site, apikey=API_KEY, user_agent=ua)
            upload_kwargs = {'package_id': package_id,
                             'format': destination_file_format,
                             'url': 'dummy-value', # ignored but required by CKAN<2.6
                             }
            if self.source_type in ['local']:
                upload_kwargs['upload'] = open(self.target, 'r') # target is the source file path
                # Specifying the target like this only works if the file is already local.
                # This can look like this:
                # <_io.TextIOWrapper name='/Users/drw/WPRDC/etl/rocket-etl/source_files/ac_hd/sourcesites.geojson' mode='r' encoding='UTF-8'>
            elif not use_local_files and self.source_type in ['http']:
                # [ ] Test this untested block.
                # This could also happen in default_setup, but really everything under "if destination == 'ckan_filestore'"
                # should be moved/eliminated once a CKANFileStoreLoader is written.
                self.target = cached_source_file_path = download_file_to_path(self.source_full_url, local_dir=WAITING_ROOM_DIR)
                upload_kwargs['upload'] = open(self.target, 'r') # target is the source file path
            elif not use_local_files and self.source_type in ['sftp']:
                #ftp_connector = pl.SFTPConnector(host =
                #ftp_file = ftp_connector.connect(self.source_connector, self.target, config_string=self.connector_config_string, encoding=self.encoding, local_cache_filepath=self.local_cache_filepath)
                # Test the file-based pipeline by using FileExtractor, with the regular SFTP connector (as configured in the air_quality.py script
                # for the sourcesites.geojson file), and the new CKANFilestoreLoader (which still needs to be written). CKANFilestoreLoader
                # will only work if paired with another file-based Extractor. Also, the schema has got to be addressed somewhere, maybe.
                loader = pl.CKANFilestoreLoader
                raise FileNotFoundError("To get a file via FTP and upload to the CKAN filestore, it would be best to implement a proper CKANFileStoreLoader. This necessitates modifying how pipelines work a bit.")

            if not resource_exists(package_id, self.resource_name):
                upload_kwargs['name'] = self.resource_name
                result = ckan.action.resource_create(**upload_kwargs)
                print('Creating new resource and uploading file to filestore...')
            else:
                upload_kwargs['id'] = find_resource_id(package_id, self.resource_name)
                result = ckan.action.resource_update(**upload_kwargs)
                print('Uploading file to filestore...')
        else:
            self.select_extractor()
            # BEGIN Destination-specific configuration
            if destination == 'ckan':
                loader = pl.CKANDatastoreLoader
            elif destination == 'file':
                loader = pl.FileLoader
                self.upload_method = 'insert' # Note that this will always append records to an existing file
                # unless 'always_clear_first' (or 'always_wipe_data') is set to True.
            elif destination == 'ckan_filestore':
                loader = pl.CKANFilestoreLoader
            else:
                raise ValueError(f"run_pipeline does not know how to handle destination = {destination}")

            clear_first = clear_first or self.always_clear_first or migrate_schema
            # If migrate_schema == True, 1) back up the data dictionary,
            # 2) delete the Data Table view, 3) clear the datastore, 4) run the job, and 5) try to restore the data dictionary.
            # It just seems cleaner to do most of that in launchpad.py (probably because there's so little in the main() function).
            wipe_data = wipe_data or self.always_wipe_data
            if clear_first and wipe_data:
                raise ValueError("clear_first and wipe_data should not both be True simultaneously. To clear a datastore for a job that has always_wipe_data = True, add the command-line argument 'override_wipe_data'.")
            elif clear_first:
                if destination in ['ckan']:
                    if datastore_exists(package_id, self.resource_name):
                        # It should be noted that this will wipe out any integrated data_dictionary (but it's being preserved at the launchpad.py level).
                        print("Clearing the datastore for {}".format(self.resource_name)) # Actually done by the pipeline.
                    else:
                        print("Since it makes no sense to try to clear a datastore that does not exist, clear_first is being toggled to False.")
                        clear_first = False
            elif wipe_data:
                if destination in ['ckan']:
                    if datastore_exists(package_id, self.resource_name):
                        print("Wiping records from the datastore for {}".format(self.resource_name))
                    else:
                        print("Since it makes no sense to try to wipe the records from a datastore that does not exist, wipe_data is being toggled to False.")
                        wipe_data = False

            print(f'Uploading {"tabular data" if loader.has_tabular_output else "file"}...')
            # END Destination-specific configuration
            try:
                curr_pipeline = pl.Pipeline(self.job_code + ' pipeline', self.job_code + ' Pipeline',
                                            log_status=False, chunk_size=1000, settings_file=SETTINGS_FILE,
                                            retry_without_last_line=retry_without_last_line,
                                            ignore_empty_rows=ignore_empty_rows,
                                            filters=self.filters) \
                    .connect(self.source_connector, self.target, config_string=self.connector_config_string, encoding=self.encoding, local_cache_filepath=self.local_cache_filepath) \
                    .extract(self.extractor, firstline_headers=True, rows_to_skip=self.rows_to_skip) \
                    .schema(self.schema) \
                    .load(loader, self.loader_config_string,
                          filepath=self.destination_file_path,
                          file_format=destination_file_format,
                          fields=self.schema().serialize_to_ckan_fields(),
                          key_fields=self.primary_key_fields,
                          package_id=package_id,
                          resource_name=self.resource_name,
                          clear_first=clear_first,
                          wipe_data=wipe_data,
                          method=self.upload_method).run()
            except FileNotFoundError:
                if self.ignore_if_source_is_missing:
                    print("The source file for this job wasn't found, but that's not surprising.")
                else:
                    raise

        if destination in ['ckan', 'ckan_filestore', 'local_monthly_archive_zipped']:
            resource_id = find_resource_id(package_id, self.resource_name) # This IS determined in the pipeline, so it would be nice if the pipeline would return it.
            locators_by_destination[destination] = resource_id
        elif destination in ['file']:
            locators_by_destination[destination] = self.destination_file_path
    return locators_by_destination
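# A small, self-contained sketch (hypothetical helper, not part of rocket-etl) of how a
# caller might consume the locators_by_destination dict returned by run_pipeline:
# CKAN-type destinations map to resource IDs, while 'file' maps to the destination file path.
def _report_locators(locators_by_destination):
    for destination, locator in locators_by_destination.items():
        if destination in ('ckan', 'ckan_filestore', 'local_monthly_archive_zipped'):
            print("{}: CKAN resource ID {}".format(destination, locator))
        elif destination == 'file':
            print("{}: wrote {}".format(destination, locator))
        else:
            print("{}: {}".format(destination, locator))

# Example: _report_locators({'ckan': 'abc123-resource-id', 'file': '/tmp/output.csv'})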
def run_pipeline(self, test_mode, clear_first, wipe_data, migrate_schema, file_format='csv', retry_without_last_line=False):
    # This is a generalization of push_to_datastore() to optionally use
    # the new FileLoader (exporting data to a file rather than just CKAN).

    # target is a filepath which is actually the source filepath.

    # The retry_without_last_line option is a way of dealing with CSV files
    # that abruptly end mid-line.
    locators_by_destination = {}
    for destination in self.destinations:
        package_id = self.package if not test_mode else TEST_PACKAGE_ID # Should this be done elsewhere?
        # [ ] Maybe the use_local_files and test_mode and any other parameters should be applied in a discrete stage between initialization and running.
        if destination == 'ckan_filestore':
            # Maybe a pipeline is not necessary to just upload a file to a CKAN resource.
            ua = 'rocket-etl/1.0 (+https://tools.wprdc.org/)'
            ckan = ckanapi.RemoteCKAN(site, apikey=API_KEY, user_agent=ua)
            source_file_format = self.source_file.split('.')[-1].lower()
            # While wprdc_etl uses 'CSV' as a
            # format that it sends to CKAN, I'm inclined to switch to 'csv',
            # and uniformly lowercasing all file formats.
            # Though given a format of 'geojson', the CKAN API resource_create
            # response lists format as 'GeoJSON', so CKAN is doing some kind
            # of correction.
            upload_kwargs = {'package_id': package_id,
                             'format': source_file_format,
                             'url': 'dummy-value', # ignored but required by CKAN<2.6
                             'upload': open(self.target, 'r')} # target is the source file path
            if not resource_exists(package_id, self.resource_name):
                upload_kwargs['name'] = self.resource_name
                result = ckan.action.resource_create(**upload_kwargs)
                print('Creating new resource and uploading file to filestore...')
            else:
                upload_kwargs['id'] = find_resource_id(package_id, self.resource_name)
                result = ckan.action.resource_update(**upload_kwargs)
                print('Uploading file to filestore...')
        elif destination == 'local_monthly_archive_zipped':
            # [ ] Break all of this LMAZ code off into one or more separate functions.
            loader = pl.FileLoader
            self.upload_method = 'insert'
            # Try using destination_directory and destination_file_path for the archives.
            # Append the year-month to the filename (before the extension).
            pathparts = self.destination_file_path.split('/')
            filenameparts = pathparts[-1].split('.')
            now = datetime.now()
            last_month_num = (now.month - 1) % 12
            year = now.year
            if last_month_num == 0:
                last_month_num = 12
            if last_month_num == 12: # If going back to December,
                year -= 1            # set year to last year.
            last_month_str = str(last_month_num)
            if len(last_month_str) == 1:
                last_month_str = '0' + last_month_str

            regex_parts = list(filenameparts)
            filenameparts[-2] += "_{}-{}".format(year, last_month_str)
            timestamped_filename = '.'.join(filenameparts)
            regex_parts[-2] += "_{}".format(year)
            regex_pattern = '.'.join(regex_parts[:-1]) # This is for matching filenames that should be rolled up into the same year of data.
            zip_file_name = regex_pattern + '.zip'
            pathparts[-1] = timestamped_filename
            destination_file_path = '/'.join(pathparts) # This is the new timestamped filepath.
            ic(self.destination_file_path)
            # Store the file locally.
            # Zip the files with matching year in filename.
            destination_directory = '/'.join(self.destination_file_path.split('/')[:-1])
            all_files = os.listdir(destination_directory)
            list_of_files_to_compress = sorted([f for f in all_files if re.match(regex_pattern, f)])
            #cp synthesized-liens.csv zipped/liens-with-current-status-beta.csv
            zip_file_path = destination_directory + '/' + zip_file_name
            #zip zipped/liens-with-current-status-beta.zip zipped/liens-with-current-status-beta.csv
            import zipfile
            process_zip = zipfile.ZipFile(zip_file_path, 'w')
            for original_file_name in list_of_files_to_compress:
                file_to_zip = destination_directory + '/' + original_file_name
                process_zip.write(file_to_zip, original_file_name, compress_type=zipfile.ZIP_DEFLATED)
            process_zip.close()

            # Upload the file at zip_file_path to the appropriate resource.
            #####resource_id =
            # [ ] This lmaz option needs to be finished.

            # Delete the file at zip_file_path.
            os.remove(zip_file_path)
            # Are the parameters that are being passed to curr_pipeline below correct for uploading the zipped archive? ########
        else:
            self.select_extractor()
            if destination == 'ckan':
                loader = pl.CKANDatastoreLoader
            elif destination == 'file':
                loader = pl.FileLoader
                self.upload_method = 'insert' # Note that this will always append records to an existing file
                # unless 'always_clear_first' (or 'always_wipe_data') is set to True.
            else:
                raise ValueError("run_pipeline does not know how to handle destination = {}".format(destination))

            clear_first = clear_first or self.always_clear_first or migrate_schema
            # If migrate_schema == True, 1) back up the data dictionary,
            # 2) delete the Data Table view, 3) clear the datastore, 4) run the job, and 5) try to restore the data dictionary.
            # It just seems cleaner to do most of that in launchpad.py (probably because there's so little in the main() function).
            wipe_data = wipe_data or self.always_wipe_data
            if clear_first and wipe_data:
                raise ValueError("clear_first and wipe_data should not both be True simultaneously.")
            elif clear_first:
                if destination in ['ckan']:
                    if datastore_exists(package_id, self.resource_name):
                        # It should be noted that this will wipe out any integrated data_dictionary (but it's being preserved at the launchpad.py level).
                        print("Clearing the datastore for {}".format(self.resource_name)) # Actually done by the pipeline.
                    else:
                        print("Since it makes no sense to try to clear a datastore that does not exist, clear_first is being toggled to False.")
                        clear_first = False
            elif wipe_data:
                if destination in ['ckan']:
                    if datastore_exists(package_id, self.resource_name):
                        print("Wiping records from the datastore for {}".format(self.resource_name))
                    else:
                        print("Since it makes no sense to try to wipe the records from a datastore that does not exist, wipe_data is being toggled to False.")
                        wipe_data = False

            # Upload data to datastore
            print('Uploading tabular data...')
            try:
                curr_pipeline = pl.Pipeline(self.job_code + ' pipeline', self.job_code + ' Pipeline',
                                            log_status=False, chunk_size=4000, settings_file=SETTINGS_FILE,
                                            retry_without_last_line=retry_without_last_line) \
                    .connect(self.source_connector, self.target, config_string=self.connector_config_string, encoding=self.encoding, local_cache_filepath=self.local_cache_filepath) \
                    .extract(self.extractor, firstline_headers=True) \
                    .schema(self.schema) \
                    .load(loader, self.loader_config_string,
                          filepath=self.destination_file_path,
                          file_format=file_format,
                          fields=self.schema().serialize_to_ckan_fields(),
                          key_fields=self.primary_key_fields,
                          package_id=package_id,
                          resource_name=self.resource_name,
                          clear_first=clear_first,
                          wipe_data=wipe_data,
                          method=self.upload_method).run()
            except FileNotFoundError:
                if self.ignore_if_source_is_missing:
                    print("The source file for this job wasn't found, but that's not surprising.")
                else:
                    raise

        if destination in ['ckan', 'ckan_filestore', 'local_monthly_archive_zipped']:
            resource_id = find_resource_id(package_id, self.resource_name) # This IS determined in the pipeline, so it would be nice if the pipeline would return it.
            locators_by_destination[destination] = resource_id
        elif destination in ['file']:
            locators_by_destination[destination] = self.destination_file_path
    return locators_by_destination
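# A standalone sketch of the previous-month stamping used in the
# local_monthly_archive_zipped branch above: find last month (rolling the year back
# across a January boundary) and insert "_YYYY-MM" before the file extension.
# _stamp_previous_month is a hypothetical helper, not a function in this module.
from datetime import datetime

def _stamp_previous_month(filename, now=None):
    now = now or datetime.now()
    if now.month == 1: # Going back to December means going back a year.
        year, last_month = now.year - 1, 12
    else:
        year, last_month = now.year, now.month - 1
    stem, extension = filename.rsplit('.', 1)
    return "{}_{}-{:02d}.{}".format(stem, year, last_month, extension)

# _stamp_previous_month('liens-with-current-status-beta.csv', datetime(2020, 1, 15))
# == 'liens-with-current-status-beta_2019-12.csv'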
def run_pipeline(self, clear_first, wipe_data, migrate_schema, retry_without_last_line=False, ignore_empty_rows=False):
    # target is a filepath which is actually the source filepath.

    # The retry_without_last_line option is a way of dealing with CSV files
    # that abruptly end mid-line.
    locators_by_destination = {}
    if self.destination == 'ckan_link':
        # Handle the special case of just wanting to make a resource that is just a hyperlink,
        # which really doesn't need a full pipeline at this point.
        from engine.credentials import site, API_key
        ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
        resource_id = find_resource_id(self.package_id, self.resource_name)
        if resource_id is None:
            resource_as_dict = ckan.action.resource_create(package_id=self.package_id,
                                                           url=self.source_full_url,
                                                           format='HTML',
                                                           name=self.resource_name)
            resource_id = resource_as_dict['id']
        else:
            resource_as_dict = ckan.action.resource_update(id=resource_id,
                                                           url=self.source_full_url,
                                                           format='HTML',
                                                           name=self.resource_name)
        locators_by_destination[self.destination] = resource_id
        return locators_by_destination

    source_file_format = self.source_file.split('.')[-1].lower()
    if self.destination_file is not None:
        self.destination_file_format = self.destination_file.split('.')[-1].lower()
    else:
        self.destination_file_format = source_file_format
    # While wprdc_etl uses 'CSV' as a
    # format that it sends to CKAN, I'm inclined to switch to 'csv',
    # and uniformly lowercase all file formats.
    # Though given a format of 'geojson', the CKAN API resource_create
    # response lists format as 'GeoJSON', so CKAN is doing some kind
    # of correction.

    # [ ] Maybe test_mode and any other parameters should be applied in a discrete stage between initialization and running.
    # This would allow the source and destination parameters to be prepared, leaving the pipeline running to just run the pipeline.

    # BEGIN Destination-specific configuration
    # A) First configure the loader. It might make more sense to move this to configure_pipeline_with_options().
    if self.destination == 'ckan':
        self.loader = pl.CKANDatastoreLoader
    elif self.destination == 'file':
        # The tabularity of the data (that is, whether the loader is going
        # to be handed records (a list of dicts) or a file or file-like object)
        # should determine which kind of file loader will be used.
        if self.destination_file_format is None:
            raise ValueError("Destination == 'file' but self.destination_file_format is None!")
        elif self.destination_file_format.lower() in ['csv', 'json'] and self.compressed_file_to_extract is None:
            self.loader = pl.TabularFileLoader # Isn't this actually very CSV-specific, given the write_or_append_to_csv_file function it uses?
            self.upload_method = 'insert' # Note that this will always append records to an existing file
            # unless 'always_clear_first' (or 'always_wipe_data') is set to True.
        else:
            self.loader = pl.NontabularFileLoader
    elif self.destination == 'ckan_filestore':
        self.loader = pl.CKANFilestoreLoader
    elif self.destination is None:
        return {} # locators_by_destination should be empty and the pipeline should be skipped
        # for these cases where there is no destination (like in snow_plow_geojson.py when
        # no new files to upload are found).
    else:
        raise ValueError(f"run_pipeline does not know how to handle destination = {self.destination}")

    # B) Then do some boolean operations on clear_first, self.always_clear_first, migrate_schema, and wipe_data.
    clear_first = clear_first or self.always_clear_first or migrate_schema
    # If migrate_schema == True, 1) back up the data dictionary,
    # 2) delete the Data Table view, 3) clear the datastore, 4) run the job, and 5) try to restore the data dictionary.
    # It just seems cleaner to do most of that in launchpad.py (probably because there's so little in the main() function).
    wipe_data = wipe_data or self.always_wipe_data
    if clear_first and wipe_data:
        raise ValueError("clear_first and wipe_data should not both be True simultaneously. To clear a datastore for a job that has always_wipe_data = True, add the command-line argument 'override_wipe_data'.")
    elif clear_first:
        if self.destination in ['ckan']:
            if datastore_exists(self.package_id, self.resource_name):
                # It should be noted that this will wipe out any integrated data_dictionary (but it's being preserved at the launchpad.py level).
                print("Clearing the datastore for {}".format(self.resource_name)) # Actually done by the pipeline.
            else:
                print("Since it makes no sense to try to clear a datastore that does not exist, clear_first is being toggled to False.")
                clear_first = False
    elif wipe_data:
        if self.destination in ['ckan']:
            if datastore_exists(self.package_id, self.resource_name):
                print("Wiping records from the datastore for {}".format(self.resource_name))
            else:
                print("Since it makes no sense to try to wipe the records from a datastore that does not exist, wipe_data is being toggled to False.")
                wipe_data = False

    print(f'Loading {"tabular data" if self.loader.has_tabular_output else "file"}...')
    # END Destination-specific configuration
    try:
        curr_pipeline = pl.Pipeline(self.job_code + ' pipeline', self.job_code + ' Pipeline',
                                    log_status=False, chunk_size=1000, settings_file=SETTINGS_FILE,
                                    retry_without_last_line=retry_without_last_line,
                                    ignore_empty_rows=ignore_empty_rows,
                                    filters=self.filters) \
            .connect(self.source_connector, self.target, config_string=self.connector_config_string, encoding=self.encoding, local_cache_filepath=self.local_cache_filepath, verify_requests=self.verify_requests, fallback_host=self.source_site) \
            .extract(self.extractor, firstline_headers=True, rows_to_skip=self.rows_to_skip, compressed_file_to_extract=self.compressed_file_to_extract) \
            .schema(self.schema) \
            .load(self.loader, self.loader_config_string,
                  filepath=self.destination_file_path,
                  file_format=self.destination_file_format,
                  fields=self.schema().serialize_to_ckan_fields(),
                  key_fields=self.primary_key_fields,
                  package_id=self.package_id,
                  resource_name=self.resource_name,
                  clear_first=clear_first,
                  wipe_data=wipe_data,
                  method=self.upload_method).run()
    except FileNotFoundError:
        if self.ignore_if_source_is_missing:
            print("The source file for this job wasn't found, but that's not surprising.")
        else:
            raise

    if self.destination in ['ckan', 'ckan_filestore']:
        resource_id = find_resource_id(self.package_id, self.resource_name) # This IS determined in the pipeline, so it would be nice if the pipeline would return it.
        locators_by_destination[self.destination] = resource_id
    elif self.destination in ['file']:
        locators_by_destination[self.destination] = self.destination_file_path
    return locators_by_destination
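# A self-contained sketch of the file-format derivation used above: the destination
# format falls back to the lowercased source-file extension when no separate
# destination_file is given. _derive_formats is a hypothetical helper, not part of this module.
def _derive_formats(source_file, destination_file=None):
    source_file_format = source_file.split('.')[-1].lower()
    if destination_file is not None:
        destination_file_format = destination_file.split('.')[-1].lower()
    else:
        destination_file_format = source_file_format
    return source_file_format, destination_file_format

# _derive_formats('sourcesites.geojson')            == ('geojson', 'geojson')
# _derive_formats('dump.json', 'archive_2020.csv')  == ('json', 'csv')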