def handle(self, *args, **options):
    elasticsearch_client = instantiate_elasticsearch_client()
    config = process_cli_parameters(options, elasticsearch_client)

    start = perf_counter()
    printf({"msg": f"Starting script\n{'=' * 56}"})
    start_msg = "target index: {index_name} | FY(s): {fiscal_years} | Starting from: {starting_date}"
    printf({"msg": start_msg.format(**config)})

    if config["load_type"] == "transactions":
        ensure_view_exists(settings.ES_TRANSACTIONS_ETL_VIEW_NAME)
    elif config["load_type"] == "awards":
        ensure_view_exists(settings.ES_AWARDS_ETL_VIEW_NAME)

    loader = Rapidloader(config, elasticsearch_client)
    loader.run_load_steps()
    loader.complete_process()

    printf({"msg": "---------------------------------------------------------------"})
    printf({"msg": f"Script completed in {perf_counter() - start:.2f}s"})
    printf({"msg": "---------------------------------------------------------------"})

def handle(self, *args, **options):
    self.elasticsearch_client = instantiate_elasticsearch_client()
    self.config = process_cli_parameters(options, self.elasticsearch_client)

    start = perf_counter()
    printf({"msg": "Starting script\n{}".format("=" * 56)})
    start_msg = "target index: {index_name} | FY(s): {fiscal_years} | Starting from: {starting_date}"
    printf({"msg": start_msg.format(**self.config)})

    ensure_transaction_etl_view_exists()
    self.run_load_steps()
    self.complete_process()

    printf({"msg": "---------------------------------------------------------------"})
    printf({"msg": "Script completed in {} seconds".format(perf_counter() - start)})
    printf({"msg": "---------------------------------------------------------------"})

def handle(self, *args, **options):
    ''' Script execution of custom code starts in this method'''
    start = perf_counter()
    printf({'msg': 'Starting script\n{}'.format('=' * 56)})

    self.config = set_config()
    self.config['verbose'] = True if options['verbosity'] > 1 else False
    self.config['fiscal_years'] = options['fiscal_years']
    self.config['directory'] = options['dir'] + os.sep
    self.config['provide_deleted'] = options['deleted']
    self.config['recreate'] = options['recreate']
    self.config['stale'] = options['stale']
    self.config['keep'] = options['keep']

    if not options['since']:
        # Due to the queries used for fetching postgres data, `starting_date` needs to be present and a date
        # before the earliest records in S3 and when Postgres records were updated.
        # Choose the beginning of FY2008, and make it timezone-aware for S3
        self.config['starting_date'] = datetime.strptime('2007-10-01+0000', '%Y-%m-%d%z')
    else:
        if self.config['recreate']:
            print('Bad mix of parameters! An index should not be dropped if only a subset of data will be loaded')
            raise SystemExit
        self.config['starting_date'] = datetime.strptime(options['since'] + '+0000', '%Y-%m-%d%z')

    if not os.path.isdir(self.config['directory']):
        printf({'msg': 'Provided directory does not exist'})
        raise SystemExit

    self.controller()

    printf({'msg': '---------------------------------------------------------------'})
    printf({'msg': 'Script completed in {} seconds'.format(perf_counter() - start)})
    printf({'msg': '---------------------------------------------------------------'})

def run_load_steps(self) -> None:
    download_queue = Queue()  # Queue for jobs which need a csv downloaded
    es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

    updated_record_count = get_updated_record_count(self.config)
    printf({"msg": f"Found {updated_record_count:,} {self.config['load_type']} records to index"})

    if updated_record_count == 0:
        jobs = 0
    else:
        download_queue, jobs = self.create_download_jobs()

    printf({"msg": f"There are {jobs} jobs to process"})

    process_list = [
        Process(
            name="Download Process",
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config),
        ),
        Process(
            name="ES Index Process",
            target=es_data_loader,
            args=(self.elasticsearch_client, download_queue, es_ingest_queue, self.config),
        ),
    ]

    if updated_record_count != 0:  # only run if there are data to process
        process_list[0].start()  # Start Download process

    if self.config["process_deletes"]:
        process_list.append(
            Process(
                name="S3 Deleted Records Scraper Process",
                target=deleted_transactions if self.config["load_type"] == "transactions" else deleted_awards,
                args=(self.elasticsearch_client, self.config),
            )
        )
        process_list[-1].start()  # start S3 csv fetch process
        while process_list[-1].is_alive():
            printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
            sleep(7)  # add a brief pause to make sure the deletes are processed in ES

    if updated_record_count != 0:
        process_list[1].start()  # start ES ingest process

    while True:
        sleep(10)
        if process_guarddog(process_list):
            raise SystemExit("Fatal error: review logs to determine why process died.")
        elif all([not x.is_alive() for x in process_list]):
            printf({"msg": "All ETL processes completed execution with no error codes"})
            break

def transform_cli_arguments(self, options):
    simple_args = ("provide_deleted", "reload_all", "snapshot", "index_name", "directory", "fast")
    self.config = set_config(simple_args, options)

    self.config["fiscal_years"] = fiscal_years_for_processing(options)
    self.config["directory"] = self.config["directory"] + os.sep
    self.config["index_name"] = self.config["index_name"].lower()

    if self.config["reload_all"]:
        self.config["starting_date"] = DEFAULT_DATETIME
    elif options["start_datetime"]:
        self.config["starting_date"] = options["start_datetime"]
    else:
        # Due to the queries used for fetching postgres data,
        # `starting_date` needs to be present and a date before:
        #   - The earliest records in S3.
        #   - When all transaction records in the USAspending SQL database were updated.
        # And keep it timezone-aware for S3
        self.config["starting_date"] = get_last_load_date("es_transactions", default=DEFAULT_DATETIME)

    self.config["mapping"], self.config["doc_type"], self.config["max_query_size"] = mapping_data_for_processing()

    does_index_exist = ES.indices.exists(self.config["index_name"])
    self.config["is_incremental_load"] = self.config["starting_date"] != DEFAULT_DATETIME

    if not os.path.isdir(self.config["directory"]):
        printf({"msg": "Provided directory does not exist"})
        raise SystemExit(1)
    elif self.config["starting_date"] < DEFAULT_DATETIME:
        printf({"msg": "`start-datetime` is too early. Set to after {}".format(DEFAULT_DATETIME)})
        raise SystemExit(1)
    elif does_index_exist and not self.config["is_incremental_load"]:
        printf({"msg": "Full data load into existing index! Change destination index or load a subset of data"})
        raise SystemExit(1)
    elif not does_index_exist or self.config["reload_all"]:
        printf({"msg": "Skipping deletions for this load, provide_deleted overwritten to False"})
        self.config["provide_deleted"] = False

def run_load_steps(self) -> None:
    download_queue = Queue()  # Queue for jobs which need a csv downloaded
    es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

    job_number = 0
    for fiscal_year in self.config["fiscal_years"]:
        job_number += 1
        index = self.config["index_name"]
        filename = str(
            self.config["directory"] / "{fy}_{type}.csv".format(fy=fiscal_year, type=self.config["load_type"])
        )
        new_job = DataJob(job_number, index, fiscal_year, filename)

        if Path(filename).exists():
            Path(filename).unlink()
        download_queue.put(new_job)

    printf({"msg": "There are {} jobs to process".format(job_number)})

    process_list = [
        Process(
            name="Download Process",
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config),
        ),
        Process(
            name="ES Index Process",
            target=es_data_loader,
            args=(self.elasticsearch_client, download_queue, es_ingest_queue, self.config),
        ),
    ]

    process_list[0].start()  # Start Download process

    if self.config["process_deletes"]:
        process_list.append(
            Process(
                name="S3 Deleted Records Scraper Process",
                target=deleted_transactions if self.config["load_type"] == "transactions" else deleted_awards,
                args=(self.elasticsearch_client, self.config),
            )
        )
        process_list[-1].start()  # start S3 csv fetch process
        while process_list[-1].is_alive():
            printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
            sleep(7)

    process_list[1].start()  # start ES ingest process

    while True:
        sleep(10)
        if process_guarddog(process_list):
            raise SystemExit("Fatal error: review logs to determine why process died.")
        elif all([not x.is_alive() for x in process_list]):
            printf({"msg": "All ETL processes completed execution with no error codes"})
            break

def complete_process(self) -> None: if self.config["create_new_index"]: set_final_index_config(self.elasticsearch_client, self.config["index_name"]) if self.config["skip_delete_index"]: printf({"msg": "Skipping deletion of old indices"}) else: printf({"msg": "Closing old indices and adding aliases"}) swap_aliases(self.elasticsearch_client, self.config["index_name"], self.config["load_type"]) if self.config["snapshot"]: printf({"msg": "Taking snapshot"}) take_snapshot(self.elasticsearch_client, self.config["index_name"], settings.ES_REPOSITORY) if self.config["is_incremental_load"]: toggle_refresh_on(self.elasticsearch_client, self.config["index_name"]) printf({"msg": f"Storing datetime {self.config['processing_start_datetime']} for next incremental load"}) update_last_load_date(f"es_{self.config['load_type']}", self.config["processing_start_datetime"])
def complete_process(self) -> None: if self.config["create_new_index"]: printf({"msg": "Closing old indices and adding aliases"}) set_final_index_config(self.elasticsearch_client, self.config["index_name"]) swap_aliases(self.elasticsearch_client, self.config["index_name"], self.config["load_type"]) if self.config["snapshot"]: printf({"msg": "Taking snapshot"}) take_snapshot(self.elasticsearch_client, self.config["index_name"], settings.ES_REPOSITORY) if self.config["is_incremental_load"]: msg = "Storing datetime {} for next incremental load" printf({"msg": msg.format(self.config["processing_start_datetime"])}) update_last_load_date("es_transactions", self.config["processing_start_datetime"])
def controller(self):
    download_queue = Queue()  # Queue for jobs which need a csv downloaded
    es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

    job_number = 0
    for fy in self.config["fiscal_years"]:
        job_number += 1
        index = self.config["index_name"]
        filename = "{dir}{fy}_transactions.csv".format(dir=self.config["directory"], fy=fy)
        new_job = DataJob(job_number, index, fy, filename)

        if os.path.exists(filename):
            os.remove(filename)
        download_queue.put(new_job)

    printf({"msg": "There are {} jobs to process".format(job_number)})

    process_list = []
    process_list.append(
        Process(
            name="Download Process",
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config),
        )
    )
    process_list.append(
        Process(
            name="ES Index Process",
            target=es_data_loader,
            args=(ES, download_queue, es_ingest_queue, self.config),
        )
    )

    process_list[0].start()  # Start Download process

    if self.config["provide_deleted"]:
        process_list.append(
            Process(
                name="S3 Deleted Records Scraper Process",
                target=deleted_transactions,
                args=(ES, self.config),
            )
        )
        process_list[-1].start()  # start S3 csv fetch process
        while process_list[-1].is_alive():
            printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
            sleep(7)

    process_list[1].start()  # start ES ingest process

    while True:
        sleep(10)
        if process_guarddog(process_list):
            raise SystemExit(1)
        elif all([not x.is_alive() for x in process_list]):
            printf({"msg": "All ETL processes completed execution with no error codes"})
            break

    if self.config["reload_all"]:
        printf({"msg": "Closing old indices and adding aliases"})
        swap_aliases(ES, self.config["index_name"])

    if self.config["snapshot"]:
        printf({"msg": "Taking snapshot"})
        take_snapshot(ES, self.config["index_name"], settings.ES_REPOSITORY)

def handle(self, *args, **options): """ Script execution of custom code starts in this method""" start = perf_counter() printf({"msg": "Starting script\n{}".format("=" * 56)}) self.transform_cli_arguments(options) start_msg = "target index: {index_name} | FY(s): {fiscal_years} | Starting from: {starting_date}" printf({"msg": start_msg.format(**self.config)}) self.controller() if self.config["is_incremental_load"]: printf({ "msg": "Updating Last Load record with {}".format( self.config["processing_start_datetime"]) }) update_last_load_date("es_transactions", self.config["processing_start_datetime"]) printf({ "msg": "---------------------------------------------------------------" }) printf({ "msg": "Script completed in {} seconds".format(perf_counter() - start) }) printf({ "msg": "---------------------------------------------------------------" })
def handle(self, *args, **options):
    ''' Script execution of custom code starts in this method'''
    start = perf_counter()
    printf({'msg': 'Starting script\n{}'.format('=' * 56)})

    self.config = set_config()
    self.config['verbose'] = True if options['verbosity'] > 1 else False
    self.config['fiscal_years'] = options['fiscal_years']
    self.config['directory'] = options['dir'] + os.sep
    self.config['provide_deleted'] = options['deleted']
    self.config['stale'] = options['stale']
    self.config['swap'] = options['swap']
    self.config['keep'] = options['keep']
    self.config['snapshot'] = options['snapshot']
    self.config['index_name'] = options['index_name']

    mappingfile = os.path.join(settings.BASE_DIR, 'usaspending_api/etl/es_transaction_mapping.json')
    with open(mappingfile) as f:
        mapping_dict = json.load(f)
        self.config['mapping'] = json.dumps(mapping_dict)
    self.config['doc_type'] = str(list(mapping_dict['mappings'].keys())[0])
    self.config['max_query_size'] = mapping_dict['settings']['index.max_result_window']

    does_index_exist = ES.indices.exists(self.config['index_name'])
    if not does_index_exist:
        printf({'msg': '"{}" does not exist, skipping deletions for this load, '
                       'provide_deleted overwritten to False'.format(self.config['index_name'])})
        self.config['provide_deleted'] = False

    if not options['since']:
        if not options['days']:
            # Due to the queries used for fetching postgres data, `starting_date` needs to be present and a date
            # before the earliest records in S3 and when Postgres records were updated.
            # Choose the beginning of FY2008, and make it timezone-aware for S3
            self.config['starting_date'] = datetime.strptime('2007-10-01+0000', '%Y-%m-%d%z')
        else:
            # If --days is provided, go back X days into the past
            self.config['starting_date'] = datetime.now(timezone.utc) - timedelta(days=options['days'])
    else:
        self.config['starting_date'] = datetime.strptime(options['since'] + '+0000', '%Y-%m-%d%z')

    if not os.path.isdir(self.config['directory']):
        printf({'msg': 'Provided directory does not exist'})
        raise SystemExit

    if does_index_exist and (not options['since'] and not options['days']):
        print('''
        Bad mix of parameters! Index exists and full data load implied.
        Choose a different index_name or load a subset of data using --since
        ''')
        raise SystemExit

    self.controller()

    printf({'msg': '---------------------------------------------------------------'})
    printf({'msg': 'Script completed in {} seconds'.format(perf_counter() - start)})
    printf({'msg': '---------------------------------------------------------------'})

def controller(self):
    download_queue = Queue()  # Queue for jobs which need a csv downloaded
    es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

    job_id = 0
    for fy in self.config['fiscal_years']:
        for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
            job_id += 1
            index = self.config['index_name']
            filename = '{dir}{fy}_transactions_{type}.csv'.format(
                dir=self.config['directory'], fy=fy, type=awd_cat_idx.replace(' ', ''))

            new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

            if os.path.exists(filename):
                # This is mostly for testing. If previous CSVs still exist skip the download for that file
                if self.config['stale']:
                    new_job.count = count_rows_in_csv_file(filename, has_header=True, safe=False)
                    printf({
                        'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                        'job': new_job.name,
                        'f': 'Download'})
                    # Add job directly to the Elasticsearch ingest queue since the CSV exists
                    es_ingest_queue.put(new_job)
                    continue
                else:
                    os.remove(filename)
            download_queue.put(new_job)

    printf({'msg': 'There are {} jobs to process'.format(job_id)})

    process_list = []
    process_list.append(Process(
        name='Download Process',
        target=download_db_records,
        args=(download_queue, es_ingest_queue, self.config)))
    process_list.append(Process(
        name='ES Index Process',
        target=es_data_loader,
        args=(ES, download_queue, es_ingest_queue, self.config)))

    process_list[0].start()  # Start Download process

    if self.config['provide_deleted']:
        process_list.append(Process(
            name='S3 Deleted Records Scraper Process',
            target=deleted_transactions,
            args=(ES, self.config)))
        process_list[-1].start()  # start S3 csv fetch process
        while process_list[-1].is_alive():
            printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
            sleep(7)

    process_list[1].start()  # start ES ingest process

    while True:
        sleep(10)
        if process_guarddog(process_list):
            raise SystemExit(1)
        elif all([not x.is_alive() for x in process_list]):
            printf({'msg': 'All ETL processes completed execution with no error codes'})
            break

    if self.config['swap']:
        printf({'msg': 'Closing old indices and adding aliases'})
        swap_aliases(ES, self.config['index_name'])

    if self.config['snapshot']:
        printf({'msg': 'Taking snapshot'})
        take_snapshot(ES, self.config['index_name'], settings.ES_REPOSITORY)

def process_cli_parameters(options: dict, es_client) -> dict:
    default_datetime = datetime.strptime(f"{settings.API_SEARCH_MIN_DATE}+0000", "%Y-%m-%d%z")
    simple_args = (
        "skip_delete_index",
        "process_deletes",
        "create_new_index",
        "snapshot",
        "index_name",
        "directory",
        "skip_counts",
        "load_type",
    )
    config = set_config(simple_args, options)

    config["fiscal_years"] = fiscal_years_for_processing(options)
    config["directory"] = Path(config["directory"]).resolve()

    if config["create_new_index"] and not config["index_name"]:
        raise SystemExit("Fatal error: --create-new-index requires --index-name.")
    elif config["create_new_index"]:
        config["index_name"] = config["index_name"].lower()
        config["starting_date"] = default_datetime
        check_new_index_name_is_ok(
            config["index_name"],
            settings.ES_AWARDS_NAME_SUFFIX if config["load_type"] == "awards" else settings.ES_TRANSACTIONS_NAME_SUFFIX,
        )
    elif options["start_datetime"]:
        config["starting_date"] = options["start_datetime"]
    else:
        # Due to the queries used for fetching postgres data,
        # `starting_date` needs to be present and a date before:
        #   - The earliest records in S3.
        #   - When all transaction records in the USAspending SQL database were updated.
        # And keep it timezone-aware for S3
        config["starting_date"] = get_last_load_date(f"es_{options['load_type']}", default=default_datetime)

    config["max_query_size"] = settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW
    if options["load_type"] == "awards":
        config["max_query_size"] = settings.ES_AWARDS_MAX_RESULT_WINDOW

    config["is_incremental_load"] = not bool(config["create_new_index"]) and (
        config["starting_date"] != default_datetime
    )

    if config["is_incremental_load"]:
        write_alias = settings.ES_TRANSACTIONS_WRITE_ALIAS
        if config["load_type"] == "awards":
            write_alias = settings.ES_AWARDS_WRITE_ALIAS
        if config["index_name"]:
            printf({"msg": f"Ignoring provided index name, using alias '{write_alias}' for incremental load"})
        config["index_name"] = write_alias
        if not es_client.cat.aliases(name=write_alias):
            printf({"msg": f"Fatal error: write alias '{write_alias}' is missing"})
            raise SystemExit(1)
        # Force manual refresh for atomic transaction-like delete/re-add consistency during incremental load.
        # Turned back on at end.
        toggle_refresh_off(es_client, config["index_name"])
    else:
        if es_client.indices.exists(config["index_name"]):
            printf({"msg": "Fatal error: data load into existing index. Change index name or run an incremental load"})
            raise SystemExit(1)

    if not config["directory"].is_dir():
        printf({"msg": "Fatal error: provided directory does not exist"})
        raise SystemExit(1)
    elif config["starting_date"] < default_datetime:
        printf({"msg": f"Fatal error: --start-datetime is too early. Set no earlier than {default_datetime}"})
        raise SystemExit(1)
    elif not config["is_incremental_load"] and config["process_deletes"]:
        printf({"msg": "Skipping deletions for this load, --deleted overwritten to False"})
        config["process_deletes"] = False

    config["ingest_wait"] = options["idle_wait_time"]

    return config

def process_cli_parameters(options: dict, es_client) -> dict:
    default_datetime = datetime.strptime("{}+0000".format(settings.API_SEARCH_MIN_DATE), "%Y-%m-%d%z")
    simple_args = (
        "process_deletes",
        "create_new_index",
        "snapshot",
        "index_name",
        "directory",
        "skip_counts",
        "skip_delete_index",
    )
    config = set_config(simple_args, options)

    config["fiscal_years"] = fiscal_years_for_processing(options)
    config["directory"] = Path(config["directory"]).resolve()

    if config["create_new_index"] and not config["index_name"]:
        raise SystemExit("Fatal error: --create-new-index requires --index-name.")
    elif config["create_new_index"]:
        config["index_name"] = config["index_name"].lower()
        config["starting_date"] = default_datetime
        check_new_index_name_is_ok(config["index_name"])
    elif options["start_datetime"]:
        config["starting_date"] = options["start_datetime"]
    else:
        # Due to the queries used for fetching postgres data,
        # `starting_date` needs to be present and a date before:
        #   - The earliest records in S3.
        #   - When all transaction records in the USAspending SQL database were updated.
        # And keep it timezone-aware for S3
        config["starting_date"] = get_last_load_date("es_transactions", default=default_datetime)

    config["max_query_size"] = settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW

    config["is_incremental_load"] = not bool(config["create_new_index"]) and (
        config["starting_date"] != default_datetime
    )

    if config["is_incremental_load"]:
        if config["index_name"]:
            msg = "Ignoring provided index name, using alias '{}' for incremental load"
            printf({"msg": msg.format(settings.ES_TRANSACTIONS_WRITE_ALIAS)})
        config["index_name"] = settings.ES_TRANSACTIONS_WRITE_ALIAS
        if not es_client.cat.aliases(name=settings.ES_TRANSACTIONS_WRITE_ALIAS):
            printf({"msg": "Fatal error: write alias '{}' is missing".format(settings.ES_TRANSACTIONS_WRITE_ALIAS)})
            raise SystemExit(1)
    else:
        if es_client.indices.exists(config["index_name"]):
            printf({"msg": "Fatal error: data load into existing index. Change index name or run an incremental load"})
            raise SystemExit(1)

    if not config["directory"].is_dir():
        printf({"msg": "Fatal error: provided directory does not exist"})
        raise SystemExit(1)
    elif config["starting_date"] < default_datetime:
        printf({"msg": "Fatal error: --start-datetime is too early. Set no earlier than {}".format(default_datetime)})
        raise SystemExit(1)
    elif not config["is_incremental_load"] and config["process_deletes"]:
        printf({"msg": "Skipping deletions for this load, --deleted overwritten to False"})
        config["process_deletes"] = False

    return config

def controller(self):
    download_queue = Queue()  # Queue for jobs which need a csv downloaded
    es_ingest_queue = Queue(10)  # Queue for jobs which have a csv and are ready for ES ingest

    job_id = 0
    for fy in self.config['fiscal_years']:
        for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
            job_id += 1
            award_category = AWARD_DESC_CATEGORIES[awd_cat_idx]
            index = '{}-{}-{}'.format(settings.TRANSACTIONS_INDEX_ROOT, award_category, fy)
            filename = '{dir}{fy}_transactions_{type}.csv'.format(
                dir=self.config['directory'], fy=fy, type=awd_cat_idx.replace(' ', ''))

            new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

            if os.path.exists(filename):
                # This is mostly for testing. If previous CSVs still exist skip the download for that file
                if self.config['stale']:
                    new_job.count = csv_row_count(filename)
                    printf({
                        'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                        'job': new_job.name,
                        'f': 'Download'})
                    # Add job directly to the Elasticsearch ingest queue since the CSV exists
                    es_ingest_queue.put(new_job)
                    continue
                else:
                    os.remove(filename)
            download_queue.put(new_job)

    printf({'msg': 'There are {} jobs to process'.format(job_id)})

    if self.config['provide_deleted']:
        s3_delete_process = Process(target=deleted_transactions, args=(ES, self.config))
    download_process = Process(target=download_db_records, args=(download_queue, es_ingest_queue, self.config))
    es_index_process = Process(target=es_data_loader, args=(ES, download_queue, es_ingest_queue, self.config))

    download_process.start()

    if self.config['provide_deleted']:
        s3_delete_process.start()
        while s3_delete_process.is_alive():
            printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
            sleep(7)

    es_index_process.start()

    if self.config['provide_deleted']:
        s3_delete_process.join()
    download_process.join()
    es_index_process.join()
