def handle(self, *args, **options):
    """Entry point: load the target Elasticsearch index per the CLI options.

    Builds the run config, makes sure the required ETL SQL view exists for the
    chosen load type, then runs the Rapidloader steps and finalization.
    """
    es_client = instantiate_elasticsearch_client()
    job_config = process_cli_parameters(options, es_client)
    timer_start = perf_counter()

    printf({"msg": f"Starting script\n{'=' * 56}"})
    printf(
        {
            "msg": f"target index: {job_config['index_name']}"
            f" | FY(s): {job_config['fiscal_years']}"
            f" | Starting from: {job_config['starting_date']}"
        }
    )

    # Pick the ETL view matching the load type; other load types need no view.
    etl_views = {
        "transactions": settings.ES_TRANSACTIONS_ETL_VIEW_NAME,
        "awards": settings.ES_AWARDS_ETL_VIEW_NAME,
    }
    view_name = etl_views.get(job_config["load_type"])
    if view_name is not None:
        ensure_view_exists(view_name)

    etl = Rapidloader(job_config, es_client)
    etl.run_load_steps()
    etl.complete_process()

    divider = "---------------------------------------------------------------"
    printf({"msg": divider})
    printf({"msg": f"Script completed in {perf_counter() - timer_start:.2f}s"})
    printf({"msg": divider})
def test_es_award_loader_class(award_data_fixture, elasticsearch_award_index, baby_sleeps):
    """Smoke-test the Rapidloader: it runs its load steps and the target index exists."""
    client = instantiate_elasticsearch_client()
    etl = Rapidloader(config, client)
    assert type(etl).__name__ == "Rapidloader"
    etl.run_load_steps()
    # Index must have been created by the load; then clean it up.
    assert client.indices.exists(config["index_name"])
    client.indices.delete(index=config["index_name"], ignore_unavailable=False)
def handle(self, *args, **options):
    """Entry point: run the transaction ETL end-to-end, keeping client/config on self."""
    self.elasticsearch_client = instantiate_elasticsearch_client()
    self.config = process_cli_parameters(options, self.elasticsearch_client)
    began_at = perf_counter()

    printf({"msg": f"Starting script\n{'=' * 56}"})
    printf(
        {
            "msg": f"target index: {self.config['index_name']}"
            f" | FY(s): {self.config['fiscal_years']}"
            f" | Starting from: {self.config['starting_date']}"
        }
    )

    ensure_transaction_etl_view_exists()
    self.run_load_steps()
    self.complete_process()

    divider = "---------------------------------------------------------------"
    printf({"msg": divider})
    printf({"msg": f"Script completed in {perf_counter() - began_at} seconds"})
    printf({"msg": divider})
def extract_transform_load(task: TaskSpec) -> None:
    """Extract, transform, and load one partition of records into Elasticsearch.

    Cooperates with sibling worker processes via the shared `abort` event:
    skips work if another worker already failed, and sets `abort` itself on
    an unhandled failure. Success/failure document counts are accumulated in
    the shared `total_doc_success` / `total_doc_fail` counters under their locks.

    Args:
        task: the partition descriptor (partition number, name, transform_func, ...).
    """
    if abort.is_set():
        logger.warning(format_log(f"Skipping partition #{task.partition_number} due to previous error", name=task.name))
        return

    start = perf_counter()
    msg = f"Started processing on partition #{task.partition_number}: {task.name}"
    logger.info(format_log(msg, name=task.name))

    client = instantiate_elasticsearch_client()
    try:
        records = task.transform_func(task, extract_records(task))
        if abort.is_set():
            # BUG FIX: this f-string was a bare expression statement, so the stale
            # "Started processing..." message was logged instead of this one.
            msg = f"Prematurely ending partition #{task.partition_number} due to error in another process"
            logger.warning(format_log(msg, name=task.name))
            return
        success, fail = load_data(task, records, client)
        # Shared multiprocessing counters: mutate only while holding their locks.
        with total_doc_success.get_lock():
            total_doc_success.value += success
        with total_doc_fail.get_lock():
            total_doc_fail.value += fail
    except Exception:
        if abort.is_set():
            msg = f"Partition #{task.partition_number} failed after an error was previously encountered"
            logger.warning(format_log(msg, name=task.name))
        else:
            # First failure: record it and signal all other workers to stop.
            logger.error(format_log(f"{task.name} failed!", name=task.name))
            abort.set()
    else:
        msg = f"Partition #{task.partition_number} was successfully processed in {perf_counter() - start:.2f}s"
        logger.info(format_log(msg, name=task.name))
def prepare_for_etl(self) -> None:
    """Size the workload, build task partitions, and (optionally) create the index."""
    cfg = self.config
    if cfg["process_deletes"]:
        self.run_deletes()

    logger.info(format_log("Assessing data to process"))
    self.record_count, self.min_id, self.max_id = count_of_records_to_process(cfg)

    # Nothing to do: no records means no tasks and no worker processes.
    if self.record_count == 0:
        self.processes = []
        return

    cfg["partitions"] = self.determine_partitions()
    # Never spawn more workers than there are partitions to hand out.
    cfg["processes"] = min(cfg["processes"], cfg["partitions"])
    self.tasks = self.construct_tasks()

    summary = (
        f"Created {len(self.tasks):,} task partitions"
        f" to process {self.record_count:,} total {cfg['data_type']} records"
        f" from ID {self.min_id} to {self.max_id}"
        f" with {cfg['processes']:,} parallel processes"
    )
    logger.info(format_log(summary))

    if cfg["create_new_index"]:
        # ensure template for index is present and the latest version
        call_command("es_configure", "--template-only", f"--load-type={cfg['data_type']}")
        create_index(cfg["index_name"], instantiate_elasticsearch_client())
def test_es_award_loader_class(award_data_fixture, elasticsearch_award_index, baby_sleeps, monkeypatch):
    """Smoke-test the Rapidloader with SQL execution mocked out."""
    monkeypatch.setattr("usaspending_api.etl.es_etl_helpers.execute_sql_statement", mock_execute_sql)
    client = instantiate_elasticsearch_client()
    etl = Rapidloader(config, client)
    assert type(etl).__name__ == "Rapidloader"
    etl.run_load_steps()
    # The load should have created the target index; clean it up afterward.
    assert client.indices.exists(config["index_name"])
    client.indices.delete(index=config["index_name"], ignore_unavailable=False)
def run_deletes(self) -> None:
    """Dispatch the deletion routine matching the configured data type."""
    logger.info(format_log("Processing deletions"))
    client = instantiate_elasticsearch_client()
    delete_handlers = {
        "award": deleted_awards,
        "transaction": deleted_transactions,
    }
    handler = delete_handlers.get(self.config["data_type"])
    if handler is None:
        raise RuntimeError(f"No delete function implemented for type {self.config['data_type']}")
    handler(client, self.config)
def test_es_transaction_loader_class(award_data_fixture, elasticsearch_transaction_index, monkeypatch):
    """Smoke-test the Controller on the transaction config with SQL mocked out."""
    monkeypatch.setattr(
        "usaspending_api.etl.elasticsearch_loader_helpers.utilities.execute_sql_statement", mock_execute_sql
    )
    client = instantiate_elasticsearch_client()
    controller = Controller(transaction_config, client)
    assert type(controller).__name__ == "Controller"
    controller.run_load_steps()
    # The load should have created the target index; clean it up afterward.
    assert client.indices.exists(transaction_config["index_name"])
    client.indices.delete(index=transaction_config["index_name"], ignore_unavailable=False)
def complete_process(self) -> None:
    """Finalize the load: settle index config/aliases and record the next load date."""
    client = instantiate_elasticsearch_client()
    cfg = self.config

    if cfg["create_new_index"]:
        set_final_index_config(client, cfg["index_name"])
        if cfg["skip_delete_index"]:
            logger.info(format_log("Skipping deletion of old indices"))
        else:
            logger.info(format_log("Closing old indices and adding aliases"))
            swap_aliases(client, cfg)

    if cfg["is_incremental_load"]:
        toggle_refresh_on(client, cfg["index_name"])
        logger.info(format_log(f"Storing datetime {cfg['processing_start_datetime']} for next incremental load"))
        update_last_load_date(f"{cfg['stored_date_key']}", cfg["processing_start_datetime"])
def handle(self, *args, **options):
    """Entry point for the ES loader command: parse args, run the Controller, finalize.

    Exit behavior: raises SystemExit(1) on any execution error, and SystemExit(3)
    when the run succeeded but the config flags it for pipeline attention.
    """
    elasticsearch_client = instantiate_elasticsearch_client()
    config = parse_cli_args(options, elasticsearch_client)
    start = perf_counter()
    logger.info(format_log(f"Starting script\n{'=' * 56}"))
    start_msg = "target index: {index_name} | Starting from: {starting_date}"
    logger.info(format_log(start_msg.format(**config)))
    # Recreate the ETL SQL view so it matches the current codebase (force=True).
    ensure_view_exists(config["sql_view"], force=True)
    error_addition = ""
    loader = Controller(config)
    if config["is_incremental_load"]:
        toggle_refresh_off(elasticsearch_client, config["index_name"])  # Turned back on at end.
    try:
        if config["process_deletes"]:
            loader.run_deletes()
        if not config["deletes_only"]:
            loader.prepare_for_etl()
            loader.dispatch_tasks()
    except Exception as e:
        # Record the failure for the duration banner, then exit non-zero.
        logger.error(f"{str(e)}")
        error_addition = "before encountering a problem during execution.... "
        raise SystemExit(1)
    else:
        # Only finalize (aliases, refresh, last-load date) on a clean run.
        loader.complete_process()
        if config["drop_db_view"]:
            logger.info(format_log(f"Dropping SQL view '{config['sql_view']}'"))
            drop_etl_view(config["sql_view"], True)
    finally:
        # Always log the duration banner, even when exiting via SystemExit above.
        msg = f"Script duration was {perf_counter() - start:.2f}s {error_addition}|"
        headers = f"{'-' * (len(msg) - 2)} |"
        logger.info(format_log(headers))
        logger.info(format_log(msg))
        logger.info(format_log(headers))

    # Used to help pipeline determine when job passed but needs attention
    if config["raise_status_code_3"]:
        raise SystemExit(3)
def handle(self, *args, **options):
    """Zero out Elasticsearch outlays for Covid awards absent from the latest File C submission."""
    # Initialize client to connect to Elasticsearch
    es_client = instantiate_elasticsearch_client()

    # Open connection to database
    with connection.cursor() as cursor:
        # Queries for Covid Awards not present in latest File C Submission
        cursor.execute(MISSING_COVID_AWARD_SQL)
        logger.info("Found {} Covid awards without entry in latest File C Submission".format(cursor.rowcount))

        # Walk the result set in FETCH_COUNT-sized batches.
        batch = cursor.fetchmany(self.FETCH_COUNT)
        while batch:
            ids_to_zero = [record[0] for record in batch]
            # Sets the outlays of these awards to zero in Elasticsearch
            self.set_elasticsearch_covid_outlays_to_zero(es_client, ids_to_zero)
            batch = cursor.fetchmany(self.FETCH_COUNT)