def prepare_for_etl(self) -> None:
    if self.config["process_deletes"]:
        self.run_deletes()

    logger.info(format_log("Assessing data to process"))
    self.record_count, self.min_id, self.max_id = count_of_records_to_process(self.config)

    if self.record_count == 0:
        self.processes = []
        return

    self.config["partitions"] = self.determine_partitions()
    self.config["processes"] = min(self.config["processes"], self.config["partitions"])
    self.tasks = self.construct_tasks()

    logger.info(
        format_log(
            f"Created {len(self.tasks):,} task partitions"
            f" to process {self.record_count:,} total {self.config['data_type']} records"
            f" from ID {self.min_id} to {self.max_id}"
            f" with {self.config['processes']:,} parallel processes"
        )
    )

    if self.config["create_new_index"]:
        # Ensure the template for the index is present and at the latest version
        call_command("es_configure", "--template-only", f"--load-type={self.config['data_type']}")
        create_index(self.config["index_name"], instantiate_elasticsearch_client())
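
# A minimal sketch of how the partition count could be derived from the ID range and the
# configured partition size (a hedged assumption; the actual determine_partitions/construct_tasks
# logic may differ). Ceil division guarantees at least one partition whenever records exist.
from math import ceil

def determine_partitions_sketch(min_id: int, max_id: int, partition_size: int) -> int:
    """Return how many fixed-width ID ranges are needed to cover [min_id, max_id]."""
    id_span = max_id - min_id + 1
    return ceil(id_span / partition_size)

# e.g. determine_partitions_sketch(1, 10_000, 2_500) -> 4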
def parse_cli_args(options: dict, es_client) -> dict:
    passthrough_values = (
        "create_new_index",
        "drop_db_view",
        "index_name",
        "load_type",
        "partition_size",
        "process_deletes",
        "processes",
        "skip_counts",
        "skip_delete_index",
    )
    config = set_config(passthrough_values, options)

    if config["create_new_index"] and not config["index_name"]:
        raise SystemExit("Fatal error: '--create-new-index' requires '--index-name'.")
    elif config["create_new_index"]:
        config["index_name"] = config["index_name"].lower()
        config["starting_date"] = config["initial_datetime"]
        check_new_index_name_is_ok(config["index_name"], config["required_index_name"])
    elif options["start_datetime"]:
        config["starting_date"] = options["start_datetime"]
    else:
        # Due to the queries used for fetching postgres data, `starting_date` needs to be
        # present and a date before:
        #   - the earliest records in S3
        #   - when all transaction records in the USAspending SQL database were updated
        # It must also stay timezone-aware for S3.
        config["starting_date"] = get_last_load_date(config["stored_date_key"], default=config["initial_datetime"])

    config["is_incremental_load"] = not bool(config["create_new_index"]) and (
        config["starting_date"] != config["initial_datetime"]
    )

    if config["is_incremental_load"]:
        if config["index_name"]:
            logger.info(
                format_log(f"Ignoring provided index name, using alias '{config['write_alias']}' for safety")
            )
        config["index_name"] = config["write_alias"]
        if not es_client.cat.aliases(name=config["write_alias"]):
            logger.error(f"Write alias '{config['write_alias']}' is missing")
            raise SystemExit(1)
    else:
        if es_client.indices.exists(config["index_name"]):
            logger.error("Data load into existing index. Change index name or run an incremental load")
            raise SystemExit(1)

    if config["starting_date"] < config["initial_datetime"]:
        logger.error(f"--start-datetime is too early. Set no earlier than {config['initial_datetime']}")
        raise SystemExit(1)

    return config
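
# Illustrative shape of the config dict that parse_cli_args returns for a full (non-incremental)
# reload. All values below are made up for illustration; additional keys (e.g. sql_view,
# write_alias, stored_date_key, required_index_name) are populated by set_config at runtime.
from datetime import datetime, timezone

example_config = {
    "create_new_index": True,
    "index_name": "example-transaction-index",   # hypothetical name
    "load_type": "transaction",
    "partition_size": 250_000,                   # assumed value
    "processes": 8,                              # assumed value
    "process_deletes": False,
    "initial_datetime": datetime(2007, 10, 1, tzinfo=timezone.utc),
    "starting_date": datetime(2007, 10, 1, tzinfo=timezone.utc),
    "is_incremental_load": False,
}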
def complete_process(self) -> None:
    client = instantiate_elasticsearch_client()
    if self.config["create_new_index"]:
        set_final_index_config(client, self.config["index_name"])
        if self.config["skip_delete_index"]:
            logger.info(format_log("Skipping deletion of old indices"))
        else:
            logger.info(format_log("Closing old indices and adding aliases"))
            swap_aliases(client, self.config)

    if self.config["is_incremental_load"]:
        toggle_refresh_on(client, self.config["index_name"])
        logger.info(
            format_log(f"Storing datetime {self.config['processing_start_datetime']} for next incremental load")
        )
        update_last_load_date(self.config["stored_date_key"], self.config["processing_start_datetime"])
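
# A minimal sketch of what the refresh-toggling helpers might do (a hedged assumption; the
# project's toggle_refresh_on/toggle_refresh_off may differ). Bulk indexing is faster with the
# refresh interval disabled, so it is switched off for incremental loads and restored here.
def toggle_refresh_off_sketch(client, index_name: str) -> None:
    client.indices.put_settings(index=index_name, body={"refresh_interval": "-1"})

def toggle_refresh_on_sketch(client, index_name: str) -> None:
    client.indices.put_settings(index=index_name, body={"refresh_interval": "1s"})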
def run_deletes(self) -> None:
    logger.info(format_log("Processing deletions"))
    client = instantiate_elasticsearch_client()
    if self.config["data_type"] == "award":
        deleted_awards(client, self.config)
    elif self.config["data_type"] == "transaction":
        deleted_transactions(client, self.config)
    else:
        raise RuntimeError(f"No delete function implemented for type {self.config['data_type']}")
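
# Generic sketch of a delete helper for comparison only; the project's deleted_awards and
# deleted_transactions are more involved (e.g. gathering deleted IDs from external sources) and
# are not reproduced here. Given a list of document ids, this removes them with a terms query.
def delete_docs_by_id_sketch(client, index_name: str, doc_ids: list) -> None:
    if not doc_ids:
        return
    client.delete_by_query(
        index=index_name,
        body={"query": {"terms": {"_id": doc_ids}}},
        conflicts="proceed",  # don't abort on version conflicts from concurrent writes
    )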
def dispatch_tasks(self) -> None:
    _abort = Event()  # When set, this event signals that an error occurred in a subprocess
    parallel_procs = self.config["processes"]
    with Pool(parallel_procs, maxtasksperchild=1, initializer=init_shared_abort, initargs=(_abort,)) as pool:
        pool.map(extract_transform_load, self.tasks)

    msg = f"Total documents indexed: {total_doc_success.value}, total document fails: {total_doc_fail.value}"
    logger.info(format_log(msg))

    if _abort.is_set():
        raise RuntimeError("One or more partitions failed!")
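
# Likely shape of the Pool initializer used above (a hedged assumption; the project's
# init_shared_abort may differ): each worker process stores the shared Event in a module-level
# global named `abort`, which is what extract_transform_load checks and sets below.
def init_shared_abort_sketch(shared_event) -> None:
    global abort
    abort = shared_event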
def extract_transform_load(task: TaskSpec) -> None:
    if abort.is_set():
        logger.warning(format_log(f"Skipping partition #{task.partition_number} due to previous error", name=task.name))
        return

    start = perf_counter()
    msg = f"Started processing on partition #{task.partition_number}: {task.name}"
    logger.info(format_log(msg, name=task.name))

    client = instantiate_elasticsearch_client()
    try:
        records = task.transform_func(task, extract_records(task))
        if abort.is_set():
            msg = f"Prematurely ending partition #{task.partition_number} due to error in another process"
            logger.warning(format_log(msg, name=task.name))
            return
        success, fail = load_data(task, records, client)
        with total_doc_success.get_lock():
            total_doc_success.value += success
        with total_doc_fail.get_lock():
            total_doc_fail.value += fail
    except Exception:
        if abort.is_set():
            msg = f"Partition #{task.partition_number} failed after an error was previously encountered"
            logger.warning(format_log(msg, name=task.name))
        else:
            logger.error(format_log(f"{task.name} failed!", name=task.name))
            abort.set()
    else:
        msg = f"Partition #{task.partition_number} was successfully processed in {perf_counter() - start:.2f}s"
        logger.info(format_log(msg, name=task.name))
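
# Likely module-level definitions of the shared counters incremented above, shown here for
# reference (a hedged assumption; in the real module they would sit near the top of the file).
# multiprocessing.Value gives each counter an OS-level lock, which is why the workers wrap the
# increments in get_lock().
from multiprocessing import Value

total_doc_success = Value("i", 0, lock=True)  # documents successfully indexed across all workers
total_doc_fail = Value("i", 0, lock=True)     # documents that failed to index across all workers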
def handle(self, *args, **options):
    elasticsearch_client = instantiate_elasticsearch_client()
    config = parse_cli_args(options, elasticsearch_client)

    start = perf_counter()
    logger.info(format_log(f"Starting script\n{'=' * 56}"))
    start_msg = "target index: {index_name} | Starting from: {starting_date}"
    logger.info(format_log(start_msg.format(**config)))
    ensure_view_exists(config["sql_view"], force=True)
    error_addition = ""
    loader = Controller(config)

    if config["is_incremental_load"]:
        toggle_refresh_off(elasticsearch_client, config["index_name"])  # Turned back on at the end

    try:
        if config["process_deletes"]:
            loader.run_deletes()
        if not config["deletes_only"]:
            loader.prepare_for_etl()
            loader.dispatch_tasks()
    except Exception as e:
        logger.error(f"{str(e)}")
        error_addition = "before encountering a problem during execution.... "
        raise SystemExit(1)
    else:
        loader.complete_process()
        if config["drop_db_view"]:
            logger.info(format_log(f"Dropping SQL view '{config['sql_view']}'"))
            drop_etl_view(config["sql_view"], True)
    finally:
        msg = f"Script duration was {perf_counter() - start:.2f}s {error_addition}|"
        headers = f"{'-' * (len(msg) - 2)} |"
        logger.info(format_log(headers))
        logger.info(format_log(msg))
        logger.info(format_log(headers))

    # Used to help the pipeline determine when the job passed but needs attention
    if config["raise_status_code_3"]:
        raise SystemExit(3)
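
# Hedged sketch of how the flags referenced in the messages above ('--create-new-index',
# '--index-name', '--load-type', '--start-datetime') might be declared on the management
# command class; the project's actual add_arguments has more options and may use different
# types, defaults, and help text.
def add_arguments(self, parser):
    parser.add_argument("--create-new-index", action="store_true")
    parser.add_argument("--index-name", type=str)
    parser.add_argument("--load-type", type=str, choices=["award", "transaction"])
    parser.add_argument("--start-datetime", type=str)  # parsed to a timezone-aware datetime downstream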