def handle(self, *args, **options):
    try:
        with Timer("Loading TAS from {}".format(options["location"] or "Broker")):
            if options["location"]:
                self.csv_tas_loader(options["location"])
            else:
                call_command("run_sql", "-f", TAS_SQL_PATH)

        # Update TAS agency links.
        with Timer("Updating TAS agencies"):
            count = update_treasury_appropriation_account_agencies()
            logger.info(f"  Updated {count:,} TAS agency links")

        # Update Federal Accounts from TAS.
        with Timer("Updating Federal Accounts from TAS"):
            deletes = remove_empty_federal_accounts()
            logger.info(f"  Removed {deletes:,} Federal Account Rows")
            updates = update_federal_accounts()
            logger.info(f"  Updated {updates:,} Federal Account Rows")
            inserts = insert_federal_accounts()
            logger.info(f"  Created {inserts:,} Federal Account Rows")
            links = link_treasury_accounts_to_federal_accounts()
            logger.info(f"  Linked {links:,} Treasury Accounts to Federal Accounts")
            agencies = update_federal_account_agency()
            logger.info(f"  Updated {agencies:,} Federal Account agency links")

        logger.info("=== TAS loader finished successfully! ===")
    except Exception as e:
        logger.error(e)
        logger.error("=== TAS loader failed ===")
        sys.exit(1)
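# Every snippet in this section leans on the same Timer context manager (in
# usaspending-api it lives under usaspending_api.common.helpers). The sketch
# below is NOT the real implementation; it is a minimal stand-in that satisfies
# the interface these call sites assume: an optional message, optional
# success/failure loggers, and the manual log_starting_message() /
# log_success_message() hooks used around transaction commits.
import logging
import time

logger = logging.getLogger(__name__)


class Timer:
    """Minimal sketch of the Timer interface assumed by the snippets in this section."""

    def __init__(self, message=None, success_logger=logger.info, failure_logger=logger.error):
        self.message = message or "Operation"
        self.success_logger = success_logger
        self.failure_logger = failure_logger
        self._start = time.perf_counter()

    def log_starting_message(self):
        # Reset the clock so manual starting/success pairs time just the span between them.
        self._start = time.perf_counter()
        self.success_logger(f"{self.message} starting...")

    def log_success_message(self):
        elapsed = time.perf_counter() - self._start
        self.success_logger(f"{self.message} finished successfully after {elapsed:,.2f}s")

    def __enter__(self):
        self.log_starting_message()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is None:
            self.log_success_message()
        else:
            elapsed = time.perf_counter() - self._start
            self.failure_logger(f"{self.message} failed after {elapsed:,.2f}s")
        return False  # never swallow exceptions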
def run_script(self, *args, **options):
    self.start_time = datetime.now(timezone.utc)
    self.options = options

    if self.options["incremental_date"]:
        self.is_incremental = True
        self.options["datetime"] = self.obtain_last_date()

    if self.options["process_deletes"]:
        delete_date = self.options["datetime"]
        if not delete_date:
            delete_date = self.begining_of_time
        with Timer(message="Processing deletes"):
            delete_job_status = call_command(self.delete_management_command, f"--date={delete_date}")
        if delete_job_status != 0:
            raise RuntimeError("Fatal error. Problem with the deletes")

    try:
        with Timer(message="Load Process"):
            self.process()
        self.successful_run = True
    except (Exception, SystemExit, KeyboardInterrupt):
        logger.exception("Fatal error")
    finally:
        self.cleanup()
def process(self) -> None:
    with Timer(message="Compiling IDs to process", success_logger=logger.info, failure_logger=logger.error):
        self.file_path, self.total_ids_to_process = self.compile_transactions_to_process()
        logger.info(f"{self.total_ids_to_process:,} IDs stored")

    with Timer(message="Transferring Data", success_logger=logger.info, failure_logger=logger.error):
        self.copy_broker_table_data(self.broker_source_table_name, self.destination_table_name, self.shared_pk)
def _perform_load(self):
    self._create_staging_table()
    self._import_input_file()

    natural_key_columns = ["feature_id", "state_alpha", "county_sequence", "county_numeric"]
    destination_table = ETLTable(self.destination_table_name, key_overrides=natural_key_columns)
    source_table = ETLTemporaryTable(self.staging_table_name)

    with Timer("Delete obsolete rows"):
        deleted_count = operations.delete_obsolete_rows(source_table, destination_table)
        logger.info(f"{deleted_count:,} rows deleted")

    with Timer("Update changed rows"):
        updated_count = operations.update_changed_rows(source_table, destination_table)
        logger.info(f"{updated_count:,} rows updated")

    with Timer("Insert missing rows"):
        inserted_count = operations.insert_missing_rows(source_table, destination_table)
        logger.info(f"{inserted_count:,} rows inserted")

    with Timer("Remove staging table"):
        execute_dml_sql(f'drop table if exists "{self.staging_table_name}"')

    # Guard against runaway loads: refuse to proceed if total changes exceed
    # MAX_CHANGE_PERCENT of the existing record count, unless --force was given.
    change_count = deleted_count + updated_count + inserted_count
    current_record_count = CityCountyStateCode.objects.count()
    max_change_threshold = int(current_record_count * MAX_CHANGE_PERCENT / 100.0)
    if current_record_count > 0 and change_count > max_change_threshold and not self.force:
        raise RuntimeError(
            f"Exceeded maximum number of allowed changes {max_change_threshold:,} ({MAX_CHANGE_PERCENT}%). "
            f"Use --force switch if this was intentional."
        )

    return change_count
def handle(self, *args, **options):
    self.agency_file = options["agency_file"]
    self.force = options["force"]

    logger.info(f"AGENCY FILE: {self.agency_file}")
    logger.info(f"FORCE SWITCH: {self.force}")
    logger.info(f"MAX CHANGE LIMIT: {'unlimited' if self.force else f'{MAX_CHANGES:,}'}")

    with Timer("Load agencies"):
        try:
            with transaction.atomic():
                self._perform_load()
                # Time the implicit COMMIT that fires when the atomic block exits.
                t = Timer("Commit agency transaction")
                t.log_starting_message()
            t.log_success_message()
        except Exception:
            logger.error("ALL CHANGES ROLLED BACK DUE TO EXCEPTION")
            raise

        try:
            self._vacuum_tables()
        except Exception:
            logger.error("CHANGES WERE SUCCESSFULLY COMMITTED EVEN THOUGH VACUUMS FAILED")
            raise
def handle(self, *args, **options):
    self.object_class_file = options["object_class_file"]
    logger.info("OBJECT CLASS FILE: {}".format(self.object_class_file))

    with Timer("Load object classes"):
        try:
            with transaction.atomic():
                self._perform_load()
                t = Timer("Commit transaction")
                t.log_starting_message()
            t.log_success_message()
        except Exception:
            logger.error("ALL CHANGES ROLLED BACK DUE TO EXCEPTION")
            raise

        try:
            self._vacuum_tables()
        except Exception:
            logger.error("CHANGES WERE SUCCESSFULLY COMMITTED EVEN THOUGH VACUUMS FAILED")
            raise
def create_views(self):
    loop = asyncio.new_event_loop()
    tasks = []

    # Create Matviews
    for matview, config in self.matviews.items():
        logger.info(f"Creating Future for matview {matview}")
        sql = (self.matview_dir / config["sql_filename"]).read_text()
        tasks.append(asyncio.ensure_future(async_run_creates(sql, wrapper=Timer(matview)), loop=loop))

    # Create Chunked Matviews
    for matview, config in self.chunked_matviews.items():
        for current_chunk in range(self.chunk_count):
            chunked_matview = f"{matview}_{current_chunk}"
            logger.info(f"Creating Future for chunked matview {chunked_matview}")
            sql = (self.matview_chunked_dir / f"{chunked_matview}.sql").read_text()
            tasks.append(asyncio.ensure_future(async_run_creates(sql, wrapper=Timer(chunked_matview)), loop=loop))

    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()

    if "universal_transaction_matview" in self.chunked_matviews:
        logger.info("Inserting data from universal_transaction_matview chunks into single table.")
        call_command(
            "combine_universal_transaction_matview_chunks",
            chunk_count=self.chunk_count,
            index_concurrency=20,
            matview_dir=self.matview_chunked_dir,
        )

    for view in OVERLAY_VIEWS:
        run_sql(view.read_text(), "Creating Views")

    if self.remove_matviews:
        run_sql(DROP_OLD_MATVIEWS.read_text(), "Drop Old Materialized Views")
def perform_load(self):
    with connection.cursor() as cursor:
        with Timer("Create temporary tables"):
            cursor.execute(CREATE_AND_PREP_TEMP_TABLES)
        for temp_table in TEMP_TABLE_CONTENTS:
            self.populate_temp_table(cursor, temp_table)
        with Timer(f"Reload '{OVERVIEW_TABLE_NAME}'"):
            cursor.execute(CREATE_OVERVIEW_SQL)
def handle(self, *args, **options): with Timer("Refresh Reporting Agency Overview"): try: with transaction.atomic(): self.perform_load() t = Timer("Commit transaction") t.log_starting_message() t.log_success_message() except Exception: logger.error("ALL CHANGES ROLLED BACK DUE TO EXCEPTION") raise
def handle(self, *args, **options):
    try:
        with Timer("Loading TAS from {}".format(options["location"] or "Broker")):
            if options["location"]:
                self.csv_tas_loader(options["location"])
            else:
                call_command("run_sql", "-f", TAS_SQL_PATH)

        # Match funding toptiers by FREC if they didn't match by AID.
        with Timer("Matching Funding Toptier Agency where AID didn't match"):
            unmapped_funding_agencies = TreasuryAppropriationAccount.objects.filter(funding_toptier_agency=None)
            match_count = 0
            msg_str = "=== Found {} unmatched funding agencies across all TAS objects. ==="
            logger.info(msg_str.format(unmapped_funding_agencies.count()))
            for next_tas in unmapped_funding_agencies:
                frec_match = ToptierAgency.objects.filter(toptier_code=next_tas.fr_entity_code).first()
                if frec_match:
                    match_count += 1
                    logger.info(
                        "   Matched unknown funding agency for TAS {} with FREC {}".format(
                            next_tas.tas_rendering_label, next_tas.fr_entity_code
                        )
                    )
                    next_tas.funding_toptier_agency = frec_match
                    next_tas.save()
            logger.info("=== Updated {} Funding Toptier Agencies with a FREC agency. ===".format(match_count))

        # Update TAS FK relationships to Federal Accounts.
        with Timer("Updating TAS FK relationships to Federal Accounts"):
            logger.info("=== Updating TAS FK relationships to Federal Accounts ===")
            updates = update_federal_accounts()
            logger.info("   Updated {} Federal Account Rows".format(updates))
            inserts = insert_federal_accounts()
            logger.info("   Created {} Federal Account Rows".format(inserts))
            deletes = remove_empty_federal_accounts()
            logger.info("   Removed {} Federal Account Rows".format(deletes))

        logger.info("=== TAS loader finished successfully! ===")
    except Exception as e:
        logger.error(e)
        logger.error("=== TAS loader failed ===")
        sys.exit(1)
def _perform_load(self, where):
    """The actual heavy lifting. Calls the SQL files necessary to load subawards."""
    if not self.full_reload:
        self._execute_sql_file("010_find_new_awards.sql")

    with transaction.atomic():
        if self.full_reload:
            with Timer("delete from broker_subaward"):
                self._execute_sql("delete from broker_subaward")
        self._execute_sql_file("020_import_broker_subawards.sql", where=where)
        self._execute_sql_file("030_frame_out_subawards.sql")
        self._execute_sql_file("040_enhance_with_awards_data.sql")
        self._execute_sql_file("050_enhance_with_transaction_data.sql")
        self._execute_sql_file("060_enhance_with_cfda_data.sql")
        self._execute_sql_file("070_enhance_with_awarding_agency.sql")
        self._execute_sql_file("080_enhance_with_funding_agency.sql")
        self._execute_sql_file("090_create_temp_address_table.sql")
        self._execute_sql_file("100_enhance_with_pop_county_city.sql")
        self._execute_sql_file("110_enhance_with_pop_country.sql")
        self._execute_sql_file("120_enhance_with_recipient_location_county_city.sql")
        self._execute_sql_file("130_enhance_with_recipient_location_country.sql")

    # For update_award_totals, if we're performing a full reload, update all
    # awards that have subawards; otherwise, if this is an incremental update,
    # just update those awards that may have been affected.
    if self.full_reload:
        where = "where award_id is not null"
    else:
        where = "where award_id in (select award_id from temp_load_subawards_subaward where award_id is not null)"

    with transaction.atomic():
        if self.full_reload:
            with Timer("delete from subaward"):
                self._execute_sql("delete from subaward")
        self._execute_sql_file("140_add_subawards.sql")
        if self.full_reload:
            self._execute_sql_file("150_reset_unlinked_award_subaward_fields.sql")
        self._execute_sql_file("160_update_award_subaward_fields.sql", where=where)

    # We'll do this outside of the transaction since it doesn't hurt anything
    # to reimport subawards.
    self._execute_sql_file("170_mark_imported.sql")

    self._execute_sql_file("900_cleanup.sql")
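# The _execute_sql_file helper used above is not shown in this section. A
# minimal sketch consistent with its call sites might look like the following;
# the directory name, the {where} placeholder convention, and the abbreviated
# Command class are all assumptions, not the real implementation.
from pathlib import Path

from django.db import connection

# Hypothetical location of the numbered SQL files; the real path is configured elsewhere.
SQL_DIR = Path(__file__).resolve().parent / "load_subawards_sql"


class Command:  # abbreviated; in the real code these are methods on the management command
    def _execute_sql(self, sql):
        with connection.cursor() as cursor:
            cursor.execute(sql)

    def _execute_sql_file(self, filename, where=""):
        """Read a numbered SQL file and run it, splicing in the optional WHERE clause."""
        # Plain string replacement avoids clashing with other braces in the SQL.
        sql = (SQL_DIR / filename).read_text().replace("{where}", where)
        with Timer(filename):
            self._execute_sql(sql)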
def handle(self, *args, **options):
    self.full_reload = options["full_reload"]
    logger.info("FULL RELOAD SWITCH: {}".format(self.full_reload))

    with Timer("Load subawards"):
        try:
            with transaction.atomic():
                self._perform_load()
                t = Timer("Commit subaward transaction")
                t.log_starting_message()
            t.log_success_message()
        except Exception:
            logger.error("ALL CHANGES ROLLED BACK DUE TO EXCEPTION")
            raise
def handle(self, *args, **options):
    # Pick out just the last bit of the module name (which should match the
    # file name minus the extension).
    module_name = __name__.split(".")[-1]

    with Timer(module_name):
        agency_file = options["agency_file"]

        with Timer("Retrieve new CGACs"):
            new_cgacs = self._get_new_cgacs(agency_file)
        logger.info("{:,} CGACs read from '{}'.".format(len(new_cgacs), agency_file))
        if len(new_cgacs) < 1:
            raise RuntimeError("No CGAC values were found in '{}'.".format(agency_file))

        if options["force"]:
            logger.info("--force switch provided. Performing a full load.")
        else:
            with Timer("Retrieve old CGACs"):
                old_cgacs = self._get_old_cgacs()
            logger.info("{:,} CGACs read from cgac table".format(len(old_cgacs)))
            if not old_cgacs:
                logger.info("No CGACs in cgac table. Performing a full load.")
            else:
                diff = self._diff_cgacs(new_cgacs, old_cgacs, agency_file)
                if diff > 0:
                    logger.info("Found {:,} differences. Performing a full load.".format(diff))
                else:
                    logger.info("No differences found. No load will be performed.")
                    return

        with Timer("Load new CGACs"):
            self._load_cgacs(new_cgacs)
        logger.info("{:,} CGACs loaded.".format(len(new_cgacs)))
def handle(self, *args, **options):
    self.failed_state = False

    if options["all_time"]:
        with Timer("Processing all available time periods"):
            self.run_for_all_fiscal_years()
    elif options["ids"]:
        with Timer("Processing list of submissions"):
            self.run_specific_submissions(options["ids"])
    else:
        logger.error("Nothing to do! Use --all-time or --ids")
        return

    if self.failed_state:
        raise SystemExit("Submission errored. Returning failure exit code")

    logger.info("Completed Successfully")
def part_one(self):
    with Timer("Part 1: Addressing Missing Award Records"):
        with connection.cursor() as cursor:
            cursor.execute(CHECK_MISSING_AWARDS.format(table="transaction_fabs", field="afa_generated_unique"))
            fabs_to_load = [str(row[0]) for row in cursor.fetchall()]
            self.metrics["count_of_fabs_to_reload"] = len(fabs_to_load)

            cursor.execute(
                CHECK_MISSING_AWARDS.format(table="transaction_fpds", field="detached_award_procurement_id")
            )
            fpds_to_load = [str(row[0]) for row in cursor.fetchall()]
            self.metrics["count_of_fpds_to_reload"] = len(fpds_to_load)

        if fabs_to_load:
            logger.warning(f"NEED TO LOAD {len(fabs_to_load)} transactions from Broker!")
            logger.info(
                "This script will not handle this reload. "
                "Store these IDs in a file for fabs_nightly_loader --afa-id-file:"
                f"\n{' '.join(fabs_to_load)}"
                "\n(note: check the case of -none- vs -NONE-)"
            )
        else:
            logger.info("No FABS transactions need to be reloaded")

        if fpds_to_load:
            logger.info(f"Reloading {len(fpds_to_load)} FPDS transactions from Broker\n{' '.join(fpds_to_load)}")
            call_command("load_fpds_transactions", "--ids", *fpds_to_load)
        else:
            logger.info("No FPDS transactions need to be reloaded")
def handle(self, *args, **options):
    # Pick out just the last bit of the module name (which should match the
    # file name minus the extension).
    module_name = __name__.split(".")[-1]

    with Timer(module_name):
        self.full_reload = options["full_reload"]
        self.log_sql = options["sql"]

        if self.full_reload:
            logger.info("--full-reload flag supplied. Performing a full reload.")
            where = ""
        else:
            max_id, max_created_at, max_updated_at = self._get_maxes()
            if max_id is None and max_created_at is None and max_updated_at is None:
                logger.info("USAspending's broker_subaward table is empty. Promoting to full reload.")
                self.full_reload = True
                where = ""
            else:
                logger.info(
                    "Performing an incremental load on USAspending's broker_subaward "
                    "where id > {}, created_at > {}, or "
                    "updated_at > {}".format(max_id, max_created_at, max_updated_at)
                )
                where = self._build_where(id=max_id, created_at=max_created_at, updated_at=max_updated_at)

        self._perform_load(where)
def populate_temp_table(self, cursor: connection.cursor, temp_table: TempTableName) -> None:
    sql_template = "INSERT INTO {0} SELECT * FROM ({1}) AS {0}_contents;"
    with Timer(f"Populate '{temp_table.value}'"):
        cursor.execute(sql_template.format(temp_table.value, TEMP_TABLE_CONTENTS[temp_table]))
def handle(self, *args, **options): with Timer("Backdelete undeleted FPDS transactions"): filepath = str(Path(__file__).resolve().parent / "backdelete_missing_fpds_deletes.txt") with open(filepath) as f: offending_ids = tuple(int(id_) for id_ in f if id_.strip().isnumeric()) logger.info(f"{len(offending_ids):,} detached_award_procurement_ids found in {filepath}") match_count = get_matching_count("data_broker", "detached_award_procurement", offending_ids) if match_count > 0: raise RuntimeError( f"Found {match_count:,} detached_award_procurement_ids in Broker's detached_award_procurement" ) logger.info("Confirmed that none of the detached_award_procurement_ids exist in Broker") # Structure necessary for deletes. delete_ids = {"2020-04-27": offending_ids} with transaction.atomic(): match_count = get_matching_count(DEFAULT_DB_ALIAS, "source_procurement_transaction", offending_ids) logger.info(f"Found {match_count:,} detached_award_procurement_ids in source_procurement_transaction") source_command = SourceCommand() source_command.delete_rows(delete_ids) match_count = get_matching_count(DEFAULT_DB_ALIAS, "transaction_fpds", offending_ids) logger.info(f"Found {match_count:,} detached_award_procurement_ids in transaction_fpds") fpds_command = FPDSCommand() stale_awards = delete_stale_fpds(delete_ids) fpds_command.update_award_records(awards=stale_awards, skip_cd_linkage=False)
def handle(self, *args, **options): with Timer("Refresh Reporting Agency Overview"): try: self.perform_load() except Exception: logger.error("ALL CHANGES ROLLED BACK DUE TO EXCEPTION") raise
def _create_staging_table(self):
    with Timer("Create temporary staging table"):
        execute_dml_sql(f'drop table if exists "{self.staging_table_name}"')
        execute_dml_sql(
            f'create temporary table "{self.staging_table_name}" (like "{self.destination_table_name}" including all)'
        )
        execute_dml_sql(f'alter table "{self.staging_table_name}" drop column "id"')
def handle(self, *args, **options):
    self.apply_corrections = options["apply_corrections"]
    self.do_not_recreate_temp_tables = options["do_not_recreate_temp_tables"]
    self.nuke_temp_tables = options["nuke_temp_tables"]
    self.do_not_create_fabs_delete_files = options["do_not_create_fabs_delete_files"]

    with Timer("Re-synchronize FABS and FPDS transactions"):
        for task in TASKS:
            getattr(self, task["run_me"])(**task)

        self.document_artifacts()

        if not self.apply_corrections:
            logger.info("--apply-corrections switch not supplied. Bailing before any data are harmed.")
            return

        self.delete_fabs_source_records()
        self.add_fabs_source_records()
        self.delete_fpds_source_records()
        self.add_fpds_source_records()

        with transaction.atomic():
            self.delete_and_add_fabs_transaction_records()
            self.delete_and_add_fpds_transaction_records()

        if self.nuke_temp_tables:
            self.drop_temp_tables()
def handle(self, *args, **options): """Overloaded Command Entrypoint""" with Timer(__name__): self.faux_init(options) self.generate_matview_sql() self.create_views() if not self.no_cleanup: self.cleanup()
def _execute_function(function: Callable, timer_message: Optional[str] = None, *args: Any, **kwargs: Any) -> Any:
    """Execute a function and return its result, timing the execution if a timer_message is provided."""
    with Timer(timer_message):
        return function(*args, **kwargs)
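# A usage sketch for _execute_function; the Submission model, fetch_rows
# function, and messages below are hypothetical, shown only to illustrate how
# the signature forwards arguments. The return value passes through untouched.
count = _execute_function(Submission.objects.count, "Count submissions")

# Positional and keyword arguments after timer_message are forwarded to the function.
rows = _execute_function(fetch_rows, "Fetch rows", batch_size=1000)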
def handle(self, *args, **options):
    chunk_count = options["chunk_count"]
    index_concurrency = options["index_concurrency"]
    self.matview_dir = options["matview_dir"]

    logger.info(f"Chunk Count: {chunk_count}")

    with Timer("Recreating table"):
        self.recreate_matview()

    with Timer("Inserting data into table"):
        self.insert_matview_data(chunk_count)

    with Timer("Creating table indexes"):
        self.create_indexes(index_concurrency)

    with Timer("Swapping Tables/Indexes"):
        self.swap_matviews()

    if not options["keep_old_data"]:
        with Timer("Clearing old table"):
            self.remove_old_data(chunk_count)

    if not options["keep_matview_data"]:
        with Timer("Emptying Matviews"):
            self.empty_matviews()

    with Timer("Granting Table Permissions"):
        self.grant_matview_permissions()
def main(self, options):
    if options["inspect"]:
        self.inspect()
        return

    with Timer(__name__):
        self.part_one()
        self.part_two()
        self.part_three()
def _validate_input_file_header(self):
    expected_header = (
        "FEATURE_ID|FEATURE_NAME|FEATURE_CLASS|CENSUS_CODE|CENSUS_CLASS_CODE|GSA_CODE|OPM_CODE|STATE_NUMERIC|"
        "STATE_ALPHA|COUNTY_SEQUENCE|COUNTY_NUMERIC|COUNTY_NAME|PRIMARY_LATITUDE|PRIMARY_LONGITUDE|"
        "DATE_CREATED|DATE_EDITED\n"
    )
    with Timer("Validating input file header"):
        with open(self.working_file, encoding="utf-8-sig") as csv_file:
            header = csv_file.readline()
        if header != expected_header:
            raise RuntimeError("Found header does not match expected header. Invalid file format.")
@contextmanager  # requires `from contextlib import contextmanager`; without it the `with` usage below would fail
def _unzip_file(file_path):
    """
    ZIP file context manager. If the file pointed to by file_path is a ZIP
    file, extracts the file to a temporary location, yields, and cleans up
    afterwards. Otherwise, effectively does nothing.
    """
    if zipfile.is_zipfile(file_path):
        with TemporaryDirectory() as temp_dir:
            with Timer("Unzip file"):
                unzipped_file_path = extract_single_file_zip(file_path, temp_dir)
            yield unzipped_file_path
    else:
        yield file_path
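# A usage sketch for _unzip_file, assuming the @contextmanager fix above. The
# path and the load_file consumer are hypothetical. The same call path handles
# both zipped and plain files, and any temp directory is removed on exit.
with _unzip_file("data/census_places.zip") as file_path:
    load_file(file_path)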
def _import_input_file(self):
    import_command = (
        f'copy "{self.staging_table_name}" (feature_id, feature_name, feature_class, census_code, '
        f"census_class_code, gsa_code, opm_code, state_numeric, state_alpha, county_sequence, county_numeric, "
        f"county_name, primary_latitude, primary_longitude, date_created, date_edited) from stdin with "
        f"(format csv, header, delimiter '|')"
    )
    with Timer("Importing file to staging table"):
        with connection.cursor() as cursor:
            with open(self.working_file, encoding="utf-8-sig") as csv_file:
                cursor.cursor.copy_expert(import_command, csv_file, size=10485760)  # 10MB
            logger.info(f"{cursor.cursor.rowcount:,} rows imported")
def delete_and_add_fabs_transaction_records():
    from usaspending_api.broker.helpers.delete_fabs_transactions import delete_fabs_transactions
    from usaspending_api.broker.helpers.upsert_fabs_transactions import upsert_fabs_transactions

    with Timer("Insert/delete FABS transactions"):
        delete_ids = get_ids(TEMP_TRANSACTION_FABS_DELETE_IDS_TABLE)
        add_ids = get_ids(TEMP_TRANSACTION_FABS_ADD_IDS_TABLE)
        if not delete_ids and not add_ids:
            logger.info("No FABS transaction records to add or delete")
            return
        update_award_ids = delete_fabs_transactions(delete_ids)
        upsert_fabs_transactions(add_ids, update_award_ids)
def handle(self, *args, **options):
    self.warning_status = False
    self.metrics = OpsReporter(job_name="dev_4109_repair_awards.py")
    start = perf_counter()

    with Timer("dev_4109_repair_awards.py"):
        self.main(options)

    self.metrics["end_status"] = 3 if self.warning_status else 0
    self.metrics["duration"] = perf_counter() - start
    if options["metrics"]:
        self.metrics.dump_to_file(options["metrics"])

    logger.info("Complete")