def nightly_loader(self, start_date):
    logger.info("==== Starting FPDS nightly data load ====")

    if start_date:
        date = datetime.strptime(start_date, "%Y-%m-%d").date()
    else:
        default_last_load_date = datetime.now(timezone.utc) - timedelta(days=1)
        date = get_last_load_date("fpds", default=default_last_load_date).date()
    processing_start_datetime = datetime.now(timezone.utc)

    logger.info("Processing data for FPDS starting from %s" % date)

    with timer("retrieval of new/modified FPDS data ID list", logger.info):
        ids_to_insert = self.get_fpds_transaction_ids(date=date)

    with timer("retrieval of deleted FPDS IDs", logger.info):
        ids_to_delete = self.get_deleted_fpds_data_from_s3(date=date)

    self.perform_load(ids_to_delete, ids_to_insert)

    # Update the date for the last time the data load was run
    update_last_load_date("fpds", processing_start_datetime)

    logger.info("FPDS NIGHTLY UPDATE COMPLETE")


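# The loader above brackets its run with get_last_load_date / update_last_load_date.
# Their backing store is not shown in this section, so the following is a minimal
# in-memory sketch, assuming only the keyed call sites used here (a source name
# plus a caller-supplied default); some snippets below call an earlier
# zero-argument variant of get_last_load_date. The real implementation presumably
# persists the timestamps rather than keeping them in a module-level dict.
_last_load_dates = {}  # hypothetical stand-in for the real persistent store


def get_last_load_date(key, default=None):
    """Return the datetime of the last successful load for `key`, else `default`."""
    return _last_load_dates.get(key, default)


def update_last_load_date(key, load_datetime):
    """Record `load_datetime` as the most recent successful load for `key`."""
    _last_load_dates[key] = load_datetime

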
def handle(self, *args, **options):
    logger.info('Starting updating awarding agencies...')

    fiscal_year = options.get('fiscal_year')[0]

    page = options.get('page')
    limit = options.get('limit')
    page = page[0] if page else 1
    limit = limit[0] if limit else 500000

    if options.get('contracts', None):
        with timer('D1 (contracts/FPDS) awarding/funding agencies updates', logger.info):
            self.update_awarding_funding_agency(fiscal_year, 'D1', page=page, limit=limit)
    elif options.get('assistance', None):
        with timer('D2 (assistance/FABS) awarding/funding agencies updates', logger.info):
            self.update_awarding_funding_agency(fiscal_year, 'D2', page=page, limit=limit)
    else:
        logger.error('Not a valid data type: --assistance,--contracts')

    logger.info('Finished')


def perform_load(self, ids_to_delete, ids_to_insert):
    if len(ids_to_delete) > 0:
        with timer("deletion of all stale FPDS data", logger.info):
            self.delete_stale_fpds(ids_to_delete=ids_to_delete)
    else:
        logger.info("No FPDS records to delete at this juncture")

    if len(ids_to_insert) > 0:
        # Add FPDS records
        with timer("insertion of new FPDS data in batches", logger.info):
            self.insert_all_new_fpds(ids_to_insert)

        # Update Awards based on changed FPDS records
        with timer("updating awards to reflect their latest associated transaction info", logger.info):
            award_record_count = update_awards(tuple(AWARD_UPDATE_ID_LIST))
            logger.info("{} awards updated from their transactional data".format(award_record_count))

        # Update FPDS-specific Awards based on the info in child transactions
        with timer("updating contract-specific awards to reflect their latest transaction info", logger.info):
            award_record_count = update_contract_awards(tuple(AWARD_UPDATE_ID_LIST))
            logger.info("{} awards updated FPDS-specific and exec comp data".format(award_record_count))

        # Check the linkages from file C to FPDS records and update any that are missing
        with timer("updating C->D linkages", logger.info):
            update_c_to_d_linkages("contract")
    else:
        logger.info("No FPDS records to insert or modify at this juncture")


def upsert_fabs_transactions(ids_to_upsert, externally_updated_award_ids):
    if ids_to_upsert or externally_updated_award_ids:
        update_award_ids = copy(externally_updated_award_ids)
        if ids_to_upsert:
            with timer("inserting new FABS data", logger.info):
                update_award_ids.extend(insert_all_new_fabs(ids_to_upsert))

        if update_award_ids:
            update_award_ids = tuple(set(update_award_ids))  # Convert to tuple and remove duplicates.
            with timer("updating awards to reflect their latest associated transaction info", logger.info):
                award_record_count = update_awards(update_award_ids)
                logger.info("{} awards updated from their transactional data".format(award_record_count))

            with timer("updating awards with executive compensation data", logger.info):
                award_record_count = update_assistance_awards(update_award_ids)
                logger.info("{} awards updated FABS-specific and exec comp data".format(award_record_count))

        with timer("updating C->D linkages", logger.info):
            update_c_to_d_linkages("assistance")
    else:
        logger.info("Nothing to insert...")


def handle(self, *args, **options):
    processing_start_datetime = datetime.now(timezone.utc)

    logger.info("Starting FABS data load script...")

    # "Reload all" supersedes all other processing options.
    reload_all = options["reload_all"]
    if reload_all:
        ids = None
        afa_ids = None
        start_datetime = None
        end_datetime = None
    else:
        ids = options["ids"]
        afa_ids = set(options["afa_ids"])
        if options["afa_id_file"]:
            afa_ids = tuple(afa_ids | read_afa_ids_from_file(options["afa_id_file"]))
        start_datetime = options["start_datetime"]
        end_datetime = options["end_datetime"]

    # If no other processing options were provided, then this is an incremental load.
    is_incremental_load = not any((reload_all, ids, afa_ids, start_datetime, end_datetime))

    if is_incremental_load:
        start_datetime = get_incremental_load_start_datetime()
        logger.info(f"Processing data for FABS starting from {start_datetime} (includes offset)")

        # We only perform deletes with incremental loads.
        with timer("obtaining delete records", logger.info):
            delete_records = retrieve_deleted_fabs_transactions(start_datetime, end_datetime)
            ids_to_delete = [item for sublist in delete_records.values() for item in sublist if item]
            ids_to_delete = get_delete_pks_for_afa_keys(ids_to_delete)
            logger.info(f"{len(ids_to_delete):,} delete ids found in total")

    with timer("retrieving IDs of FABS to process", logger.info):
        ids_to_upsert = get_fabs_transaction_ids(ids, afa_ids, start_datetime, end_datetime)

    update_award_ids = delete_fabs_transactions(ids_to_delete) if is_incremental_load else []
    upsert_fabs_transactions(ids_to_upsert, update_award_ids)

    if is_incremental_load:
        logger.info(f"Storing {processing_start_datetime} for the next incremental run")
        update_last_load_date("fabs", processing_start_datetime)

    logger.info("FABS UPDATE FINISHED!")


def handle(self, *args, **options):
    processing_start_datetime = datetime.now(timezone.utc)

    logger.info("Starting FABS data load script...")

    do_not_log_deletions = options["do_not_log_deletions"]

    # "Reload all" supersedes all other processing options.
    reload_all = options["reload_all"]
    if reload_all:
        submission_ids = None
        afa_ids = None
        start_datetime = None
        end_datetime = None
    else:
        submission_ids = tuple(options["submission_ids"]) if options["submission_ids"] else None
        afa_ids = read_afa_ids_from_file(options["afa_id_file"]) if options["afa_id_file"] else None
        start_datetime = options["start_datetime"]
        end_datetime = options["end_datetime"]

    # If no other processing options were provided, then this is an incremental load.
    is_incremental_load = not any((reload_all, submission_ids, afa_ids, start_datetime, end_datetime))

    if is_incremental_load:
        last_load_date = get_last_load_date()
        submission_ids = get_new_submission_ids(last_load_date)
        logger.info("Processing data for FABS starting from %s" % last_load_date)

    if is_incremental_load and not submission_ids:
        logger.info("No new submissions. Exiting.")
    else:
        with timer("obtaining delete records", logger.info):
            ids_to_delete = get_fabs_records_to_delete(submission_ids, afa_ids, start_datetime, end_datetime)

        with timer("retrieving/diff-ing FABS Data", logger.info):
            ids_to_upsert = get_fabs_transaction_ids(submission_ids, afa_ids, start_datetime, end_datetime)

        update_award_ids = delete_fabs_transactions(ids_to_delete, do_not_log_deletions)
        upsert_fabs_transactions(ids_to_upsert, update_award_ids)

        if is_incremental_load:
            update_last_load_date("fabs", processing_start_datetime)

    logger.info("FABS UPDATE FINISHED!")


def handle(self, *args, **options):
    logger.info("Starting historical data load...")

    db_cursor = connections["data_broker"].cursor()
    fiscal_year = options.get("fiscal_year")
    page = options.get("page")
    limit = options.get("limit")

    if fiscal_year:
        fiscal_year = fiscal_year[0]
        logger.info("Processing data for Fiscal Year " + str(fiscal_year))
    else:
        fiscal_year = 2017

    page = page[0] if page else 1
    limit = limit[0] if limit else 500000

    if not options["assistance"]:
        with timer("D1 historical data load", logger.info):
            self.update_transaction_contract(db_cursor=db_cursor, fiscal_year=fiscal_year, page=page, limit=limit)

    if not options["contracts"]:
        with timer("D2 historical data load", logger.info):
            self.update_transaction_assistance(db_cursor=db_cursor, fiscal_year=fiscal_year, page=page, limit=limit)

    with timer("updating awards to reflect their latest associated transaction info", logger.info):
        update_awards(tuple(award_update_id_list))

    with timer("updating assistance-specific awards to reflect their latest transaction info", logger.info):
        update_assistance_awards(tuple(award_assistance_update_id_list))

    with timer("updating contract-specific awards to reflect their latest transaction info", logger.info):
        update_contract_awards(tuple(award_contract_update_id_list))

    # Done!
    logger.info("FINISHED")


def test_timer(capsys):
    """Verify that timer helper executes without error"""
    with timer():
        print("Doing a thing")
    output = capsys.readouterr()[0]
    assert "Beginning" in output
    assert "finished" in output


def test_timer_times(capsys):
    """Verify that timer shows longer times for slower operations"""
    pattern = re.compile(r"([\d\.e\-]+)s")

    with timer():
        print("Doing a thing")
    output0 = capsys.readouterr()[0]
    time0 = float(pattern.search(output0).group(1))

    with timer():
        print("Doing a slower thing")
        time.sleep(0.1)
    output1 = capsys.readouterr()[0]
    time1 = float(pattern.search(output1).group(1))

    assert time1 > time0


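# The two tests above exercise a `timer` context manager, which is also used
# throughout this section with a message and a logger (e.g. timer("...", logger.info)).
# Its implementation is not shown here; the following is a minimal sketch, assuming
# only the behavior the tests assert: a "Beginning" line when the block starts and
# a "finished ... <elapsed>s" line when it ends. The real helper may differ in
# wording and internals.
import time
from contextlib import contextmanager


@contextmanager
def timer(message="", writer=print):
    """Emit a 'Beginning' line, run the block, then report elapsed seconds."""
    writer(f"Beginning {message}...")
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start
        writer(f"... finished {message} in {elapsed}s")

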
def handle(self, *args, **options):
    logger.info('Starting historical data load...')

    db_cursor = connections['data_broker'].cursor()
    fiscal_year = options.get('fiscal_year')
    page = options.get('page')
    limit = options.get('limit')

    if fiscal_year:
        fiscal_year = fiscal_year[0]
        logger.info('Processing data for Fiscal Year ' + str(fiscal_year))
    else:
        fiscal_year = 2017

    page = page[0] if page else 1
    limit = limit[0] if limit else 500000

    if not options['assistance']:
        with timer('D1 historical data load', logger.info):
            self.update_transaction_contract(db_cursor=db_cursor, fiscal_year=fiscal_year, page=page, limit=limit)

    if not options['contracts']:
        with timer('D2 historical data load', logger.info):
            self.update_transaction_assistance(db_cursor=db_cursor, fiscal_year=fiscal_year, page=page, limit=limit)

    with timer('updating awards to reflect their latest associated transaction info', logger.info):
        update_awards(tuple(award_update_id_list))

    with timer('updating contract-specific awards to reflect their latest transaction info', logger.info):
        update_contract_awards(tuple(award_contract_update_id_list))

    with timer('updating award category variables', logger.info):
        update_award_categories(tuple(award_update_id_list))

    # Done!
    logger.info('FINISHED')


def handle(self, *args, **options):
    processing_start_datetime = datetime.now(timezone.utc)

    logger.info("Starting FABS data load script...")

    # "Reload all" supersedes all other processing options.
    reload_all = options["reload_all"]
    if reload_all:
        afa_ids = None
        start_datetime = None
        end_datetime = None
    else:
        afa_ids = read_afa_ids_from_file(options["afa_id_file"]) if options["afa_id_file"] else None
        start_datetime = options["start_datetime"]
        end_datetime = options["end_datetime"]

    # If no other processing options were provided, then this is an incremental load.
    is_incremental_load = not any((reload_all, afa_ids, start_datetime, end_datetime))

    if is_incremental_load:
        start_datetime = get_last_load_date()
        logger.info("Processing data for FABS starting from %s" % start_datetime)

    with timer("obtaining delete records", logger.info):
        delete_records = retrieve_deleted_fabs_transactions(start_datetime, end_datetime)
        ids_to_delete = [item for sublist in delete_records.values() for item in sublist if item]

    with timer("retrieving/diff-ing FABS Data", logger.info):
        ids_to_upsert = get_fabs_transaction_ids(afa_ids, start_datetime, end_datetime)

    update_award_ids = delete_fabs_transactions(ids_to_delete)
    upsert_fabs_transactions(ids_to_upsert, update_award_ids)

    if is_incremental_load:
        update_last_load_date("fabs", processing_start_datetime)

    logger.info("FABS UPDATE FINISHED!")


def handle(self, *args, **options):
    logger.info('Starting updates to award data...')

    all_records_flag = options.get('all')
    fiscal_year = options.get('fiscal_year')

    award_update_id_list = []
    award_contract_update_id_list = []

    if not all_records_flag:
        if fiscal_year:
            fiscal_year = fiscal_year[0]
            logger.info('Processing data for Fiscal Year ' + str(fiscal_year))
        else:
            fiscal_year = 2017

        # Lists to store for update_awards and update_contract_awards
        award_update_id_list = TransactionNormalized.objects.filter(
            action_date__fy=fiscal_year).values_list('award_id', flat=True)
        award_contract_update_id_list = TransactionFPDS.objects.filter(
            action_date__fy=fiscal_year).values_list('transaction__award_id', flat=True)

    with timer('updating awards to reflect their latest associated transaction info', logger.info):
        update_awards() if all_records_flag else update_awards(tuple(award_update_id_list))

    with timer('updating contract-specific awards to reflect their latest transaction info...', logger.info):
        if all_records_flag:
            update_contract_awards()
        else:
            update_contract_awards(tuple(award_contract_update_id_list))

    with timer('updating award category variables', logger.info):
        update_award_categories() if all_records_flag else update_award_categories(tuple(award_update_id_list))

    # Done!
    logger.info('FINISHED')


def delete_fabs_transactions(ids_to_delete):
    """ids_to_delete are published_award_financial_assistance_ids"""
    if ids_to_delete:
        with timer(f"deleting {len(ids_to_delete)} stale FABS data", logger.info):
            update_award_ids = delete_stale_fabs(ids_to_delete)
    else:
        update_award_ids = []
        logger.info("Nothing to delete...")

    return update_award_ids


def delete_fabs_transactions(ids_to_delete):
    """ids_to_delete are afa_generated_unique ids"""
    if ids_to_delete:
        with timer(f"deleting {len(ids_to_delete)} stale FABS data", logger.info):
            update_award_ids = delete_stale_fabs(ids_to_delete)
    else:
        update_award_ids = []
        logger.info("Nothing to delete...")

    return update_award_ids


def upsert_fabs_transactions(ids_to_upsert, externally_updated_award_ids):
    if ids_to_upsert or externally_updated_award_ids:
        update_award_ids = copy(externally_updated_award_ids)
        if ids_to_upsert:
            with timer("inserting new FABS data", logger.info):
                update_award_ids.extend(insert_all_new_fabs(ids_to_upsert))

        if update_award_ids:
            update_award_ids = tuple(set(update_award_ids))  # Convert to tuple and remove duplicates.
            with timer("updating awards to reflect their latest associated transaction info", logger.info):
                update_awards(update_award_ids)

            with timer("updating award category variables", logger.info):
                update_award_categories(update_award_ids)

        with timer("updating C->D linkages", logger.info):
            update_c_to_d_linkages("assistance")
    else:
        logger.info("Nothing to insert...")


def handle(self, *args, **options):
    logger.info("Starting historical data load...")

    db_cursor = connections["data_broker"].cursor()
    fiscal_year = options.get("fiscal_year")
    page = options.get("page")
    limit = options.get("limit")
    save = options.get("save")

    if fiscal_year:
        fiscal_year = fiscal_year[0]
        logger.info("Processing data for Fiscal Year " + str(fiscal_year))
    else:
        fiscal_year = 2017

    page = page[0] if page else 1
    limit = limit[0] if limit else 500000

    if not options["assistance"]:
        with timer("D1 historical data location insert", logger.info):
            self.update_location_transaction_contract(
                db_cursor=db_cursor, fiscal_year=fiscal_year, page=page, limit=limit, save=save)

    if not options["contracts"]:
        with timer("D2 historical data location insert", logger.info):
            self.update_location_transaction_assistance(
                db_cursor=db_cursor, fiscal_year=fiscal_year, page=page, limit=limit, save=save)

    logger.info("FINISHED")


def delete_fabs_transactions(ids_to_delete, do_not_log_deletions):
    """ids_to_delete are afa_generated_unique ids"""
    if ids_to_delete:
        if do_not_log_deletions is False:
            store_deleted_fabs(ids_to_delete)
        with timer("deleting stale FABS data", logger.info):
            update_award_ids = delete_stale_fabs(ids_to_delete)
    else:
        update_award_ids = []
        logger.info("Nothing to delete...")

    return update_award_ids


def handle(self, *args, **options):
    """
    Updates the column ussgl498100_upward_adjust_pri_deliv_orders_oblig_unpaid_cpe
    due to the incorrect mapping in settings.py
    """
    ds_cursor = connection.cursor()
    logger.info("Begin updating file B and C")

    broker_cols_b = self.get_list_of_broker_cols(financial_accounts_oc)
    broker_cols_type_b = self.get_list_of_broker_cols_types(financial_accounts_oc)
    website_cols_b = self.get_website_row_formatted(financial_accounts_oc)
    website_update_text_b = self.get_cols_to_update(financial_accounts_oc)
    website_cols_joins_b = self.get_file_table_joins(financial_accounts_oc)

    broker_cols_c = self.get_list_of_broker_cols(financial_accounts_awards)
    broker_cols_type_c = self.get_list_of_broker_cols_types(financial_accounts_awards)
    website_cols_c = self.get_website_row_formatted(financial_accounts_awards)
    website_update_text_c = self.get_cols_to_update(financial_accounts_awards)
    website_cols_joins_c = self.get_file_table_joins(financial_accounts_awards)

    with timer("getting submission ids to update", logger.info):
        submissions_to_update = self.get_list_of_submissions()

    for submission in submissions_to_update:
        submission_id = submission[0]

        # File B updates
        logger.info("loading rows data to update File B submission {}".format(submission_id))

        with timer("retrieving rows to update for File B submission {}".format(submission_id), logger.info):
            get_rows_to_update_query = self.get_rows_to_update(
                "B", submission_id, broker_cols_b, broker_cols_type_b, website_cols_b)
            ds_cursor.execute(get_rows_to_update_query)

        with timer("updating rows for File B submission {}".format(submission_id), logger.info):
            update_rows = self.update_website_rows(
                "financial_accounts_by_program_activity_object_class",
                "file_b_rows_to_update",
                website_update_text_b,
                website_cols_joins_b,
            )
            ds_cursor.execute(update_rows)

        # File C updates
        with timer("retrieving rows to update for File C submission {}".format(submission_id), logger.info):
            get_rows_to_update_query = self.get_rows_to_update(
                "C", submission_id, broker_cols_c, broker_cols_type_c, website_cols_c)
            ds_cursor.execute(get_rows_to_update_query)

        with timer("updating rows for File C submission {}".format(submission_id), logger.info):
            update_rows = self.update_website_rows(
                "financial_accounts_by_awards", "file_c_rows_to_update", website_update_text_c, website_cols_joins_c)
            ds_cursor.execute(update_rows)

        ds_cursor.execute("DROP TABLE file_b_rows_to_update")
        ds_cursor.execute("DROP TABLE file_c_rows_to_update")

    logger.info("Done updating file B and C mappings")