def perform_load(self, ids_to_delete, ids_to_insert):
    """Drive one FPDS load cycle: purge stale records, insert new ones, then refresh award rollups.

    Args:
        ids_to_delete: ids of stale FPDS transactions to remove.
        ids_to_insert: ids of new/modified FPDS transactions to load.
    """
    # Deletion phase — runs (or is skipped with a log line) before any inserts.
    if len(ids_to_delete) == 0:
        logger.info("No FPDS records to delete at this juncture")
    else:
        with timer("deletion of all stale FPDS data", logger.info):
            self.delete_stale_fpds(ids_to_delete=ids_to_delete)

    # Insertion phase — nothing below applies when there is nothing to load.
    if len(ids_to_insert) == 0:
        logger.info("No FPDS records to insert or modify at this juncture")
        return

    with timer("insertion of new FPDS data in batches", logger.info):
        self.insert_all_new_fpds(ids_to_insert)

    # Roll the transaction-level changes up to the award level.
    # NOTE(review): AWARD_UPDATE_ID_LIST is re-snapshotted with tuple() before each
    # call, matching the original — presumably the updaters may grow the list.
    with timer("updating awards to reflect their latest associated transaction info", logger.info):
        update_awards(tuple(AWARD_UPDATE_ID_LIST))
    with timer("updating contract-specific awards to reflect their latest transaction info", logger.info):
        update_contract_awards(tuple(AWARD_UPDATE_ID_LIST))
    with timer("updating award category variables", logger.info):
        update_award_categories(tuple(AWARD_UPDATE_ID_LIST))

    # Repair any missing File C -> D1 (contract) linkages.
    with timer("updating C->D linkages", logger.info):
        update_c_to_d_linkages("contract")
def upsert_fabs_transactions(ids_to_upsert, externally_updated_award_ids):
    """Insert new/updated FABS transactions and refresh the awards they affect.

    Args:
        ids_to_upsert: FABS transaction ids to insert/update.
        externally_updated_award_ids: award ids already known to need a refresh
            (e.g. touched by deletes upstream); combined with the ids produced
            by the inserts below.
    """
    if ids_to_upsert or externally_updated_award_ids:
        # Copy so the extend() below does not mutate the caller's list.
        update_award_ids = copy(externally_updated_award_ids)

        if ids_to_upsert:
            with timer("inserting new FABS data", logger.info):
                # insert_all_new_fabs returns ids that are folded into the award-update set.
                update_award_ids.extend(insert_all_new_fabs(ids_to_upsert))

        if update_award_ids:
            update_award_ids = tuple(set(update_award_ids))  # Convert to tuple and remove duplicates.

            with timer("updating awards to reflect their latest associated transaction info", logger.info):
                award_record_count = update_awards(update_award_ids)
                logger.info("{} awards updated from their transactional data".format(award_record_count))

            with timer("updating awards with executive compensation data", logger.info):
                award_record_count = update_assistance_awards(update_award_ids)
                logger.info("{} awards updated FABS-specific and exec comp data".format(award_record_count))

        # NOTE(review): linkage repair runs whenever any ids were supplied at all,
        # even if no awards ended up needing updates — confirm this is intended.
        with timer("updating C->D linkages", logger.info):
            update_c_to_d_linkages("assistance")
    else:
        logger.info("Nothing to insert...")
def handle(self, *args, **options):
    """Management-command entry point for the nightly FPDS load.

    Determines the start date (CLI arg, stored ExternalDataLoadDate, or
    yesterday), deletes stale records, loads new/modified records, refreshes
    award rollups and C->D linkages, then records this run's start date.
    """
    logger.info("==== Starting FPDS nightly data load ====")

    if options.get("date"):
        date = options.get("date")[0]
        date = datetime.strptime(date, "%Y-%m-%d").date()
    else:
        data_load_date_obj = ExternalDataLoadDate.objects.filter(
            external_data_type_id=lookups.EXTERNAL_DATA_TYPE_DICT["fpds"]
        ).first()
        if not data_load_date_obj:
            # No prior run recorded; default to yesterday (UTC) as a string.
            # NOTE(review): `date` is a datetime.date on the CLI path but a
            # string here and whatever last_load_date stores below — the
            # downstream getters apparently accept all forms; confirm.
            date = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d")
        else:
            date = data_load_date_obj.last_load_date
    # Captured BEFORE processing so the next run re-covers anything that
    # changes while this load is in flight.
    start_date = datetime.utcnow().strftime("%Y-%m-%d")

    logger.info("Processing data for FPDS starting from %s" % date)

    with timer("retrieval of deleted FPDS IDs", logger.info):
        ids_to_delete = self.get_deleted_fpds_data_from_s3(date=date)

    if len(ids_to_delete) > 0:
        with timer("deletion of all stale FPDS data", logger.info):
            self.delete_stale_fpds(ids_to_delete=ids_to_delete)
    else:
        logger.info("No FPDS records to delete at this juncture")

    with timer("retrieval of new/modified FPDS data ID list", logger.info):
        total_insert = self.get_fpds_transaction_ids(date=date)

    if len(total_insert) > 0:
        # Add FPDS records
        with timer("insertion of new FPDS data in batches", logger.info):
            self.insert_all_new_fpds(total_insert)

        # Update Awards based on changed FPDS records
        with timer("updating awards to reflect their latest associated transaction info", logger.info):
            update_awards(tuple(AWARD_UPDATE_ID_LIST))

        # Update FPDS-specific Awards based on the info in child transactions
        with timer("updating contract-specific awards to reflect their latest transaction info", logger.info):
            update_contract_awards(tuple(AWARD_UPDATE_ID_LIST))

        # Update AwardCategories based on changed FPDS records
        with timer("updating award category variables", logger.info):
            update_award_categories(tuple(AWARD_UPDATE_ID_LIST))

        # Check the linkages from file C to FPDS records and update any that are missing
        with timer("updating C->D linkages", logger.info):
            update_c_to_d_linkages("contract")
    else:
        logger.info("No FPDS records to insert or modify at this juncture")

    # Update the date for the last time the data load was run
    # (delete-then-insert keeps exactly one row per external data type).
    ExternalDataLoadDate.objects.filter(external_data_type_id=lookups.EXTERNAL_DATA_TYPE_DICT["fpds"]).delete()
    ExternalDataLoadDate(
        last_load_date=start_date, external_data_type_id=lookups.EXTERNAL_DATA_TYPE_DICT["fpds"]
    ).save()

    logger.info("FPDS NIGHTLY UPDATE COMPLETE")
def handle(self, *args, **options):
    """Management-command entry point for the nightly FABS load.

    Diffs FABS data since the last load date, pushes deletion ids to S3 (for
    Elasticsearch), deletes stale rows, inserts new rows, refreshes award
    rollups and C->D linkages, then records this run's start date.
    """
    logger.info('Starting FABS nightly data load...')

    # Use date provided or pull most recent ExternalDataLoadDate
    if options.get('date'):
        # NOTE(review): unlike the FPDS command, the CLI date is used as-is
        # (no strptime) — presumably get_fabs_data accepts a string; confirm.
        date = options.get('date')[0]
    else:
        data_load_date_obj = ExternalDataLoadDate.objects. \
            filter(external_data_type_id=lookups.EXTERNAL_DATA_TYPE_DICT['fabs']).first()
        if not data_load_date_obj:
            date = (datetime.utcnow() - timedelta(days=1)).strftime('%Y-%m-%d')
        else:
            date = data_load_date_obj.last_load_date
    # Snapshot taken before processing so the next run overlaps this one.
    start_date = datetime.utcnow().strftime('%Y-%m-%d')

    logger.info('Processing data for FABS starting from %s' % date)

    # Retrieve FABS data
    with timer('retrieving/diff-ing FABS Data', logger.info):
        to_insert, ids_to_delete = self.get_fabs_data(date=date)
    total_rows = len(to_insert)
    total_rows_delete = len(ids_to_delete)

    if total_rows_delete > 0:
        # Create a file with the deletion IDs and place in a bucket for ElasticSearch
        self.send_deletes_to_s3(ids_to_delete)

        # Delete FABS records by ID
        with timer('deleting stale FABS data', logger.info):
            self.delete_stale_fabs(ids_to_delete=ids_to_delete)
    else:
        logger.info('Nothing to delete...')

    if total_rows > 0:
        # Add FABS records
        with timer('inserting new FABS data', logger.info):
            self.insert_new_fabs(to_insert=to_insert, total_rows=total_rows)

        # Update Awards based on changed FABS records
        with timer('updating awards to reflect their latest associated transaction info', logger.info):
            update_awards(tuple(award_update_id_list))

        # Update AwardCategories based on changed FABS records
        with timer('updating award category variables', logger.info):
            update_award_categories(tuple(award_update_id_list))

        # Check the linkages from file C to FABS records and update any that are missing
        with timer('updating C->D linkages', logger.info):
            update_c_to_d_linkages('assistance')
    else:
        logger.info('Nothing to insert...')

    # Update the date for the last time the data load was run
    ExternalDataLoadDate.objects.filter(external_data_type_id=lookups.EXTERNAL_DATA_TYPE_DICT['fabs']).delete()
    ExternalDataLoadDate(last_load_date=start_date,
                         external_data_type_id=lookups.EXTERNAL_DATA_TYPE_DICT['fabs']).save()

    logger.info('FABS NIGHTLY UPDATE FINISHED!')
def handle(self, *args, **options):
    """Management-command entry point for the nightly FPDS load (diff-based variant).

    Diffs FPDS data since the last load date, deletes stale rows, inserts new
    rows, refreshes award rollups and C->D linkages, then records this run's
    start date in ExternalDataLoadDate.
    """
    logger.info('Starting FPDS nightly data load...')

    if options.get('date'):
        date = options.get('date')[0]
        date = datetime.strptime(date, '%Y-%m-%d').date()
    else:
        data_load_date_obj = ExternalDataLoadDate.objects. \
            filter(external_data_type_id=lookups.EXTERNAL_DATA_TYPE_DICT['fpds']).first()
        if not data_load_date_obj:
            # No prior run on record: default to yesterday (UTC).
            date = (datetime.utcnow() - timedelta(days=1)).strftime('%Y-%m-%d')
        else:
            date = data_load_date_obj.last_load_date
    # Taken before processing so the next run re-covers in-flight changes.
    start_date = datetime.utcnow().strftime('%Y-%m-%d')

    logger.info('Processing data for FPDS starting from %s' % date)

    with timer('retrieving/diff-ing FPDS Data', logger.info):
        to_insert, ids_to_delete = self.get_fpds_data(date=date)
    total_rows = len(to_insert)
    total_rows_delete = len(ids_to_delete)

    if total_rows_delete > 0:
        with timer('deleting stale FPDS data', logger.info):
            self.delete_stale_fpds(ids_to_delete=ids_to_delete)
    else:
        logger.info('Nothing to delete...')

    if total_rows > 0:
        # Add FPDS records
        with timer('inserting new FPDS data', logger.info):
            self.insert_new_fpds(to_insert=to_insert, total_rows=total_rows)

        # Update Awards based on changed FPDS records
        with timer('updating awards to reflect their latest associated transaction info', logger.info):
            update_awards(tuple(award_update_id_list))

        # Update FPDS-specific Awards based on the info in child transactions
        with timer('updating contract-specific awards to reflect their latest transaction info', logger.info):
            update_contract_awards(tuple(award_update_id_list))

        # Update AwardCategories based on changed FPDS records
        with timer('updating award category variables', logger.info):
            update_award_categories(tuple(award_update_id_list))

        # Check the linkages from file C to FPDS records and update any that are missing
        with timer('updating C->D linkages', logger.info):
            update_c_to_d_linkages('contract')
    else:
        logger.info('Nothing to insert...')

    # Update the date for the last time the data load was run
    # (delete-then-insert keeps a single row for the 'fpds' type).
    ExternalDataLoadDate.objects.filter(external_data_type_id=lookups.EXTERNAL_DATA_TYPE_DICT['fpds']).delete()
    ExternalDataLoadDate(last_load_date=start_date,
                         external_data_type_id=lookups.EXTERNAL_DATA_TYPE_DICT['fpds']).save()

    logger.info('FPDS NIGHTLY UPDATE FINISHED!')
def handle(self, *args, **options):
    """Management-command entry point for the nightly FPDS load (last-load-date helper variant).

    Uses get_last_load_date/update_last_load_date instead of raw
    ExternalDataLoadDate queries, and timezone-aware datetimes throughout.
    """
    logger.info("==== Starting FPDS nightly data load ====")

    if options.get("date"):
        date = options.get("date")[0]
        date = datetime.strptime(date, "%Y-%m-%d").date()
    else:
        # Fall back to yesterday (UTC) when no last-load timestamp is stored.
        default_last_load_date = datetime.now(timezone.utc) - timedelta(days=1)
        date = get_last_load_date("fpds", default=default_last_load_date).date()
    # Captured before processing starts so the next run's window overlaps
    # anything that changes while this load is running.
    processing_start_datetime = datetime.now(timezone.utc)

    logger.info("Processing data for FPDS starting from %s" % date)

    with timer("retrieval of deleted FPDS IDs", logger.info):
        ids_to_delete = self.get_deleted_fpds_data_from_s3(date=date)

    if len(ids_to_delete) > 0:
        with timer("deletion of all stale FPDS data", logger.info):
            self.delete_stale_fpds(ids_to_delete=ids_to_delete)
    else:
        logger.info("No FPDS records to delete at this juncture")

    with timer("retrieval of new/modified FPDS data ID list", logger.info):
        total_insert = self.get_fpds_transaction_ids(date=date)

    if len(total_insert) > 0:
        # Add FPDS records
        with timer("insertion of new FPDS data in batches", logger.info):
            self.insert_all_new_fpds(total_insert)

        # Update Awards based on changed FPDS records
        with timer("updating awards to reflect their latest associated transaction info", logger.info):
            update_awards(tuple(AWARD_UPDATE_ID_LIST))

        # Update FPDS-specific Awards based on the info in child transactions
        with timer("updating contract-specific awards to reflect their latest transaction info", logger.info):
            update_contract_awards(tuple(AWARD_UPDATE_ID_LIST))

        # Update AwardCategories based on changed FPDS records
        with timer("updating award category variables", logger.info):
            update_award_categories(tuple(AWARD_UPDATE_ID_LIST))

        # Check the linkages from file C to FPDS records and update any that are missing
        with timer("updating C->D linkages", logger.info):
            update_c_to_d_linkages("contract")
    else:
        logger.info("No FPDS records to insert or modify at this juncture")

    # Update the date for the last time the data load was run
    update_last_load_date("fpds", processing_start_datetime)

    logger.info("FPDS NIGHTLY UPDATE COMPLETE")
def update_award_records(awards, skip_cd_linkage=True):
    """Refresh award records impacted by transaction changes.

    Prunes empty awards, recomputes award rollups and FPDS-specific fields,
    and (optionally) repairs File C -> D contract linkages.

    Args:
        awards: iterable of impacted award ids (may contain duplicates).
        skip_cd_linkage: when False, also run the C->D linkage repair.
    """
    if not awards:
        logger.info("No award records to update")
        return

    distinct_ids = set(awards)
    logger.info(f"{len(distinct_ids)} award records impacted by transaction DML operations")
    logger.info(f"{prune_empty_awards(tuple(distinct_ids))} award records removed")
    logger.info(f"{update_awards(tuple(distinct_ids))} award records updated")
    logger.info(f"{update_procurement_awards(tuple(distinct_ids))} award records updated on FPDS-specific fields")
    if not skip_cd_linkage:
        update_c_to_d_linkages("contract")
def load_file_c(submission_attributes, db_cursor, certified_award_financial):
    """
    Process and load file C broker data.

    Note: this should run AFTER the D1 and D2 files are loaded because we try to join to those records
    to retrieve some additional information about the awarding sub-tier agency.

    Args:
        submission_attributes: submission being loaded (submission_id is used for linkage scoping).
        db_cursor: broker DB cursor used for the bulk TAS lookup.
        certified_award_financial: File C source with .count, .tas_ids, and row access.
    """
    if certified_award_financial.count == 0:
        logger.warning("No File C (award financial) data found, skipping...")
        return

    # this matches the file b reverse directive, but am repeating it here to ensure that we don't
    # overwrite it as we change up the order of file loading
    reverse = re.compile(r"(_(cpe|fyb)$)|^transaction_obligated_amount$")

    # dictionary to capture TAS that were skipped and some metadata
    # tas = top-level key
    # count = number of rows skipped
    # rows = row numbers skipped, corresponding to the original row numbers in the file that was submitted
    skipped_tas = {}
    total_rows = certified_award_financial.count
    start_time = datetime.now()

    bulk_treasury_appropriation_account_tas_lookup(certified_award_financial.tas_ids, db_cursor)
    _save_file_c_rows(certified_award_financial, total_rows, start_time, skipped_tas, submission_attributes, reverse)

    update_c_to_d_linkages("contract", False, submission_attributes.submission_id)
    update_c_to_d_linkages("assistance", False, submission_attributes.submission_id)

    # Iterate items() once instead of re-indexing by key, and compute the grand
    # total with sum() instead of a second manual accumulation loop.
    for tas, info in skipped_tas.items():
        logger.info(f"Skipped {info['count']:,} rows due to missing TAS: {tas}")

    total_tas_skipped = sum(info["count"] for info in skipped_tas.values())
    logger.info(f"Skipped a total of {total_tas_skipped:,} TAS rows for File C")
def upsert_fabs_transactions(ids_to_upsert, externally_updated_award_ids):
    """Insert new FABS transactions and refresh the awards they roll up to.

    Args:
        ids_to_upsert: FABS transaction ids to insert/update.
        externally_updated_award_ids: award ids already known to need a refresh;
            merged with the ids produced by the inserts.
    """
    if not (ids_to_upsert or externally_updated_award_ids):
        logger.info("Nothing to insert...")
        return

    # Work on a copy so the caller's list is never mutated.
    affected_award_ids = copy(externally_updated_award_ids)

    if ids_to_upsert:
        with timer("inserting new FABS data", logger.info):
            affected_award_ids.extend(insert_all_new_fabs(ids_to_upsert))

    if affected_award_ids:
        # Dedupe and freeze the id collection before handing it to the updaters.
        deduped_ids = tuple(set(affected_award_ids))
        with timer("updating awards to reflect their latest associated transaction info", logger.info):
            update_awards(deduped_ids)
        with timer("updating award category variables", logger.info):
            update_award_categories(deduped_ids)

    with timer("updating C->D linkages", logger.info):
        update_c_to_d_linkages("assistance")
def load_file_c(submission_attributes, db_cursor, certified_award_financial):
    """
    Process and load file C broker data.

    Note: this should run AFTER the D1 and D2 files are loaded because we try to join to those records
    to retrieve some additional information about the awarding sub-tier agency.

    Args:
        submission_attributes: submission being loaded (submission_id is used for linkage scoping).
        db_cursor: broker DB cursor used for the bulk TAS lookup.
        certified_award_financial: File C source with .count, .tas_ids, and row access.
    """
    if certified_award_financial.count == 0:
        logger.warning("No File C (award financial) data found, skipping...")
        return

    # this matches the file b reverse directive, but am repeating it here to ensure that we don't
    # overwrite it as we change up the order of file loading
    reverse = re.compile(r"(_(cpe|fyb)$)|^transaction_obligated_amount$")

    skipped_tas = defaultdict(int)  # tracks count of rows skipped due to "missing" TAS
    total_rows = certified_award_financial.count
    start_time = datetime.now()

    bulk_treasury_appropriation_account_tas_lookup(certified_award_financial.tas_ids, db_cursor)
    _save_file_c_rows(certified_award_financial, total_rows, start_time, skipped_tas, submission_attributes, reverse)

    update_c_to_d_linkages("contract", False, submission_attributes.submission_id)
    update_c_to_d_linkages("assistance", False, submission_attributes.submission_id)

    for tas, count in skipped_tas.items():
        logger.info(f"Skipped {count:,} rows due to {tas}")

    # sum() the values directly instead of materializing a throwaway list.
    total_tas_skipped = sum(skipped_tas.values())

    if total_tas_skipped > 0:
        logger.info(f"SKIPPED {total_tas_skipped:,} ROWS of File C (missing TAS)")
    else:
        logger.info("All File C records in Broker loaded into USAspending")
def upsert_fabs_transactions(ids_to_upsert, externally_updated_award_ids):
    """Insert new FABS transactions and update the awards they affect.

    Args:
        ids_to_upsert: FABS transaction ids to insert/update.
        externally_updated_award_ids: award ids already known to need a refresh
            (combined with the ids produced by the inserts below).
    """
    if ids_to_upsert or externally_updated_award_ids:
        # Copy so the extend() below does not mutate the caller's list.
        update_award_ids = copy(externally_updated_award_ids)
        if ids_to_upsert:
            with timer("inserting new FABS data", logger.info):
                update_award_ids.extend(insert_all_new_fabs(ids_to_upsert))
        if update_award_ids:
            update_award_ids = tuple(set(update_award_ids))  # Convert to tuple and remove duplicates.
            with timer("updating awards to reflect their latest associated transaction info", logger.info):
                update_awards(update_award_ids)
            with timer("updating award category variables", logger.info):
                update_award_categories(update_award_ids)
        # NOTE(review): linkage repair runs whenever any ids were supplied,
        # even if no awards ultimately needed updating — confirm intended.
        with timer("updating C->D linkages", logger.info):
            update_c_to_d_linkages("assistance")
    else:
        logger.info("Nothing to insert...")
def handle(self, *args, **options):
    """Management-command entry point for the FABS data load.

    Resolves the load-from date (CLI arg > stored ExternalDataLoadDate >
    yesterday), deletes stale FABS records, inserts new ones, refreshes award
    rollups and C->D linkages, then records this run's start date.
    """
    logger.info("Starting FABS data load script...")

    # Snapshot before any processing so the next run's window overlaps this one.
    start_date = datetime.now(timezone.utc).strftime('%Y-%m-%d')
    fabs_load_db_id = lookups.EXTERNAL_DATA_TYPE_DICT['fabs']
    data_load_date_obj = ExternalDataLoadDate.objects.filter(external_data_type_id=fabs_load_db_id).first()

    if options.get("date"):  # if provided, use cli data
        load_from_date = options.get("date")[0]
    elif data_load_date_obj:  # else if last run is in DB, use that
        load_from_date = data_load_date_obj.last_load_date
    else:  # Default is yesterday at midnight
        load_from_date = (datetime.now(timezone.utc) - timedelta(days=1)).strftime('%Y-%m-%d')

    logger.info('Processing data for FABS starting from %s' % load_from_date)

    with timer('retrieving/diff-ing FABS Data', logger.info):
        upsert_transactions = self.get_fabs_transaction_ids(date=load_from_date)

    with timer("obtaining delete records", logger.info):
        ids_to_delete = self.get_fabs_records_to_delete(date=load_from_date)

    if ids_to_delete:
        # Persist the deleted ids (presumably for downstream consumers) before removing rows.
        self.store_deleted_fabs(ids_to_delete)

        # Delete FABS records by ID
        with timer("deleting stale FABS data", logger.info):
            self.delete_stale_fabs(ids_to_delete=ids_to_delete)
        # Free the (potentially large) id list before the insert phase.
        del ids_to_delete
    else:
        logger.info("Nothing to delete...")

    if upsert_transactions:
        # Add FABS records
        with timer('inserting new FABS data', logger.info):
            self.insert_all_new_fabs(all_new_to_insert=upsert_transactions)

        # Update Awards based on changed FABS records
        with timer('updating awards to reflect their latest associated transaction info', logger.info):
            update_awards(tuple(AWARD_UPDATE_ID_LIST))

        # Update AwardCategories based on changed FABS records
        with timer('updating award category variables', logger.info):
            update_award_categories(tuple(AWARD_UPDATE_ID_LIST))

        # Check the linkages from file C to FABS records and update any that are missing
        with timer('updating C->D linkages', logger.info):
            update_c_to_d_linkages('assistance')
    else:
        logger.info('Nothing to insert...')

    # Update the date for the last time the data load was run
    # (delete-then-insert keeps exactly one row for the 'fabs' type).
    ExternalDataLoadDate.objects.filter(
        external_data_type_id=lookups.EXTERNAL_DATA_TYPE_DICT['fabs']
    ).delete()
    ExternalDataLoadDate(last_load_date=start_date,
                         external_data_type_id=lookups.EXTERNAL_DATA_TYPE_DICT['fabs']).save()

    logger.info('FABS UPDATE FINISHED!')
def handle(self, *args, **options):
    """Rebuild every configured File C -> File D linkage type atomically.

    All linkage types are updated inside one database transaction so a
    failure leaves none of them partially applied.
    """
    with transaction.atomic():
        for linkage in self.LINKAGE_TYPES:
            update_c_to_d_linkages(type=linkage)
def run_sql(self, submission=None):
    """Run the C -> D linkage update for each configured linkage type.

    Args:
        submission: optional submission id to scope the updates; None means
            all submissions (passed straight through as submission_id).
    """
    for linkage in self.LINKAGE_TYPES:
        update_c_to_d_linkages(type=linkage, submission_id=submission)