def load_file_b(submission_attributes, prg_act_obj_cls_data, db_cursor):
    """
    Process and load file B broker data (aka TAS balances by program activity and object class).
    """
    reverse = re.compile(r'(_(cpe|fyb)$)|^transaction_obligated_amount$')

    test_counter = 0
    for row in prg_act_obj_cls_data:
        test_counter += 1
        account_balances = None
        try:
            # Check and see if there is an entry for this TAS
            treasury_account = get_treasury_appropriation_account_tas_lookup(row.get('tas_id'), db_cursor)
            if treasury_account is None:
                raise Exception('Could not find appropriation account for TAS: ' + row['tas'])
        except Exception:
            # TAS lookup failed (including the missing-TAS case raised above); skip this row
            continue

        # get the corresponding account balances row (aka "File A" record)
        account_balances = AppropriationAccountBalances.objects.get(
            treasury_account_identifier=treasury_account,
            submission_id=submission_attributes.submission_id)

        financial_by_prg_act_obj_cls = FinancialAccountsByProgramActivityObjectClass()

        value_map = {
            'submission': submission_attributes,
            'reporting_period_start': submission_attributes.reporting_period_start,
            'reporting_period_end': submission_attributes.reporting_period_end,
            'treasury_account': treasury_account,
            'appropriation_account_balances': account_balances,
            'object_class': get_or_create_object_class(row['object_class'], row['by_direct_reimbursable_fun'], logger),
            'program_activity': get_or_create_program_activity(row, submission_attributes)
        }

        load_data_into_model(financial_by_prg_act_obj_cls, row, value_map=value_map, save=True, reverse=reverse)

    # Insert File B quarterly numbers for this submission
    TasProgramActivityObjectClassQuarterly.insert_quarterly_numbers(submission_attributes.submission_id)

    FinancialAccountsByProgramActivityObjectClass.populate_final_of_fy()
def load_file_b(submission_attributes, prg_act_obj_cls_data, db_cursor):
    """
    Process and load file B broker data (aka TAS balances by program activity and object class).
    """
    reverse = re.compile(r'(_(cpe|fyb)$)|^transaction_obligated_amount$')

    # dictionary to capture TAS that were skipped and some metadata
    # tas = top-level key
    # count = number of rows skipped
    # rows = row numbers skipped, corresponding to the original row numbers in the file that was submitted
    skipped_tas = {}

    test_counter = 0
    for row in prg_act_obj_cls_data:
        test_counter += 1
        account_balances = None
        try:
            # Check and see if there is an entry for this TAS
            treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup(
                row.get('tas_id'), db_cursor)
            if treasury_account is None:
                update_skipped_tas(row, tas_rendering_label, skipped_tas)
                continue
        except Exception:  # TODO: What is this trying to catch, actually?
            continue

        # get the corresponding account balances row (aka "File A" record)
        account_balances = AppropriationAccountBalances.objects.get(
            treasury_account_identifier=treasury_account,
            submission_id=submission_attributes.submission_id
        )

        financial_by_prg_act_obj_cls = FinancialAccountsByProgramActivityObjectClass()

        value_map = {
            'submission': submission_attributes,
            'reporting_period_start': submission_attributes.reporting_period_start,
            'reporting_period_end': submission_attributes.reporting_period_end,
            'treasury_account': treasury_account,
            'appropriation_account_balances': account_balances,
            'object_class': get_or_create_object_class(row['object_class'], row['by_direct_reimbursable_fun'], logger),
            'program_activity': get_or_create_program_activity(row, submission_attributes)
        }

        load_data_into_model(financial_by_prg_act_obj_cls, row, value_map=value_map, save=True, reverse=reverse)

    # Insert File B quarterly numbers for this submission
    TasProgramActivityObjectClassQuarterly.insert_quarterly_numbers(submission_attributes.submission_id)

    FinancialAccountsByProgramActivityObjectClass.populate_final_of_fy()

    for key in skipped_tas:
        logger.info('Skipped %d rows due to missing TAS: %s', skipped_tas[key]['count'], key)

    total_tas_skipped = 0
    for key in skipped_tas:
        total_tas_skipped += skipped_tas[key]['count']

    logger.info('Skipped a total of {} TAS rows for File B'.format(total_tas_skipped))
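# `update_skipped_tas` is defined elsewhere and not shown in this section. A
# minimal hypothetical sketch, consistent with the bookkeeping described in the
# comments above (per-TAS skip count plus the original submitted row numbers,
# keyed by the TAS rendering label) -- the real helper may differ:
def update_skipped_tas(row, tas_rendering_label, skipped_tas):
    # Track, per TAS label, how many rows were skipped and which original
    # row numbers from the submitted file they came from.
    entry = skipped_tas.setdefault(tas_rendering_label, {'count': 0, 'rows': []})
    entry['count'] += 1
    entry['rows'].append(row.get('row_number'))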
def load_file_a(submission_attributes, appropriation_data, db_cursor):
    """
    Process and load file A broker data (aka TAS balances, aka appropriation account balances).
    """
    reverse = re.compile("gross_outlay_amount_by_tas_cpe")

    # dictionary to capture TAS that were skipped and some metadata
    # tas = top-level key
    # count = number of rows skipped
    # rows = row numbers skipped, corresponding to the original row numbers in the file that was submitted
    skipped_tas = {}

    # Create account objects
    for row in appropriation_data:
        # Check and see if there is an entry for this TAS
        treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup(
            row.get("tas_id"), db_cursor)
        if treasury_account is None:
            update_skipped_tas(row, tas_rendering_label, skipped_tas)
            continue

        # Now that we have the account, we can load the appropriation balances
        # TODO: Figure out how we want to determine what row is overridden by what row
        # If we want to correlate, the following attributes are available in the data broker data that might be useful:
        # appropriation_id, row_number
        # appropriation_balances = somethingsomething get appropriation balances...
        appropriation_balances = AppropriationAccountBalances()

        value_map = {
            "treasury_account_identifier": treasury_account,
            "submission": submission_attributes,
            "reporting_period_start": submission_attributes.reporting_period_start,
            "reporting_period_end": submission_attributes.reporting_period_end,
        }

        field_map = {}

        load_data_into_model(appropriation_balances, row, field_map=field_map, value_map=value_map, save=True, reverse=reverse)

    AppropriationAccountBalances.populate_final_of_fy()

    for key in skipped_tas:
        logger.info(f"Skipped {skipped_tas[key]['count']:,} rows due to missing TAS: {key}")

    total_tas_skipped = 0
    for key in skipped_tas:
        total_tas_skipped += skipped_tas[key]["count"]

    logger.info(f"Skipped a total of {total_tas_skipped:,} TAS rows for File A")
def load_file_a(submission_attributes, appropriation_data, db_cursor):
    """
    Process and load file A broker data (aka TAS balances, aka appropriation account balances).
    """
    reverse = re.compile('gross_outlay_amount_by_tas_cpe')

    # dictionary to capture TAS that were skipped and some metadata
    # tas = top-level key
    # count = number of rows skipped
    # rows = row numbers skipped, corresponding to the original row numbers in the file that was submitted
    skipped_tas = {}

    # Create account objects
    for row in appropriation_data:
        # Check and see if there is an entry for this TAS
        treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup(
            row.get('tas_id'), db_cursor)
        if treasury_account is None:
            update_skipped_tas(row, tas_rendering_label, skipped_tas)
            continue

        # Now that we have the account, we can load the appropriation balances
        # TODO: Figure out how we want to determine what row is overridden by what row
        # If we want to correlate, the following attributes are available in the data broker data that might be useful:
        # appropriation_id, row_number
        # appropriation_balances = somethingsomething get appropriation balances...
        appropriation_balances = AppropriationAccountBalances()

        value_map = {
            'treasury_account_identifier': treasury_account,
            'submission': submission_attributes,
            'reporting_period_start': submission_attributes.reporting_period_start,
            'reporting_period_end': submission_attributes.reporting_period_end
        }

        field_map = {}

        load_data_into_model(appropriation_balances, row, field_map=field_map, value_map=value_map, save=True, reverse=reverse)

    AppropriationAccountBalances.populate_final_of_fy()

    # Insert File A quarterly numbers for this submission
    AppropriationAccountBalancesQuarterly.insert_quarterly_numbers(submission_attributes.submission_id)

    for key in skipped_tas:
        logger.info('Skipped %d rows due to missing TAS: %s', skipped_tas[key]['count'], key)

    total_tas_skipped = 0
    for key in skipped_tas:
        total_tas_skipped += skipped_tas[key]['count']

    logger.info('Skipped a total of {} TAS rows for File A'.format(total_tas_skipped))
def handle(self, *args, **options):
    state_data_field_map = {
        'fips': 'FIPS',
        'code': 'Code',
        'name': 'Name',
        'type': 'Type',
        'year': 'Year',
        'population': 'Population',
        'pop_source': 'Population Source',
        'median_household_income': 'Median Household Income',
        'mhi_source': 'Median Household Income Source'
    }

    csv_file = options['file']
    remote = False
    if csv_file:
        if not os.path.exists(csv_file):
            # the provided path points at nothing, so FileNotFoundError is the accurate exception
            raise FileNotFoundError(csv_file)
        elif os.path.splitext(csv_file)[1] != '.csv':
            raise Exception('Wrong filetype provided, expecting csv')
        file_path = csv_file
    elif not settings.IS_LOCAL and os.environ.get('USASPENDING_AWS_REGION') and os.environ.get('STATE_DATA_BUCKET'):
        s3connection = boto.s3.connect_to_region(os.environ.get('USASPENDING_AWS_REGION'))
        s3bucket = s3connection.lookup(os.environ.get('STATE_DATA_BUCKET'))
        key = s3bucket.get_key(LOCAL_STATE_DATA_FILENAME)
        file_path = os.path.join('/', 'tmp', LOCAL_STATE_DATA_FILENAME)
        key.get_contents_to_filename(file_path)
        remote = True
    else:
        file_path = LOCAL_STATE_DATA

    with open(file_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Defaulting to None's instead of ''s
            row = {key: (value or None) for key, value in row.items()}
            load_data_into_model(
                StateData(),
                row,
                field_map=state_data_field_map,
                save=True
            )

    if remote:
        os.remove(file_path)

    logger.info('Loading StateData complete')
def load_file_a(submission_attributes, appropriation_data, db_cursor):
    """
    Process and load file A broker data (aka TAS balances, aka appropriation account balances).
    """
    reverse = re.compile('gross_outlay_amount_by_tas_cpe')

    # Create account objects
    for row in appropriation_data:
        # Check and see if there is an entry for this TAS
        treasury_account = get_treasury_appropriation_account_tas_lookup(row.get('tas_id'), db_cursor)
        if treasury_account is None:
            raise Exception('Could not find appropriation account for TAS: ' + row['tas'])

        # Now that we have the account, we can load the appropriation balances
        # TODO: Figure out how we want to determine what row is overridden by what row
        # If we want to correlate, the following attributes are available in the
        # data broker data that might be useful: appropriation_id, row_number
        # appropriation_balances = somethingsomething get appropriation balances...
        appropriation_balances = AppropriationAccountBalances()

        value_map = {
            'treasury_account_identifier': treasury_account,
            'submission': submission_attributes,
            'reporting_period_start': submission_attributes.reporting_period_start,
            'reporting_period_end': submission_attributes.reporting_period_end
        }

        field_map = {}

        load_data_into_model(appropriation_balances, row, field_map=field_map, value_map=value_map, save=True, reverse=reverse)

    AppropriationAccountBalances.populate_final_of_fy()

    # Insert File A quarterly numbers for this submission
    AppropriationAccountBalancesQuarterly.insert_quarterly_numbers(submission_attributes.submission_id)
def handle(self, *args, **options):
    state_data_field_map = {
        "fips": "FIPS",
        "code": "Code",
        "name": "Name",
        "type": "Type",
        "year": "Year",
        "population": "Population",
        "pop_source": "Population Source",
        "median_household_income": "Median Household Income",
        "mhi_source": "Median Household Income Source",
    }

    csv_file = options["file"]
    remote = False
    if csv_file:
        if not os.path.exists(csv_file):
            # the provided path points at nothing, so FileNotFoundError is the accurate exception
            raise FileNotFoundError(csv_file)
        elif os.path.splitext(csv_file)[1] != ".csv":
            raise Exception("Wrong filetype provided, expecting csv")
        file_path = csv_file
    elif not settings.IS_LOCAL and settings.USASPENDING_AWS_REGION and settings.STATE_DATA_BUCKET:
        s3connection = boto3.resource("s3", region_name=settings.USASPENDING_AWS_REGION)
        s3bucket = s3connection.Bucket(settings.STATE_DATA_BUCKET)
        file_path = os.path.join("/", "tmp", LOCAL_STATE_DATA_FILENAME)
        s3bucket.download_file(LOCAL_STATE_DATA_FILENAME, file_path)
        remote = True
    else:
        file_path = LOCAL_STATE_DATA

    with open(file_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Defaulting to None's instead of ''s
            row = {key: (value or None) for key, value in row.items()}
            load_data_into_model(StateData(), row, field_map=state_data_field_map, save=True)

    if remote:
        os.remove(file_path)

    logger.info("Loading StateData complete")
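# For reference, `state_data_field_map` above implies a CSV whose header row
# uses the human-readable column names on the right-hand side of the map. The
# data row below is purely illustrative, not taken from the real file:
#
#   FIPS,Code,Name,Type,Year,Population,Population Source,Median Household Income,Median Household Income Source
#   01,AL,Alabama,State,2018,4887871,U.S. Census Bureau,49861,American Community Survey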
def update_duns(self, update_duns, update_date):
    logger.info("Updating {} duns records".format(len(update_duns)))
    for row in update_duns:
        # raises IndexError if no matching record exists
        equivalent_duns = DUNS.objects.filter(broker_duns_id=row["duns_id"])[0]
        load_data_into_model(
            equivalent_duns,
            row,
            field_map={
                "awardee_or_recipient_uniqu": "awardee_or_recipient_uniqu",
                "legal_business_name": "legal_business_name",
                "ultimate_parent_unique_ide": "ultimate_parent_unique_ide",
                "ultimate_parent_legal_enti": "ultimate_parent_legal_enti",
                "broker_duns_id": "duns_id",
            },
            value_map={"update_date": update_date},
            as_dict=False,
            save=True,
        )
def get_submission_attributes(submission_id, submission_data):
    """
    For a specified broker submission, return the existing corresponding usaspending
    submission record or create and return a new one.
    """
    dabs_window = DABSSubmissionWindowSchedule.objects.filter(
        submission_fiscal_year=submission_data["reporting_fiscal_year"],
        submission_fiscal_month=submission_data["reporting_fiscal_period"],
        is_quarter=submission_data["is_quarter_format"],
    ).first()

    if not dabs_window:
        raise RuntimeError(f"Missing DABS Window record necessary for {submission_id}")

    # check if we already have an entry for this submission id; if not, create one
    submission_attributes, created = SubmissionAttributes.objects.get_or_create(
        submission_id=submission_id, defaults={"submission_window": dabs_window}
    )

    if created:
        # this is the first time we're loading this submission
        logger.info(f"Creating submission {submission_id}")
    else:
        # we've already loaded this submission, so delete it before reloading
        logger.info(f"Submission {submission_id} already exists. It will be deleted.")
        call_command("rm_submission", submission_id)

    submission_data["reporting_agency_name"] = retrive_agency_name_from_code(submission_data["toptier_code"])

    # Update and save submission attributes
    field_map = {
        "reporting_period_start": "reporting_start_date",
        "reporting_period_end": "reporting_end_date",
        "quarter_format_flag": "is_quarter_format",
    }

    # Create our value map - specific data to load
    value_map = {"reporting_fiscal_quarter": get_fiscal_quarter(submission_data["reporting_fiscal_period"])}

    new_submission = load_data_into_model(
        submission_attributes, submission_data, field_map=field_map, value_map=value_map, save=True)

    return new_submission
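# `get_fiscal_quarter` is imported from shared helpers. A minimal sketch of the
# mapping it presumably performs (fiscal periods 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3,
# 10-12 -> Q4); the real helper may validate its input differently:
def get_fiscal_quarter(fiscal_period):
    # ceil(period / 3): maps period 3 -> 1, 6 -> 2, 9 -> 3, 12 -> 4
    if fiscal_period is None:
        return None
    return (int(fiscal_period) + 2) // 3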
def get_submission_attributes(broker_submission_id, submission_data):
    """
    For a specified broker submission, return the existing corresponding usaspending
    submission record or create and return a new one.
    """
    # check if we already have an entry for this broker submission id; if not, create one
    submission_attributes, created = SubmissionAttributes.objects.get_or_create(
        broker_submission_id=broker_submission_id)

    if created:
        # this is the first time we're loading this broker submission
        logger.info('Creating broker submission id {}'.format(broker_submission_id))
    else:
        # we've already loaded this broker submission, so delete it before reloading. If there's another submission
        # that references this one as a "previous submission", do not proceed.
        # TODO: now that we're chaining submissions together, get clarification on what should happen when a
        # submission in the middle of the chain is deleted
        TasProgramActivityObjectClassQuarterly.refresh_downstream_quarterly_numbers(
            submission_attributes.submission_id)
        logger.info('Broker submission id {} already exists. It will be deleted.'.format(broker_submission_id))
        call_command('rm_submission', broker_submission_id)

    logger.info("Merging CGAC and FREC columns")
    submission_data["cgac_code"] = \
        submission_data["cgac_code"] if submission_data["cgac_code"] else submission_data["frec_code"]

    # Find the previous submission for this CGAC and fiscal year (if there is one)
    previous_submission = get_previous_submission(
        submission_data['cgac_code'],
        submission_data['reporting_fiscal_year'],
        submission_data['reporting_fiscal_period'])

    # Update and save submission attributes
    field_map = {
        'reporting_period_start': 'reporting_start_date',
        'reporting_period_end': 'reporting_end_date',
        'quarter_format_flag': 'is_quarter_format',
    }

    # Create our value map - specific data to load
    value_map = {
        'broker_submission_id': broker_submission_id,
        'reporting_fiscal_quarter': get_fiscal_quarter(submission_data['reporting_fiscal_period']),
        'previous_submission': previous_submission,
        # pull in broker's last update date to use as certified date
        'certified_date': submission_data['updated_at'].date() if type(
            submission_data['updated_at']) == datetime else None,
    }

    return load_data_into_model(
        submission_attributes, submission_data, field_map=field_map, value_map=value_map, save=True)
def update_duns(self, update_duns, update_date):
    logger.info('Updating {} duns records'.format(len(update_duns)))
    for row in update_duns:
        equivalent_duns = DUNS.objects.filter(broker_duns_id=row['duns_id'])[0]
        load_data_into_model(
            equivalent_duns,
            row,
            field_map={
                'awardee_or_recipient_uniqu': 'awardee_or_recipient_uniqu',
                'legal_business_name': 'legal_business_name',
                'ultimate_parent_unique_ide': 'ultimate_parent_unique_ide',
                'ultimate_parent_legal_enti': 'ultimate_parent_legal_enti',
                'broker_duns_id': 'duns_id'
            },
            value_map={'update_date': update_date},
            as_dict=False,
            save=True)
def load_transaction_normalized(self, fabs_broker_data, total_rows):
    start_time = datetime.now()
    for index, row in enumerate(fabs_broker_data, 1):
        if not (index % 10000):
            logger.info('Transaction Normalized: Loading row {} of {} ({})'.format(
                str(index), str(total_rows), datetime.now() - start_time))

        parent_txn_value_map = {
            "award": award_lookup[index - 1],
            "awarding_agency": awarding_agency_list[index - 1],
            "funding_agency": funding_agency_list[index - 1],
            "recipient": legal_entity_lookup[index - 1],
            "place_of_performance": pop_bulk[index - 1],
            "period_of_performance_start_date": format_date(row['period_of_performance_star']),
            "period_of_performance_current_end_date": format_date(row['period_of_performance_curr']),
            "action_date": format_date(row['action_date']),
            "last_modified_date": row['modified_at']
        }

        fad_field_map = {
            "type": "assistance_type",
            "description": "award_description",
        }

        transaction_normalized = load_data_into_model(
            TransactionNormalized(),
            row,
            field_map=fad_field_map,
            value_map=parent_txn_value_map,
            as_dict=False,
            save=False)

        transaction_normalized.fiscal_year = fy(transaction_normalized.action_date)
        transaction_normalized_bulk.append(transaction_normalized)

    logger.info('Bulk creating Transaction Normalized (batch_size: {})...'.format(BATCH_SIZE))
    TransactionNormalized.objects.bulk_create(transaction_normalized_bulk, batch_size=BATCH_SIZE)
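# `fy` is imported from shared helpers. The federal fiscal year runs Oct 1 -
# Sep 30, so a date in Oct-Dec belongs to the next calendar year's FY. A
# minimal sketch of that rule (the real helper may accept more input types):
def fy(raw_date):
    # October (month 10) or later rolls into the next fiscal year
    if raw_date is None:
        return None
    return raw_date.year + 1 if raw_date.month > 9 else raw_date.year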
def _save_file_c_rows(certified_award_financial, total_rows, start_time, skipped_tas, submission_attributes, reverse):
    save_manager = BulkCreateManager(FinancialAccountsByAwards)

    for index, row in enumerate(certified_award_financial, 1):
        if not (index % 1000):
            logger.info(f"C File Load: Loading row {index:,} of {total_rows:,} ({datetime.now() - start_time})")

        upper_case_dict_values(row)

        # Check and see if there is an entry for this TAS
        treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup(row.get("tas_id"))
        if treasury_account is None:
            update_skipped_tas(row, tas_rendering_label, skipped_tas)
            continue

        # Find a matching transaction record, so we can use its subtier agency information to match to (or create) an
        # Award record.

        # Find the award that this award transaction belongs to. If it doesn't exist, create it.
        filters = {}
        if row.get("piid"):
            filters["piid"] = row.get("piid")
            filters["parent_piid"] = row.get("parent_award_id")
        else:
            if row.get("fain") and not row.get("uri"):
                filters["fain"] = row.get("fain")
            elif row.get("uri") and not row.get("fain"):
                filters["uri"] = row.get("uri")
            else:
                filters["fain"] = row.get("fain")
                filters["uri"] = row.get("uri")

        award_financial_data = FinancialAccountsByAwards()

        value_map_faba = {
            "submission": submission_attributes,
            "reporting_period_start": submission_attributes.reporting_period_start,
            "reporting_period_end": submission_attributes.reporting_period_end,
            "treasury_account": treasury_account,
            "object_class": row.get("object_class"),
            "program_activity": row.get("program_activity"),
            "disaster_emergency_fund": get_disaster_emergency_fund(row),
        }

        save_manager.append(
            load_data_into_model(award_financial_data, row, value_map=value_map_faba, save=False, reverse=reverse)
        )

    save_manager.save_stragglers()
def load_file_a(submission_attributes, appropriation_data, db_cursor):
    """
    Process and load file A broker data (aka TAS balances, aka appropriation account balances).
    """
    reverse = re.compile("gross_outlay_amount_by_tas_cpe")
    skipped_tas = defaultdict(int)  # tracks count of rows skipped due to "missing" TAS

    bulk_treasury_appropriation_account_tas_lookup(appropriation_data, db_cursor)

    # Create account objects
    save_manager = BulkCreateManager(AppropriationAccountBalances)
    for row in appropriation_data:
        # Check and see if there is an entry for this TAS
        treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup(row.get("tas_id"))
        if treasury_account is None:
            skipped_tas[tas_rendering_label] += 1
            continue

        # Now that we have the account, we can load the appropriation balances
        # TODO: Figure out how we want to determine what row is overridden by what row
        # If we want to correlate, the following attributes are available in the data broker data that might be useful:
        # appropriation_id, row_number
        # appropriation_balances = something something get appropriation balances...
        appropriation_balances = AppropriationAccountBalances()

        value_map = {
            "treasury_account_identifier": treasury_account,
            "submission": submission_attributes,
            "reporting_period_start": submission_attributes.reporting_period_start,
            "reporting_period_end": submission_attributes.reporting_period_end,
        }

        field_map = {}

        save_manager.append(
            load_data_into_model(
                appropriation_balances, row, field_map=field_map, value_map=value_map, save=False, reverse=reverse
            )
        )

    save_manager.save_stragglers()

    for tas, count in skipped_tas.items():
        logger.info(f"Skipped {count:,} rows due to {tas}")

    total_tas_skipped = sum(skipped_tas.values())

    if total_tas_skipped > 0:
        logger.info(f"SKIPPED {total_tas_skipped:,} ROWS of File A (missing TAS)")
    else:
        logger.info("All File A records in Broker loaded into USAspending")
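# `BulkCreateManager` is a shared helper not shown here. Judging purely from
# how it is used above (constructed with a model class, `append`ed unsaved
# instances, flushed with `save_stragglers`), a hypothetical sketch could look
# like this -- the batch size and internals are assumptions:
class BulkCreateManager:
    def __init__(self, model, batch_size=1000):
        self.model = model
        self.batch_size = batch_size  # assumed flush threshold
        self._pending = []

    def append(self, instance):
        # Buffer unsaved instances; bulk-insert once a full batch accumulates
        self._pending.append(instance)
        if len(self._pending) >= self.batch_size:
            self.model.objects.bulk_create(self._pending)
            self._pending = []

    def save_stragglers(self):
        # Insert whatever remains after the caller's loop finishes
        if self._pending:
            self.model.objects.bulk_create(self._pending)
            self._pending = []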
def add_duns(self, new_duns, update_date):
    logger.info('Adding {} duns records'.format(len(new_duns)))
    new_records = []
    for row in new_duns:
        new_record = load_data_into_model(
            DUNS(),
            row,
            field_map={
                'awardee_or_recipient_uniqu': 'awardee_or_recipient_uniqu',
                'legal_business_name': 'legal_business_name',
                'ultimate_parent_unique_ide': 'ultimate_parent_unique_ide',
                'ultimate_parent_legal_enti': 'ultimate_parent_legal_enti',
                'broker_duns_id': 'duns_id'
            },
            value_map={'update_date': update_date},
            as_dict=False,
            save=False)
        new_records.append(new_record)
    DUNS.objects.bulk_create(new_records)
def add_duns(self, new_duns, update_date):
    logger.info("Adding {} duns records".format(len(new_duns)))
    new_records = []
    for row in new_duns:
        new_record = load_data_into_model(
            DUNS(),
            row,
            field_map={
                "awardee_or_recipient_uniqu": "awardee_or_recipient_uniqu",
                "legal_business_name": "legal_business_name",
                "ultimate_parent_unique_ide": "ultimate_parent_unique_ide",
                "ultimate_parent_legal_enti": "ultimate_parent_legal_enti",
                "broker_duns_id": "duns_id",
            },
            value_map={"update_date": update_date},
            as_dict=False,
            save=False,
        )
        new_records.append(new_record)
    DUNS.objects.bulk_create(new_records)
def load_transaction_fpds(self, fpds_broker_data, total_rows):
    logger.info('Starting bulk loading for FPDS data')

    start_time = datetime.now()
    for index, row in enumerate(fpds_broker_data, 1):
        if not (index % 10000):
            logger.info('Transaction FPDS: Loading row {} of {} ({})'.format(
                str(index), str(total_rows), datetime.now() - start_time))

        fpds_instance_data = load_data_into_model(
            TransactionFPDS(),  # thrown away
            row,
            as_dict=True)

        fpds_instance = TransactionFPDS(**fpds_instance_data)
        fpds_instance.transaction = transaction_normalized_bulk[index - 1]
        fpds_bulk.append(fpds_instance)

    logger.info('Bulk creating Transaction FPDS (batch_size: {})...'.format(BATCH_SIZE))
    TransactionFPDS.objects.bulk_create(fpds_bulk, batch_size=BATCH_SIZE)
def load_legal_entity(self, fabs_broker_data, total_rows):
    start_time = datetime.now()
    for index, row in enumerate(fabs_broker_data, 1):
        if not (index % 10000):
            logger.info('Legal Entity: Loading row {} of {} ({})'.format(
                str(index), str(total_rows), datetime.now() - start_time))

        recipient_name = row['awardee_or_recipient_legal']
        if recipient_name is None:
            recipient_name = ''

        recipient_unique_id = row['awardee_or_recipient_uniqu']
        if recipient_unique_id is None:
            recipient_unique_id = ''

        lookup_key = (recipient_unique_id, recipient_name)
        legal_entity = self.le_map.get(lookup_key)

        if not legal_entity:
            legal_entity = LegalEntity(
                recipient_unique_id=row['awardee_or_recipient_uniqu'],
                recipient_name=recipient_name)
            legal_entity = load_data_into_model(
                legal_entity,
                row,
                value_map={"location": lel_bulk[index - 1]},
                save=False)
            LegalEntity.update_business_type_categories(legal_entity)
            self.le_map[lookup_key] = legal_entity
            legal_entity_bulk.append(legal_entity)
        legal_entity_lookup.append(legal_entity)

    logger.info('Bulk creating Legal Entities (batch_size: {})...'.format(BATCH_SIZE))
    LegalEntity.objects.bulk_create(legal_entity_bulk, batch_size=BATCH_SIZE)
def _save_file_c_rows(certified_award_financial, total_rows, start_time, skipped_tas, submission_attributes, reverse):
    save_manager = BulkCreateManager(FinancialAccountsByAwards)

    for index, row in enumerate(certified_award_financial, 1):
        if not (index % 1000):
            logger.info(f"C File Load: Loading row {index:,} of {total_rows:,} ({datetime.now() - start_time})")

        upper_case_dict_values(row)

        # Check and see if there is an entry for this TAS
        treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup(row.get("tas_id"))
        if treasury_account is None:
            update_skipped_tas(row, tas_rendering_label, skipped_tas)
            continue

        award_financial_data = FinancialAccountsByAwards()

        value_map_faba = {
            "submission": submission_attributes,
            "reporting_period_start": submission_attributes.reporting_period_start,
            "reporting_period_end": submission_attributes.reporting_period_end,
            "treasury_account": treasury_account,
            "object_class": row.get("object_class"),
            "program_activity": row.get("program_activity"),
            "disaster_emergency_fund": get_disaster_emergency_fund(row),
            "distinct_award_key": create_distinct_award_key(row),
        }

        save_manager.append(
            load_data_into_model(award_financial_data, row, value_map=value_map_faba, save=False, reverse=reverse)
        )

    save_manager.save_stragglers()
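# `create_distinct_award_key` is defined elsewhere. Since File C rows carry
# piid/parent_award_id/fain/uri, a plausible (hypothetical) sketch simply
# normalizes and joins those identifiers -- the real composition and field
# order may differ:
def create_distinct_award_key(row):
    # Assumed sketch: pipe-join the award identifiers into one uppercase key
    parts = (row.get("piid"), row.get("parent_award_id"), row.get("fain"), row.get("uri"))
    return "|".join("" if part is None else str(part) for part in parts).upper()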
def load_legal_entity(self, fpds_broker_data, total_rows):
    start_time = datetime.now()
    for index, row in enumerate(fpds_broker_data, 1):
        if not (index % 10000):
            logger.info('Legal Entity: Loading row {} of {} ({})'.format(
                str(index), str(total_rows), datetime.now() - start_time))

        recipient_name = row['awardee_or_recipient_legal']
        if recipient_name is None:
            recipient_name = ''

        recipient_unique_id = row['awardee_or_recipient_uniqu']
        if recipient_unique_id is None:
            recipient_unique_id = ''

        lookup_key = (recipient_unique_id, recipient_name)
        legal_entity = self.le_map.get(lookup_key)

        if not legal_entity:
            legal_entity = LegalEntity(
                recipient_unique_id=row['awardee_or_recipient_uniqu'],
                recipient_name=recipient_name
            )
            legal_entity = load_data_into_model(
                legal_entity,
                row,
                value_map={"location": lel_bulk[index - 1]},
                save=False)
            LegalEntity.update_business_type_categories(legal_entity)
            self.le_map[lookup_key] = legal_entity
            legal_entity_bulk.append(legal_entity)
        legal_entity_lookup.append(legal_entity)

    logger.info('Bulk creating Legal Entities (batch_size: {})...'.format(BATCH_SIZE))
    LegalEntity.objects.bulk_create(legal_entity_bulk, batch_size=BATCH_SIZE)
def load_transaction_normalized(self, fpds_broker_data, total_rows):
    start_time = datetime.now()
    for index, row in enumerate(fpds_broker_data, 1):
        if not (index % 10000):
            logger.info('Transaction Normalized: Loading row {} of {} ({})'.format(
                str(index), str(total_rows), datetime.now() - start_time))

        parent_txn_value_map = {
            "award": award_lookup[index - 1],
            "awarding_agency": awarding_agency_list[index - 1],
            "funding_agency": funding_agency_list[index - 1],
            "recipient": legal_entity_lookup[index - 1],
            "place_of_performance": pop_bulk[index - 1],
            "period_of_performance_start_date": format_date(row['period_of_performance_star']),
            "period_of_performance_current_end_date": format_date(row['period_of_performance_curr']),
            "action_date": format_date(row['action_date']),
            "last_modified_date": row['last_modified']
        }

        contract_field_map = {
            "type": "contract_award_type",
            "description": "award_description"
        }

        transaction_normalized = load_data_into_model(
            TransactionNormalized(),
            row,
            field_map=contract_field_map,
            value_map=parent_txn_value_map,
            as_dict=False,
            save=False)

        transaction_normalized.fiscal_year = fy(transaction_normalized.action_date)
        transaction_normalized_bulk.append(transaction_normalized)

    logger.info('Bulk creating Transaction Normalized (batch_size: {})...'.format(BATCH_SIZE))
    TransactionNormalized.objects.bulk_create(transaction_normalized_bulk, batch_size=BATCH_SIZE)
def load_transaction_fabs(self, fabs_broker_data, total_rows):
    logger.info('Starting bulk loading for FABS data')

    start_time = datetime.now()
    for index, row in enumerate(fabs_broker_data, 1):
        if not (index % 10000):
            logger.info('Transaction FABS: Loading row {} of {} ({})'.format(
                str(index), str(total_rows), datetime.now() - start_time))

        fabs_instance_data = load_data_into_model(
            TransactionFABS(),  # thrown away
            row,
            as_dict=True)

        fabs_instance = TransactionFABS(**fabs_instance_data)
        fabs_instance.transaction = transaction_normalized_bulk[index - 1]
        fabs_bulk.append(fabs_instance)

    logger.info('Bulk creating Transaction FABS (batch_size: {})...'.format(BATCH_SIZE))
    TransactionFABS.objects.bulk_create(fabs_bulk, batch_size=BATCH_SIZE)
def get_submission_attributes(broker_submission_id, submission_data):
    """
    For a specified broker submission, return the existing corresponding usaspending
    submission record or create and return a new one.
    """
    # check if we already have an entry for this broker submission id; if not, create one
    submission_attributes, created = SubmissionAttributes.objects.get_or_create(
        broker_submission_id=broker_submission_id)

    if created:
        # this is the first time we're loading this broker submission
        logger.info("Creating broker submission id {}".format(broker_submission_id))
    else:
        # we've already loaded this broker submission, so delete it before reloading. If there's another submission
        # that references this one as a "previous submission", do not proceed.
        # TODO: now that we're chaining submissions together, get clarification on what should happen when a
        # submission in the middle of the chain is deleted
        TasProgramActivityObjectClassQuarterly.refresh_downstream_quarterly_numbers(
            submission_attributes.submission_id)
        logger.info("Broker submission id {} already exists. It will be deleted.".format(broker_submission_id))
        call_command("rm_submission", broker_submission_id)

    logger.info("Merging CGAC and FREC columns")
    submission_data["toptier_code"] = (
        submission_data["cgac_code"] if submission_data["cgac_code"] else submission_data["frec_code"]
    )

    # Find the previous submission for this CGAC and fiscal year (if there is one)
    previous_submission = get_previous_submission(
        submission_data["toptier_code"],
        submission_data["reporting_fiscal_year"],
        submission_data["reporting_fiscal_period"],
    )

    # if another submission lists the previous submission as its previous submission, set to null and update later
    potential_conflicts = []
    if previous_submission:
        potential_conflicts = SubmissionAttributes.objects.filter(previous_submission=previous_submission)
        if potential_conflicts:
            logger.info("==== ATTENTION! Previous Submission ID Conflict Detected ====")
            for conflict in potential_conflicts:
                logger.info("Temporarily setting {}'s Previous Submission ID from {} to null".format(
                    conflict, previous_submission.submission_id))
                conflict.previous_submission = None
                conflict.save()

    # Update and save submission attributes
    field_map = {
        "reporting_period_start": "reporting_start_date",
        "reporting_period_end": "reporting_end_date",
        "quarter_format_flag": "is_quarter_format",
    }

    # Create our value map - specific data to load
    value_map = {
        "broker_submission_id": broker_submission_id,
        "reporting_fiscal_quarter": get_fiscal_quarter(submission_data["reporting_fiscal_period"]),
        "previous_submission": previous_submission,
        # pull in broker's last update date to use as certified date
        "certified_date": submission_data["updated_at"].date()
        if type(submission_data["updated_at"]) == datetime else None,
    }

    new_submission = load_data_into_model(
        submission_attributes, submission_data, field_map=field_map, value_map=value_map, save=True)

    # If there were any submissions which were temporarily modified, reassign their previous submission
    for conflict in potential_conflicts:
        remapped_previous = get_previous_submission(
            conflict.toptier_code, conflict.reporting_fiscal_year, conflict.reporting_fiscal_period)
        logger.info("New Previous Submission ID for Submission ID {} permanently mapped to {}".format(
            conflict.submission_id, remapped_previous))
        conflict.previous_submission = remapped_previous
        conflict.save()

    return new_submission
def insert_new_fpds(self, to_insert, total_rows):
    place_of_performance_field_map = {
        "location_country_code": "place_of_perform_country_c",
        "country_name": "place_of_perf_country_desc",
        "state_code": "place_of_performance_state",
        "state_name": "place_of_perfor_state_desc",
        "city_name": "place_of_perform_city_name",
        "county_name": "place_of_perform_county_na",
        "county_code": "place_of_perform_county_co",
        "zip_4a": "place_of_performance_zip4a",
        "congressional_code": "place_of_performance_congr",
        "zip_last4": "place_of_perform_zip_last4",
        "zip5": "place_of_performance_zip5",
    }

    legal_entity_location_field_map = {
        "location_country_code": "legal_entity_country_code",
        "country_name": "legal_entity_country_name",
        "state_code": "legal_entity_state_code",
        "state_name": "legal_entity_state_descrip",
        "city_name": "legal_entity_city_name",
        "county_name": "legal_entity_county_name",
        "county_code": "legal_entity_county_code",
        "address_line1": "legal_entity_address_line1",
        "address_line2": "legal_entity_address_line2",
        "address_line3": "legal_entity_address_line3",
        "zip4": "legal_entity_zip4",
        "congressional_code": "legal_entity_congressional",
        "zip_last4": "legal_entity_zip_last4",
        "zip5": "legal_entity_zip5",
    }

    for index, row in enumerate(to_insert, 1):
        upper_case_dict_values(row)

        # Create new LegalEntityLocation and LegalEntity from the row data
        legal_entity_location = create_location(
            legal_entity_location_field_map, row, {"recipient_flag": True, "is_fpds": True}
        )
        recipient_name = row["awardee_or_recipient_legal"]
        legal_entity = LegalEntity.objects.create(
            recipient_unique_id=row["awardee_or_recipient_uniqu"],
            recipient_name=recipient_name if recipient_name is not None else "",
        )
        legal_entity_value_map = {
            "location": legal_entity_location,
            "business_categories": get_business_categories(row=row, data_type="fpds"),
            "is_fpds": True,
        }
        set_legal_entity_boolean_fields(row)
        legal_entity = load_data_into_model(legal_entity, row, value_map=legal_entity_value_map, save=True)

        # Create the place of performance location
        pop_location = create_location(place_of_performance_field_map, row, {"place_of_performance_flag": True})

        # Find the toptier awards from the subtier awards
        awarding_agency = Agency.get_by_subtier_only(row["awarding_sub_tier_agency_c"])
        funding_agency = Agency.get_by_subtier_only(row["funding_sub_tier_agency_co"])

        # Generate the unique Award ID
        # "CONT_AW_" + agency_id + referenced_idv_agency_iden + piid + parent_award_id
        generated_unique_id = (
            "CONT_AW_"
            + (row["agency_id"] if row["agency_id"] else "-NONE-")
            + "_"
            + (row["referenced_idv_agency_iden"] if row["referenced_idv_agency_iden"] else "-NONE-")
            + "_"
            + (row["piid"] if row["piid"] else "-NONE-")
            + "_"
            + (row["parent_award_id"] if row["parent_award_id"] else "-NONE-")
        )

        # Create the summary Award
        (created, award) = Award.get_or_create_summary_award(
            generated_unique_award_id=generated_unique_id, piid=row["piid"]
        )
        award.parent_award_piid = row.get("parent_award_id")
        award.save()

        # Append row to list of Awards updated
        AWARD_UPDATE_ID_LIST.append(award.id)

        if row["last_modified"] and len(str(row["last_modified"])) == len("YYYY-MM-DD HH:MM:SS"):  # 19 characters
            dt_fmt = "%Y-%m-%d %H:%M:%S"
        else:
            dt_fmt = "%Y-%m-%d %H:%M:%S.%f"  # try using this even if last_modified isn't a valid string

        try:
            last_mod_date = datetime.strptime(str(row["last_modified"]), dt_fmt).date()
        except ValueError:
            # handle odd-string formats and NULLs from the upstream FPDS-NG system
            info_message = "Invalid value '{}' does not match: '{}'".format(row["last_modified"], dt_fmt)
            logger.info(info_message)
            last_mod_date = None

        award_type, award_type_desc = award_types(row)

        parent_txn_value_map = {
            "award": award,
            "awarding_agency": awarding_agency,
            "funding_agency": funding_agency,
            "recipient": legal_entity,
            "place_of_performance": pop_location,
            "period_of_performance_start_date": format_date(row["period_of_performance_star"]),
            "period_of_performance_current_end_date": format_date(row["period_of_performance_curr"]),
            "action_date": format_date(row["action_date"]),
            "last_modified_date": last_mod_date,
            "transaction_unique_id": row["detached_award_proc_unique"],
            "generated_unique_award_id": generated_unique_id,
            "is_fpds": True,
            "type": award_type,
            "type_description": award_type_desc,
        }

        contract_field_map = {"description": "award_description"}

        transaction_normalized_dict = load_data_into_model(
            TransactionNormalized(),  # thrown away
            row,
            field_map=contract_field_map,
            value_map=parent_txn_value_map,
            as_dict=True,
        )

        contract_instance = load_data_into_model(TransactionFPDS(), row, as_dict=True)  # thrown away

        detached_award_proc_unique = contract_instance["detached_award_proc_unique"]
        unique_fpds = TransactionFPDS.objects.filter(detached_award_proc_unique=detached_award_proc_unique)

        if unique_fpds.first():
            transaction_normalized_dict["update_date"] = datetime.now(timezone.utc)
            transaction_normalized_dict["fiscal_year"] = fy(transaction_normalized_dict["action_date"])

            # update TransactionNormalized
            TransactionNormalized.objects.filter(id=unique_fpds.first().transaction.id).update(
                **transaction_normalized_dict
            )

            # update TransactionFPDS
            unique_fpds.update(**contract_instance)
        else:
            # create TransactionNormalized
            transaction = TransactionNormalized(**transaction_normalized_dict)
            transaction.save()

            # create TransactionFPDS
            transaction_fpds = TransactionFPDS(transaction=transaction, **contract_instance)
            transaction_fpds.save()

        # Update legal entity to map back to transaction
        legal_entity.transaction_unique_id = detached_award_proc_unique
        legal_entity.save()
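# `format_date` is a shared helper used throughout these loaders; presumably it
# parses the broker's date strings into `datetime.date` objects and returns
# None on empty or unparseable input. A hypothetical sketch (assumes the
# module-level `datetime` import used above; real parsing rules may be broader):
def format_date(date_string):
    if not date_string:
        return None
    try:
        # keep only the date portion, tolerating trailing time components
        return datetime.strptime(str(date_string)[:10], "%Y-%m-%d").date()
    except ValueError:
        return None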
def insert_new_fabs(self, to_insert):
    place_of_performance_field_map = {
        "location_country_code": "place_of_perform_country_c",
        "country_name": "place_of_perform_country_n",
        "state_code": "place_of_perfor_state_code",
        "state_name": "place_of_perform_state_nam",
        "city_name": "place_of_performance_city",
        "county_name": "place_of_perform_county_na",
        "county_code": "place_of_perform_county_co",
        "foreign_location_description": "place_of_performance_forei",
        "zip_4a": "place_of_performance_zip4a",
        "congressional_code": "place_of_performance_congr",
        "performance_code": "place_of_performance_code",
        "zip_last4": "place_of_perform_zip_last4",
        "zip5": "place_of_performance_zip5",
    }

    legal_entity_location_field_map = {
        "location_country_code": "legal_entity_country_code",
        "country_name": "legal_entity_country_name",
        "state_code": "legal_entity_state_code",
        "state_name": "legal_entity_state_name",
        "city_name": "legal_entity_city_name",
        "city_code": "legal_entity_city_code",
        "county_name": "legal_entity_county_name",
        "county_code": "legal_entity_county_code",
        "address_line1": "legal_entity_address_line1",
        "address_line2": "legal_entity_address_line2",
        "address_line3": "legal_entity_address_line3",
        "foreign_location_description": "legal_entity_foreign_descr",
        "congressional_code": "legal_entity_congressional",
        "zip_last4": "legal_entity_zip_last4",
        "zip5": "legal_entity_zip5",
        "foreign_postal_code": "legal_entity_foreign_posta",
        "foreign_province": "legal_entity_foreign_provi",
        "foreign_city_name": "legal_entity_foreign_city",
    }

    for row in to_insert:
        upper_case_dict_values(row)

        # Create new LegalEntityLocation and LegalEntity from the row data
        legal_entity_location = create_location(legal_entity_location_field_map, row, {"recipient_flag": True})
        recipient_name = row['awardee_or_recipient_legal']
        legal_entity = LegalEntity.objects.create(
            recipient_unique_id=row['awardee_or_recipient_uniqu'],
            recipient_name=recipient_name if recipient_name is not None else "",
            parent_recipient_unique_id=row['ultimate_parent_unique_ide'],
        )
        legal_entity_value_map = {
            "location": legal_entity_location,
            "business_categories": get_business_categories(row=row, data_type='fabs'),
            "business_types_description": row['business_types_desc'],
        }
        legal_entity = load_data_into_model(legal_entity, row, value_map=legal_entity_value_map, save=True)

        # Create the place of performance location
        pop_location = create_location(place_of_performance_field_map, row, {"place_of_performance_flag": True})

        # Find the toptier awards from the subtier awards
        awarding_agency = Agency.get_by_subtier_only(row["awarding_sub_tier_agency_c"])
        funding_agency = Agency.get_by_subtier_only(row["funding_sub_tier_agency_co"])

        # Generate the unique Award ID
        # "ASST_AW_" + awarding_sub_tier_agency_c + fain + uri

        # this will raise an exception if the cast to an int fails, that's ok since we don't want to process
        # non-numeric record type values
        record_type_int = int(row['record_type'])
        if record_type_int == 1:
            uri = row['uri'] if row['uri'] else '-NONE-'
            fain = '-NONE-'
        elif record_type_int in (2, 3):
            uri = '-NONE-'
            fain = row['fain'] if row['fain'] else '-NONE-'
        else:
            msg = "Invalid record type encountered for the following afa_generated_unique record: {}"
            raise Exception(msg.format(row['afa_generated_unique']))

        astac = row["awarding_sub_tier_agency_c"] if row["awarding_sub_tier_agency_c"] else "-NONE-"
        generated_unique_id = "ASST_AW_{}_{}_{}".format(astac, fain, uri)

        # Create the summary Award
        (created, award) = Award.get_or_create_summary_award(
            generated_unique_award_id=generated_unique_id,
            fain=row['fain'],
            uri=row['uri'],
            record_type=row['record_type'],
        )
        award.save()

        # Append row to list of Awards updated
        AWARD_UPDATE_ID_LIST.append(award.id)

        try:
            last_mod_date = datetime.strptime(str(row['modified_at']), "%Y-%m-%d %H:%M:%S.%f").date()
        except ValueError:
            last_mod_date = datetime.strptime(str(row['modified_at']), "%Y-%m-%d %H:%M:%S").date()

        parent_txn_value_map = {
            "award": award,
            "awarding_agency": awarding_agency,
            "funding_agency": funding_agency,
            "recipient": legal_entity,
            "place_of_performance": pop_location,
            "period_of_performance_start_date": format_date(row['period_of_performance_star']),
            "period_of_performance_current_end_date": format_date(row['period_of_performance_curr']),
            "action_date": format_date(row['action_date']),
            "last_modified_date": last_mod_date,
            "type_description": row['assistance_type_desc'],
            "transaction_unique_id": row['afa_generated_unique'],
            "generated_unique_award_id": generated_unique_id,
        }

        fad_field_map = {
            "type": "assistance_type",
            "description": "award_description",
            "funding_amount": "total_funding_amount",
        }

        transaction_normalized_dict = load_data_into_model(
            TransactionNormalized(),  # thrown away
            row,
            field_map=fad_field_map,
            value_map=parent_txn_value_map,
            as_dict=True,
        )

        financial_assistance_data = load_data_into_model(TransactionFABS(), row, as_dict=True)  # thrown away

        afa_generated_unique = financial_assistance_data['afa_generated_unique']
        unique_fabs = TransactionFABS.objects.filter(afa_generated_unique=afa_generated_unique)

        if unique_fabs.first():
            transaction_normalized_dict["update_date"] = datetime.now(timezone.utc)
            transaction_normalized_dict["fiscal_year"] = fy(transaction_normalized_dict["action_date"])

            # Update TransactionNormalized
            TransactionNormalized.objects.filter(id=unique_fabs.first().transaction.id).update(
                **transaction_normalized_dict)

            # Update TransactionFABS
            unique_fabs.update(**financial_assistance_data)
        else:
            # Create TransactionNormalized
            transaction = TransactionNormalized(**transaction_normalized_dict)
            transaction.save()

            # Create TransactionFABS
            transaction_fabs = TransactionFABS(transaction=transaction, **financial_assistance_data)
            transaction_fabs.save()

        # Update legal entity to map back to transaction
        legal_entity.transaction_unique_id = afa_generated_unique
        legal_entity.save()
def update_transaction_assistance(db_cursor, fiscal_year=None, page=1, limit=500000):

    # logger.info("Getting IDs for what's currently in the DB...")
    # current_ids = TransactionFABS.objects
    #
    # if fiscal_year:
    #     current_ids = current_ids.filter(action_date__fy=fiscal_year)
    #
    # current_ids = current_ids.values_list('published_award_financial_assistance_id', flat=True)

    query = "SELECT * FROM published_award_financial_assistance"
    arguments = []

    if fiscal_year:
        # compute the FY date bounds only when a fiscal year was requested
        fy_begin = '10/01/' + str(fiscal_year - 1)
        fy_end = '09/30/' + str(fiscal_year)
        if arguments:
            query += " AND"
        else:
            query += " WHERE"
        query += ' action_date::Date BETWEEN %s AND %s'
        arguments += [fy_begin]
        arguments += [fy_end]
    query += ' ORDER BY published_award_financial_assistance_id LIMIT %s OFFSET %s'
    arguments += [limit, (page - 1) * limit]

    logger.info("Executing query on Broker DB => " + query % (arguments[0], arguments[1], arguments[2], arguments[3]))

    db_cursor.execute(query, arguments)

    logger.info("Running dictfetchall on db_cursor")
    award_financial_assistance_data = dictfetchall(db_cursor)

    legal_entity_location_field_map = {
        "address_line1": "legal_entity_address_line1",
        "address_line2": "legal_entity_address_line2",
        "address_line3": "legal_entity_address_line3",
        "city_name": "legal_entity_city_name",
        "congressional_code": "legal_entity_congressional",
        "county_code": "legal_entity_county_code",
        "county_name": "legal_entity_county_name",
        "foreign_city_name": "legal_entity_foreign_city",
        "foreign_postal_code": "legal_entity_foreign_posta",
        "foreign_province": "legal_entity_foreign_provi",
        "state_code": "legal_entity_state_code",
        "state_name": "legal_entity_state_name",
        "zip5": "legal_entity_zip5",
        "zip_last4": "legal_entity_zip_last4",
        "location_country_code": "legal_entity_country_code"
    }

    place_of_performance_field_map = {
        "city_name": "place_of_performance_city",
        "performance_code": "place_of_performance_code",
        "congressional_code": "place_of_performance_congr",
        "county_name": "place_of_perform_county_na",
        "foreign_location_description": "place_of_performance_forei",
        "state_name": "place_of_perform_state_nam",
        "zip4": "place_of_performance_zip4a",
        "location_country_code": "place_of_perform_country_c"
    }

    fad_field_map = {
        "type": "assistance_type",
        "description": "award_description",
    }

    logger.info("Getting total rows")
    # rows_loaded = len(current_ids)
    total_rows = len(award_financial_assistance_data)  # - rows_loaded

    logger.info("Processing " + str(total_rows) + " rows of assistance data")

    # skip_count = 0

    # ROW ITERATION STARTS HERE

    lel_bulk = []
    pop_bulk = []
    legal_entity_bulk = []
    award_bulk = []

    transaction_assistance_bulk = []
    transaction_normalized_bulk = []

    logger.info('Getting legal entity location objects for {} rows...'.format(len(award_financial_assistance_data)))
    for index, row in enumerate(award_financial_assistance_data, 1):

        # Recipient flag is true for LeL
        legal_entity_location = get_or_create_location(
            legal_entity_location_field_map, row, {"recipient_flag": True}, save=False
        )

        lel_bulk.append(legal_entity_location)

    logger.info('Bulk creating {} legal entity location rows...'.format(len(lel_bulk)))
    try:
        Location.objects.bulk_create(lel_bulk)
    except IntegrityError:
        logger.info('!!! DUPLICATES FOUND. Continuing...')

    logger.info('Getting place of performance objects for {} rows...'.format(len(award_financial_assistance_data)))
    for index, row in enumerate(award_financial_assistance_data, 1):

        # Place of Performance flag is true for PoP
        pop_location = get_or_create_location(
            place_of_performance_field_map, row, {"place_of_performance_flag": True}, save=False
        )

        pop_bulk.append(pop_location)

    logger.info('Bulk creating {} place of performance rows...'.format(len(pop_bulk)))
    try:
        Location.objects.bulk_create(pop_bulk)
    except IntegrityError:
        logger.info('!!! DUPLICATES FOUND. Continuing...')

    logger.info('Getting legal entity objects for {} rows...'.format(len(award_financial_assistance_data)))
    for index, row in enumerate(award_financial_assistance_data, 1):

        recipient_name = row.get('awardee_or_recipient_legal', '')

        legal_entity = LegalEntity.objects.filter(
            recipient_unique_id=row['awardee_or_recipient_uniqu'],
            recipient_name=recipient_name).first()

        if legal_entity is None:
            legal_entity = LegalEntity(
                recipient_unique_id=row['awardee_or_recipient_uniqu'],
                recipient_name=recipient_name)
            legal_entity_value_map = {
                "location": lel_bulk[index - 1],
            }
            legal_entity = load_data_into_model(legal_entity, row, value_map=legal_entity_value_map, save=False)

        legal_entity_bulk.append(legal_entity)

    logger.info('Bulk creating {} legal entity rows...'.format(len(legal_entity_bulk)))
    try:
        LegalEntity.objects.bulk_create(legal_entity_bulk)
    except IntegrityError:
        logger.info('!!! DUPLICATES FOUND. Continuing...')

    awarding_agency_list = []
    funding_agency_list = []

    logger.info('Getting award objects for {} rows...'.format(len(award_financial_assistance_data)))
    for index, row in enumerate(award_financial_assistance_data, 1):

        # If awarding toptier agency code (aka CGAC) is not supplied on the D2 record,
        # use the sub tier code to look it up. This code assumes that all incoming
        # records will supply an awarding subtier agency code
        if row['awarding_agency_code'] is None or len(row['awarding_agency_code'].strip()) < 1:
            awarding_subtier_agency_id = subtier_agency_map[row["awarding_sub_tier_agency_c"]]
            awarding_toptier_agency_id = subtier_to_agency_map[awarding_subtier_agency_id]['toptier_agency_id']
            awarding_cgac_code = toptier_agency_map[awarding_toptier_agency_id]
            row['awarding_agency_code'] = awarding_cgac_code

        # If funding toptier agency code (aka CGAC) is empty, try using the sub
        # tier funding code to look it up. Unlike the awarding agency, we can't
        # assume that the funding agency subtier code will always be present.
        if row['funding_agency_code'] is None or len(row['funding_agency_code'].strip()) < 1:
            funding_subtier_agency_id = subtier_agency_map.get(row["funding_sub_tier_agency_co"])
            if funding_subtier_agency_id is not None:
                funding_toptier_agency_id = subtier_to_agency_map[funding_subtier_agency_id]['toptier_agency_id']
                funding_cgac_code = toptier_agency_map[funding_toptier_agency_id]
            else:
                funding_cgac_code = None
            row['funding_agency_code'] = funding_cgac_code

        # Find the award that this award transaction belongs to. If it doesn't exist, create it.
        awarding_agency = Agency.get_by_toptier_subtier(
            row['awarding_agency_code'],
            row["awarding_sub_tier_agency_c"]
        )
        funding_agency = Agency.get_by_toptier_subtier(
            row['funding_agency_code'],
            row["funding_sub_tier_agency_co"]
        )
        awarding_agency_list.append(awarding_agency)
        funding_agency_list.append(funding_agency)

        # award.save() is called in Award.get_or_create_summary_award by default
        created, award = Award.get_or_create_summary_award(
            awarding_agency=awarding_agency,
            fain=row.get('fain'),
            uri=row.get('uri'),
            save=False
        )

        award_bulk.append(award)
        award_update_id_list.append(award.id)

    logger.info('Bulk creating {} award rows...'.format(len(award_bulk)))
    try:
        Award.objects.bulk_create(award_bulk)
    except IntegrityError:
        logger.info('!!! DUPLICATES FOUND. Continuing...')

    logger.info('Getting transaction_normalized for {} rows...'.format(len(award_financial_assistance_data)))
    for index, row in enumerate(award_financial_assistance_data, 1):

        parent_txn_value_map = {
            "award": award_bulk[index - 1],
            "awarding_agency": awarding_agency_list[index - 1],
            "funding_agency": funding_agency_list[index - 1],
            "recipient": legal_entity_bulk[index - 1],
            "place_of_performance": pop_bulk[index - 1],
            "period_of_performance_start_date": format_date(row['period_of_performance_star']),
            "period_of_performance_current_end_date": format_date(row['period_of_performance_curr']),
            "action_date": format_date(row['action_date']),
        }

        transaction_dict = load_data_into_model(
            TransactionNormalized(),  # thrown away
            row,
            field_map=fad_field_map,
            value_map=parent_txn_value_map,
            as_dict=True)

        transaction_normalized = TransactionNormalized.get_or_create_transaction(**transaction_dict)
        transaction_normalized.fiscal_year = fy(transaction_normalized.action_date)
        transaction_normalized_bulk.append(transaction_normalized)

    logger.info('Bulk creating {} TransactionNormalized rows...'.format(len(transaction_normalized_bulk)))
    try:
        TransactionNormalized.objects.bulk_create(transaction_normalized_bulk)
    except IntegrityError:
        logger.info('Tried and failed to insert duplicate transaction_normalized row. Continuing...')

    for index, row in enumerate(award_financial_assistance_data, 1):
        financial_assistance_data = load_data_into_model(
            TransactionFABS(),  # thrown away
            row,
            as_dict=True)

        transaction_assistance = TransactionFABS(
            transaction=transaction_normalized_bulk[index - 1], **financial_assistance_data)
        transaction_assistance_bulk.append(transaction_assistance)

    logger.info('Bulk creating TransactionFABS rows...')
    try:
        TransactionFABS.objects.bulk_create(transaction_assistance_bulk)
    except IntegrityError:
        logger.info('!!! DUPLICATES FOUND. Continuing...')
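# `dictfetchall` follows the standard recipe from the Django documentation for
# turning raw-cursor rows into dicts keyed by column name. Reproduced here for
# reference, on the assumption the project's helper matches the docs' version:
def dictfetchall(cursor):
    # Return all rows from a cursor as a list of dicts
    columns = [col[0] for col in cursor.description]
    return [dict(zip(columns, row)) for row in cursor.fetchall()]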
def update_transaction_contract(db_cursor, fiscal_year=None, page=1, limit=500000):

    # logger.info("Getting IDs for what's currently in the DB...")
    # current_ids = TransactionFPDS.objects
    #
    # if fiscal_year:
    #     current_ids = current_ids.filter(action_date__fy=fiscal_year)
    #
    # current_ids = current_ids.values_list('detached_award_procurement_id', flat=True)

    query = "SELECT * FROM detached_award_procurement"
    arguments = []

    # Only build the fiscal year window when a fiscal year was requested;
    # computing it unconditionally would raise a TypeError when fiscal_year is None
    if fiscal_year:
        query += ' WHERE action_date::Date BETWEEN %s AND %s'
        arguments += ['10/01/' + str(fiscal_year - 1), '09/30/' + str(fiscal_year)]

    query += ' ORDER BY detached_award_procurement_id LIMIT %s OFFSET %s'
    arguments += [limit, (page - 1) * limit]

    # Log the query and its parameters separately; interpolating a fixed number of
    # arguments into the query string breaks when the fiscal year filter is absent
    logger.info("Executing query on Broker DB => %s with arguments %s", query, arguments)
    db_cursor.execute(query, arguments)

    logger.info("Running dictfetchall on db_cursor")
    procurement_data = dictfetchall(db_cursor)

    legal_entity_location_field_map = {
        "address_line1": "legal_entity_address_line1",
        "address_line2": "legal_entity_address_line2",
        "address_line3": "legal_entity_address_line3",
        "location_country_code": "legal_entity_country_code",
        "city_name": "legal_entity_city_name",
        "congressional_code": "legal_entity_congressional",
        "state_code": "legal_entity_state_code",
        "zip4": "legal_entity_zip4"
    }

    legal_entity_location_value_map = {
        "recipient_flag": True
    }

    place_of_performance_field_map = {
        # not sure place_of_performance_locat maps exactly to city name
        # "city_name": "place_of_performance_locat",  # location id doesn't mean it's a city. Can't use this mapping
        "congressional_code": "place_of_performance_congr",
        "state_code": "place_of_performance_state",
        "zip4": "place_of_performance_zip4a",
        "location_country_code": "place_of_perform_country_c"
    }

    place_of_performance_value_map = {
        "place_of_performance_flag": True
    }

    contract_field_map = {
        "type": "contract_award_type",
        "description": "award_description"
    }

    logger.info("Getting total rows")
    # rows_loaded = len(current_ids)
    total_rows = len(procurement_data)  # - rows_loaded

    logger.info("Processing " + str(total_rows) + " rows of procurement data")

    # skip_count = 0
    start_time = datetime.now()
    for index, row in enumerate(procurement_data, 1):
        with db_transaction.atomic():
            # if TransactionFPDS.objects.values('detached_award_procurement_id').\
            #         filter(detached_award_procurement_id=str(row['detached_award_procurement_id'])).first():
            #     skip_count += 1
            #
            #     if not (skip_count % 100):
            #         logger.info('Skipped {} records so far'.format(str(skip_count)))

            if not (index % 100):
                logger.info('D1 File Load: Loading row {} of {} ({})'.format(
                    str(index), str(total_rows), datetime.now() - start_time))

            recipient_name = row['awardee_or_recipient_legal']
            if recipient_name is None:
                recipient_name = ""

            legal_entity_location, created = get_or_create_location(
                legal_entity_location_field_map, row, copy(legal_entity_location_value_map)
            )

            # Create the legal entity if it doesn't exist
            legal_entity, created = LegalEntity.objects.get_or_create(
                recipient_unique_id=row['awardee_or_recipient_uniqu'],
                recipient_name=recipient_name
            )

            if created:
                legal_entity_value_map = {
                    "location": legal_entity_location,
                }
                legal_entity = load_data_into_model(legal_entity, row, value_map=legal_entity_value_map, save=True)

            # Create the place of performance location
            pop_location, created = get_or_create_location(
                place_of_performance_field_map, row, copy(place_of_performance_value_map))

            # If the awarding toptier agency code (aka CGAC) is not supplied on the D1 record,
            # use the sub tier code to look it up. This code assumes that all incoming
            # records will supply an awarding subtier agency code
            if row['awarding_agency_code'] is None or len(row['awarding_agency_code'].strip()) < 1:
                awarding_subtier_agency_id = subtier_agency_map[row["awarding_sub_tier_agency_c"]]
                awarding_toptier_agency_id = subtier_to_agency_map[awarding_subtier_agency_id]['toptier_agency_id']
                awarding_cgac_code = toptier_agency_map[awarding_toptier_agency_id]
                row['awarding_agency_code'] = awarding_cgac_code

            # If the funding toptier agency code (aka CGAC) is empty, try using the sub
            # tier funding code to look it up. Unlike the awarding agency, we can't
            # assume that the funding agency subtier code will always be present.
            if row['funding_agency_code'] is None or len(row['funding_agency_code'].strip()) < 1:
                funding_subtier_agency_id = subtier_agency_map.get(row["funding_sub_tier_agency_co"])
                if funding_subtier_agency_id is not None:
                    funding_toptier_agency_id = subtier_to_agency_map[funding_subtier_agency_id]['toptier_agency_id']
                    funding_cgac_code = toptier_agency_map[funding_toptier_agency_id]
                else:
                    funding_cgac_code = None
                row['funding_agency_code'] = funding_cgac_code

            # Find the award that this award transaction belongs to. If it doesn't exist, create it.
            awarding_agency = Agency.get_by_toptier_subtier(
                row['awarding_agency_code'],
                row["awarding_sub_tier_agency_c"]
            )
            created, award = Award.get_or_create_summary_award(
                awarding_agency=awarding_agency,
                piid=row.get('piid'),
                fain=row.get('fain'),
                uri=row.get('uri'),
                parent_award_piid=row.get('parent_award_id'))
            award.save()

            award_update_id_list.append(award.id)
            award_contract_update_id_list.append(award.id)

            parent_txn_value_map = {
                "award": award,
                "awarding_agency": awarding_agency,
                "funding_agency": Agency.get_by_toptier_subtier(row['funding_agency_code'],
                                                                row["funding_sub_tier_agency_co"]),
                "recipient": legal_entity,
                "place_of_performance": pop_location,
                "period_of_performance_start_date": format_date(row['period_of_performance_star']),
                "period_of_performance_current_end_date": format_date(row['period_of_performance_curr']),
                "action_date": format_date(row['action_date']),
            }

            transaction_dict = load_data_into_model(
                TransactionNormalized(),  # thrown away
                row,
                field_map=contract_field_map,
                value_map=parent_txn_value_map,
                as_dict=True)

            transaction = TransactionNormalized.get_or_create_transaction(**transaction_dict)
            transaction.save()

            contract_instance = load_data_into_model(
                TransactionFPDS(),  # thrown away
                row,
                as_dict=True)

            transaction_contract = TransactionFPDS(transaction=transaction, **contract_instance)
            # catch exception and do nothing if we see
            # "django.db.utils.IntegrityError: duplicate key value violates unique constraint"
            try:
                transaction_contract.save()
            except IntegrityError:
                pass
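# NOTE: a pure-function restatement of the CGAC backfill above, assuming the
# module-level lookup maps used there: subtier_agency_map (subtier code ->
# subtier agency id), subtier_to_agency_map (subtier agency id ->
# {'toptier_agency_id': ...}) and toptier_agency_map (toptier agency id ->
# CGAC code). The function name is hypothetical.
def backfill_cgac_code(agency_code, sub_tier_code, required=True):
    if agency_code and agency_code.strip():
        return agency_code  # already supplied; nothing to backfill
    subtier_agency_id = subtier_agency_map.get(sub_tier_code)
    if subtier_agency_id is None:
        if required:
            # awarding records are assumed to always carry a subtier code
            raise KeyError(sub_tier_code)
        return None  # funding subtier codes may legitimately be missing
    toptier_agency_id = subtier_to_agency_map[subtier_agency_id]['toptier_agency_id']
    return toptier_agency_map[toptier_agency_id]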
def insert_new_fabs(to_insert): fabs_normalized_field_map = { "type": "assistance_type", "description": "award_description", "funding_amount": "total_funding_amount", } fabs_field_map = { "officer_1_name": "high_comp_officer1_full_na", "officer_1_amount": "high_comp_officer1_amount", "officer_2_name": "high_comp_officer2_full_na", "officer_2_amount": "high_comp_officer2_amount", "officer_3_name": "high_comp_officer3_full_na", "officer_3_amount": "high_comp_officer3_amount", "officer_4_name": "high_comp_officer4_full_na", "officer_4_amount": "high_comp_officer4_amount", "officer_5_name": "high_comp_officer5_full_na", "officer_5_amount": "high_comp_officer5_amount", } update_award_ids = [] for row in to_insert: upper_case_dict_values(row) # Find the toptier awards from the subtier awards awarding_agency = Agency.get_by_subtier_only( row["awarding_sub_tier_agency_c"]) funding_agency = Agency.get_by_subtier_only( row["funding_sub_tier_agency_co"]) # Create the summary Award (created, award) = Award.get_or_create_summary_award( generated_unique_award_id=row["unique_award_key"], fain=row["fain"], uri=row["uri"], record_type=row["record_type"], ) award.save() # Append row to list of Awards updated update_award_ids.append(award.id) try: last_mod_date = datetime.strptime(str(row["modified_at"]), "%Y-%m-%d %H:%M:%S.%f").date() except ValueError: last_mod_date = datetime.strptime(str(row["modified_at"]), "%Y-%m-%d %H:%M:%S").date() parent_txn_value_map = { "award": award, "awarding_agency": awarding_agency, "funding_agency": funding_agency, "period_of_performance_start_date": format_date(row["period_of_performance_star"]), "period_of_performance_current_end_date": format_date(row["period_of_performance_curr"]), "action_date": format_date(row["action_date"]), "last_modified_date": last_mod_date, "type_description": row["assistance_type_desc"], "transaction_unique_id": row["afa_generated_unique"], "business_categories": get_business_categories(row=row, data_type="fabs"), } transaction_normalized_dict = load_data_into_model( TransactionNormalized(), # thrown away row, field_map=fabs_normalized_field_map, value_map=parent_txn_value_map, as_dict=True, ) financial_assistance_data = load_data_into_model( TransactionFABS(), row, field_map=fabs_field_map, as_dict=True # thrown away ) # Hack to cut back on the number of warnings dumped to the log. 
financial_assistance_data["updated_at"] = cast_datetime_to_utc( financial_assistance_data["updated_at"]) financial_assistance_data["created_at"] = cast_datetime_to_utc( financial_assistance_data["created_at"]) financial_assistance_data["modified_at"] = cast_datetime_to_utc( financial_assistance_data["modified_at"]) afa_generated_unique = financial_assistance_data[ "afa_generated_unique"] unique_fabs = TransactionFABS.objects.filter( afa_generated_unique=afa_generated_unique) if unique_fabs.first(): transaction_normalized_dict["update_date"] = datetime.now( timezone.utc) transaction_normalized_dict["fiscal_year"] = fy( transaction_normalized_dict["action_date"]) # Update TransactionNormalized TransactionNormalized.objects.filter( id=unique_fabs.first().transaction.id).update( **transaction_normalized_dict) # Update TransactionFABS unique_fabs.update(**financial_assistance_data) else: # Create TransactionNormalized transaction_normalized = TransactionNormalized( **transaction_normalized_dict) transaction_normalized.save() # Create TransactionFABS transaction_fabs = TransactionFABS( transaction=transaction_normalized, **financial_assistance_data) transaction_fabs.save() return update_award_ids
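# NOTE: a condensed sketch of the insert-or-update branch above. Django's
# update_or_create() would collapse the filter().first() dance into one call,
# but the two-step form keeps the TransactionNormalized and TransactionFABS
# updates as plain UPDATE queries. Either form can race under concurrent
# writers; the wrapper name is hypothetical.
def upsert_fabs(transaction_normalized_dict, financial_assistance_data):
    existing = TransactionFABS.objects.filter(
        afa_generated_unique=financial_assistance_data["afa_generated_unique"]).first()
    if existing:
        TransactionNormalized.objects.filter(
            id=existing.transaction_id).update(**transaction_normalized_dict)
        TransactionFABS.objects.filter(pk=existing.pk).update(**financial_assistance_data)
    else:
        transaction = TransactionNormalized.objects.create(**transaction_normalized_dict)
        TransactionFABS.objects.create(transaction=transaction, **financial_assistance_data)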
def insert_new_fpds(self, to_insert, total_rows): logger.info('Starting insertion of new FPDS data') place_of_performance_field_map = { "location_country_code": "place_of_perform_country_c", "country_name": "place_of_perf_country_desc", "state_code": "place_of_performance_state", "state_name": "place_of_perfor_state_desc", "city_name": "place_of_perform_city_name", "county_name": "place_of_perform_county_na", "county_code": "place_of_perform_county_co", "zip_4a": "place_of_performance_zip4a", "congressional_code": "place_of_performance_congr", "zip_last4": "place_of_perform_zip_last4", "zip5": "place_of_performance_zip5" } legal_entity_location_field_map = { "location_country_code": "legal_entity_country_code", "country_name": "legal_entity_country_name", "state_code": "legal_entity_state_code", "state_name": "legal_entity_state_descrip", "city_name": "legal_entity_city_name", "county_name": "legal_entity_county_name", "county_code": "legal_entity_county_code", "address_line1": "legal_entity_address_line1", "address_line2": "legal_entity_address_line2", "address_line3": "legal_entity_address_line3", "zip4": "legal_entity_zip4", "congressional_code": "legal_entity_congressional", "zip_last4": "legal_entity_zip_last4", "zip5": "legal_entity_zip5" } start_time = datetime.now() for index, row in enumerate(to_insert, 1): if not (index % 1000): logger.info( 'Inserting Stale FPDS: Inserting row {} of {} ({})'.format( str(index), str(total_rows), datetime.now() - start_time)) for key in row: if isinstance(row[key], str): row[key] = row[key].upper() # Create new LegalEntityLocation and LegalEntity from the row data legal_entity_location = create_location( legal_entity_location_field_map, row, { "recipient_flag": True, "is_fpds": True }) recipient_name = row['awardee_or_recipient_legal'] legal_entity = LegalEntity.objects.create( recipient_unique_id=row['awardee_or_recipient_uniqu'], recipient_name=recipient_name if recipient_name is not None else "") legal_entity_value_map = { "location": legal_entity_location, "business_categories": get_business_categories(row=row, data_type='fpds'), "is_fpds": True } set_legal_entity_boolean_fields(row) legal_entity = load_data_into_model( legal_entity, row, value_map=legal_entity_value_map, save=True) # Create the place of performance location pop_location = create_location(place_of_performance_field_map, row, {"place_of_performance_flag": True}) # Find the toptier awards from the subtier awards awarding_agency = Agency.get_by_subtier_only( row["awarding_sub_tier_agency_c"]) funding_agency = Agency.get_by_subtier_only( row["funding_sub_tier_agency_co"]) # Generate the unique Award ID # "CONT_AW_" + agency_id + referenced_idv_agency_iden + piid + parent_award_id generated_unique_id = 'CONT_AW_' + (row['agency_id'] if row['agency_id'] else '-NONE-') + '_' + \ (row['referenced_idv_agency_iden'] if row['referenced_idv_agency_iden'] else '-NONE-') + '_' + \ (row['piid'] if row['piid'] else '-NONE-') + '_' + \ (row['parent_award_id'] if row['parent_award_id'] else '-NONE-') # Create the summary Award (created, award) = Award.get_or_create_summary_award( generated_unique_award_id=generated_unique_id, piid=row['piid']) award.parent_award_piid = row.get('parent_award_id') award.save() # Append row to list of Awards updated award_update_id_list.append(award.id) try: last_mod_date = datetime.strptime(str( row['last_modified']), "%Y-%m-%d %H:%M:%S.%f").date() except ValueError: last_mod_date = datetime.strptime(str(row['last_modified']), "%Y-%m-%d %H:%M:%S").date() 
parent_txn_value_map = { "award": award, "awarding_agency": awarding_agency, "funding_agency": funding_agency, "recipient": legal_entity, "place_of_performance": pop_location, "period_of_performance_start_date": format_date(row['period_of_performance_star']), "period_of_performance_current_end_date": format_date(row['period_of_performance_curr']), "action_date": format_date(row['action_date']), "last_modified_date": last_mod_date, "transaction_unique_id": row['detached_award_proc_unique'], "generated_unique_award_id": generated_unique_id, "is_fpds": True } contract_field_map = { "type": "contract_award_type", "type_description": "contract_award_type_desc", "description": "award_description" } transaction_normalized_dict = load_data_into_model( TransactionNormalized(), # thrown away row, field_map=contract_field_map, value_map=parent_txn_value_map, as_dict=True) contract_instance = load_data_into_model( TransactionFPDS(), # thrown away row, as_dict=True) detached_award_proc_unique = contract_instance[ 'detached_award_proc_unique'] unique_fpds = TransactionFPDS.objects.filter( detached_award_proc_unique=detached_award_proc_unique) if unique_fpds.first(): transaction_normalized_dict["update_date"] = datetime.utcnow() transaction_normalized_dict["fiscal_year"] = fy( transaction_normalized_dict["action_date"]) # update TransactionNormalized TransactionNormalized.objects.filter(id=unique_fpds.first().transaction.id).\ update(**transaction_normalized_dict) # update TransactionFPDS unique_fpds.update(**contract_instance) else: # create TransactionNormalized transaction = TransactionNormalized( **transaction_normalized_dict) transaction.save() # create TransactionFPDS transaction_fpds = TransactionFPDS(transaction=transaction, **contract_instance) transaction_fpds.save()
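# NOTE: the two-format strptime fallback above recurs throughout this file,
# because Broker timestamps sometimes omit microseconds. A shared helper
# (name hypothetical) keeps the ValueError dance in one place:
from datetime import datetime

def parse_broker_timestamp_to_date(value):
    text = str(value)
    for fmt in ("%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(text, fmt).date()
        except ValueError:
            continue
    raise ValueError("Unrecognized Broker timestamp: {}".format(text))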
def load_file_c(submission_attributes, db_cursor, award_financial_frame): """ Process and load file C broker data. Note: this should run AFTER the D1 and D2 files are loaded because we try to join to those records to retrieve some additional information about the awarding sub-tier agency. """ # this matches the file b reverse directive, but am repeating it here to ensure that we don't overwrite it as we # change up the order of file loading if not award_financial_frame.size: logger.warning('No File C (award financial) data found, skipping...') return reverse = re.compile(r'(_(cpe|fyb)$)|^transaction_obligated_amount$') # dictionary to capture TAS that were skipped and some metadata # tas = top-level key # count = number of rows skipped # rows = row numbers skipped, corresponding to the original row numbers in the file that was submitted skipped_tas = {} award_financial_frame['txn'] = award_financial_frame.apply(get_award_financial_transaction, axis=1) award_financial_frame['awarding_agency'] = award_financial_frame.apply(get_awarding_agency, axis=1) award_financial_frame['object_class'] = award_financial_frame.apply(get_or_create_object_class_rw, axis=1, logger=logger) award_financial_frame['program_activity'] = award_financial_frame.apply(get_or_create_program_activity, axis=1, submission_attributes=submission_attributes) total_rows = award_financial_frame.shape[0] start_time = datetime.now() awards_touched = [] # format award_financial_frame float_cols = ['transaction_obligated_amou'] award_financial_frame[float_cols] = award_financial_frame[float_cols].fillna(0) award_financial_frame = award_financial_frame.replace({np.nan: None}) for index, row in enumerate(award_financial_frame.to_dict(orient='records'), 1): if not (index % 100): logger.info('C File Load: Loading row {} of {} ({})'.format(str(index), str(total_rows), datetime.now() - start_time)) upper_case_dict_values(row) # Check and see if there is an entry for this TAS treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup( row.get('tas_id'), db_cursor) if treasury_account is None: update_skipped_tas(row, tas_rendering_label, skipped_tas) continue # Find a matching transaction record, so we can use its subtier agency information to match to (or create) an # Award record. # Find the award that this award transaction belongs to. If it doesn't exist, create it. 
filters = {} if row.get('piid'): filters['piid'] = row.get('piid') filters['parent_piid'] = row.get('parent_award_id') else: if row.get('fain') and not row.get('uri'): filters['fain'] = row.get('fain') elif row.get('uri') and not row.get('fain'): filters['uri'] = row.get('uri') else: filters['fain'] = row.get('fain') filters['uri'] = row.get('uri') award = find_matching_award(**filters) if award: awards_touched += [award] award_financial_data = FinancialAccountsByAwards() value_map_faba = { 'award': award, 'submission': submission_attributes, 'reporting_period_start': submission_attributes.reporting_period_start, 'reporting_period_end': submission_attributes.reporting_period_end, 'treasury_account': treasury_account, 'object_class': row.get('object_class'), 'program_activity': row.get('program_activity'), } # Still using the cpe|fyb regex compiled above for reverse load_data_into_model(award_financial_data, row, value_map=value_map_faba, save=True, reverse=reverse) awards_cache.clear() for key in skipped_tas: logger.info('Skipped %d rows due to missing TAS: %s', skipped_tas[key]['count'], key) total_tas_skipped = 0 for key in skipped_tas: total_tas_skipped += skipped_tas[key]['count'] logger.info('Skipped a total of {} TAS rows for File C'.format(total_tas_skipped)) return [award.id for award in awards_touched if award]
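# NOTE: a pure-function restatement of the filter-building branch above, making
# the precedence explicit: a PIID marks a contract (the parent PIID rides
# along); otherwise FAIN and URI are tried individually, and only rows carrying
# both (or neither) send both filters. The helper name is hypothetical.
def build_award_lookup_filters(row):
    if row.get('piid'):
        return {'piid': row.get('piid'), 'parent_piid': row.get('parent_award_id')}
    if row.get('fain') and not row.get('uri'):
        return {'fain': row.get('fain')}
    if row.get('uri') and not row.get('fain'):
        return {'uri': row.get('uri')}
    return {'fain': row.get('fain'), 'uri': row.get('uri')}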
def load_file_c(submission_attributes, db_cursor, award_financial_frame): """ Process and load file C broker data. Note: this should run AFTER the D1 and D2 files are loaded because we try to join to those records to retrieve some additional information about the awarding sub-tier agency. """ # this matches the file b reverse directive, but am repeating it here # to ensure that we don't overwrite it as we change up the order of # file loading reverse = re.compile(r'(_(cpe|fyb)$)|^transaction_obligated_amount$') award_financial_frame['txn'] = award_financial_frame.apply( get_award_financial_transaction, axis=1) award_financial_frame['awarding_agency'] = award_financial_frame.apply( get_awarding_agency, axis=1) award_financial_frame['object_class'] = award_financial_frame.apply( get_or_create_object_class_rw, axis=1, logger=logger) award_financial_frame['program_activity'] = award_financial_frame.apply( get_or_create_program_activity, axis=1, submission_attributes=submission_attributes) # for row in award_financial_data: for row in award_financial_frame.replace({ np.nan: None }).to_dict(orient='records'): # Check and see if there is an entry for this TAS treasury_account = get_treasury_appropriation_account_tas_lookup( row.get('tas_id'), db_cursor) if treasury_account is None: raise Exception('Could not find appropriation account for TAS: ' + row['tas']) # Find a matching transaction record, so we can use its # subtier agency information to match to (or create) an Award record # Find the award that this award transaction belongs to. If it doesn't exist, create it. created, award = Award.get_or_create_summary_award( awarding_agency=row['awarding_agency'], piid=row.get('piid'), fain=row.get('fain'), uri=row.get('uri'), parent_award_id=row.get('parent_award_id'), use_cache=False) award_financial_data = FinancialAccountsByAwards() value_map = { 'award': award, 'submission': submission_attributes, 'reporting_period_start': submission_attributes.reporting_period_start, 'reporting_period_end': submission_attributes.reporting_period_end, 'treasury_account': treasury_account, 'object_class': row.get('object_class'), 'program_activity': row.get('program_activity'), } # Still using the cpe|fyb regex compiled above for reverse afd = load_data_into_model(award_financial_data, row, value_map=value_map, save=True, reverse=reverse) awards_cache.clear()
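# NOTE: a self-contained demonstration of the `reverse` pattern shared by the
# File B and File C loaders above. It matches monetary columns ending in _cpe
# (current period ending) or _fyb (fiscal year beginning), plus the exact name
# transaction_obligated_amount; load_data_into_model applies its reversal
# handling (per the parameter's name) to matching fields. Note that the
# truncated Broker column name transaction_obligated_amou does NOT match the
# anchored pattern.
import re

reverse = re.compile(r'(_(cpe|fyb)$)|^transaction_obligated_amount$')
assert reverse.search('gross_outlay_amount_by_tas_cpe')
assert reverse.search('budget_authority_available_fyb')
assert reverse.search('transaction_obligated_amount')
assert reverse.search('transaction_obligated_amou') is None
assert reverse.search('object_class') is None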
def load_locations(self, fpds_broker_data, total_rows, pop_flag=False): start_time = datetime.now() for index, row in enumerate(fpds_broker_data, 1): if not (index % 10000): logger.info('Locations: Loading row {} of {} ({})'.format(str(index), str(total_rows), datetime.now() - start_time)) if pop_flag: location_value_map = {"place_of_performance_flag": True} field_map = pop_field_map else: location_value_map = {'recipient_flag': True} field_map = le_field_map row = canonicalize_location_dict(row) # THIS ASSUMPTION DOES NOT HOLD FOR FPDS SINCE IT DOES NOT HAVE A PLACE OF PERFORMANCE CODE # We can assume that if the country code is blank and the place of performance code is NOT '00FORGN', then # the country code is USA # if pop_flag and not country_code and pop_code != '00FORGN': # row[field_map.get('location_country_code')] = 'USA' # Get country code obj location_country_code_obj = self.country_code_map.get(row[field_map.get('location_country_code')]) # Fix state code periods state_code = row.get(field_map.get('state_code')) if state_code is not None: location_value_map.update({'state_code': state_code.replace('.', '')}) if location_country_code_obj: location_value_map.update({ 'location_country_code': location_country_code_obj, 'country_name': location_country_code_obj.country_name }) if location_country_code_obj.country_code != 'USA': location_value_map.update({ 'state_code': None, 'state_name': None }) else: # no country found for this code location_value_map.update({ 'location_country_code': None, 'country_name': None }) location_instance_data = load_data_into_model( Location(), row, value_map=location_value_map, field_map=field_map, as_dict=True) loc_instance = Location(**location_instance_data) loc_instance.load_city_county_data() loc_instance.fill_missing_state_data() loc_instance.fill_missing_zip5() if pop_flag: pop_bulk.append(loc_instance) else: lel_bulk.append(loc_instance) if pop_flag: logger.info('Bulk creating POP Locations (batch_size: {})...'.format(BATCH_SIZE)) Location.objects.bulk_create(pop_bulk, batch_size=BATCH_SIZE) else: logger.info('Bulk creating LE Locations (batch_size: {})...'.format(BATCH_SIZE)) Location.objects.bulk_create(lel_bulk, batch_size=BATCH_SIZE)
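# NOTE: a pure-function sketch of the per-row normalization above: strip stray
# periods out of state codes, then, once the country object is resolved, blank
# the state fields for non-USA rows. The helper name is hypothetical.
def normalize_state_and_country(value_map, state_code, country):
    if state_code is not None:
        value_map['state_code'] = state_code.replace('.', '')
    if country:
        value_map['location_country_code'] = country
        value_map['country_name'] = country.country_name
        if country.country_code != 'USA':
            value_map['state_code'] = None
            value_map['state_name'] = None
    else:
        # no country record matched this code
        value_map['location_country_code'] = None
        value_map['country_name'] = None
    return value_map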
def insert_new_fabs(to_insert): place_of_performance_field_map = { "location_country_code": "place_of_perform_country_c", "country_name": "place_of_perform_country_n", "state_code": "place_of_perfor_state_code", "state_name": "place_of_perform_state_nam", "city_name": "place_of_performance_city", "county_name": "place_of_perform_county_na", "county_code": "place_of_perform_county_co", "foreign_location_description": "place_of_performance_forei", "zip_4a": "place_of_performance_zip4a", "congressional_code": "place_of_performance_congr", "performance_code": "place_of_performance_code", "zip_last4": "place_of_perform_zip_last4", "zip5": "place_of_performance_zip5", } legal_entity_location_field_map = { "location_country_code": "legal_entity_country_code", "country_name": "legal_entity_country_name", "state_code": "legal_entity_state_code", "state_name": "legal_entity_state_name", "city_name": "legal_entity_city_name", "city_code": "legal_entity_city_code", "county_name": "legal_entity_county_name", "county_code": "legal_entity_county_code", "address_line1": "legal_entity_address_line1", "address_line2": "legal_entity_address_line2", "address_line3": "legal_entity_address_line3", "foreign_location_description": "legal_entity_foreign_descr", "congressional_code": "legal_entity_congressional", "zip_last4": "legal_entity_zip_last4", "zip5": "legal_entity_zip5", "foreign_postal_code": "legal_entity_foreign_posta", "foreign_province": "legal_entity_foreign_provi", "foreign_city_name": "legal_entity_foreign_city", } update_award_ids = [] for row in to_insert: upper_case_dict_values(row) # Create new LegalEntityLocation and LegalEntity from the row data legal_entity_location = create_location(legal_entity_location_field_map, row, {"recipient_flag": True}) recipient_name = row['awardee_or_recipient_legal'] legal_entity = LegalEntity.objects.create( recipient_unique_id=row['awardee_or_recipient_uniqu'], recipient_name=recipient_name if recipient_name is not None else "", parent_recipient_unique_id=row['ultimate_parent_unique_ide'], ) legal_entity_value_map = { "location": legal_entity_location, "business_categories": get_business_categories(row=row, data_type='fabs'), "business_types_description": row['business_types_desc'], } legal_entity = load_data_into_model(legal_entity, row, value_map=legal_entity_value_map, save=True) # Create the place of performance location pop_location = create_location(place_of_performance_field_map, row, {"place_of_performance_flag": True}) # Find the toptier awards from the subtier awards awarding_agency = Agency.get_by_subtier_only(row["awarding_sub_tier_agency_c"]) funding_agency = Agency.get_by_subtier_only(row["funding_sub_tier_agency_co"]) # Generate the unique Award ID # "ASST_AW_" + awarding_sub_tier_agency_c + fain + uri # this will raise an exception if the cast to an int fails, that's ok since we don't want to process # non-numeric record type values record_type_int = int(row['record_type']) if record_type_int == 1: uri = row['uri'] if row['uri'] else '-NONE-' fain = '-NONE-' elif record_type_int in (2, 3): uri = '-NONE-' fain = row['fain'] if row['fain'] else '-NONE-' else: msg = "Invalid record type encountered for the following afa_generated_unique record: {}" raise Exception(msg.format(row['afa_generated_unique'])) astac = row["awarding_sub_tier_agency_c"] if row["awarding_sub_tier_agency_c"] else "-NONE-" generated_unique_id = "ASST_AW_{}_{}_{}".format(astac, fain, uri) # Create the summary Award (created, award) = Award.get_or_create_summary_award( 
generated_unique_award_id=generated_unique_id, fain=row['fain'], uri=row['uri'], record_type=row['record_type'], ) award.save() # Append row to list of Awards updated update_award_ids.append(award.id) try: last_mod_date = datetime.strptime(str(row['modified_at']), "%Y-%m-%d %H:%M:%S.%f").date() except ValueError: last_mod_date = datetime.strptime(str(row['modified_at']), "%Y-%m-%d %H:%M:%S").date() parent_txn_value_map = { "award": award, "awarding_agency": awarding_agency, "funding_agency": funding_agency, "recipient": legal_entity, "place_of_performance": pop_location, "period_of_performance_start_date": format_date(row['period_of_performance_star']), "period_of_performance_current_end_date": format_date(row['period_of_performance_curr']), "action_date": format_date(row['action_date']), "last_modified_date": last_mod_date, "type_description": row['assistance_type_desc'], "transaction_unique_id": row['afa_generated_unique'], "generated_unique_award_id": generated_unique_id, } fad_field_map = { "type": "assistance_type", "description": "award_description", "funding_amount": "total_funding_amount", } transaction_normalized_dict = load_data_into_model( TransactionNormalized(), # thrown away row, field_map=fad_field_map, value_map=parent_txn_value_map, as_dict=True, ) financial_assistance_data = load_data_into_model(TransactionFABS(), row, as_dict=True) # thrown away # Hack to cut back on the number of warnings dumped to the log. financial_assistance_data['updated_at'] = cast_datetime_to_utc(financial_assistance_data['updated_at']) financial_assistance_data['created_at'] = cast_datetime_to_utc(financial_assistance_data['created_at']) financial_assistance_data['modified_at'] = cast_datetime_to_utc(financial_assistance_data['modified_at']) afa_generated_unique = financial_assistance_data['afa_generated_unique'] unique_fabs = TransactionFABS.objects.filter(afa_generated_unique=afa_generated_unique) if unique_fabs.first(): transaction_normalized_dict["update_date"] = datetime.now(timezone.utc) transaction_normalized_dict["fiscal_year"] = fy(transaction_normalized_dict["action_date"]) # Update TransactionNormalized TransactionNormalized.objects.filter(id=unique_fabs.first().transaction.id).update( **transaction_normalized_dict ) # Update TransactionFABS unique_fabs.update(**financial_assistance_data) else: # Create TransactionNormalized transaction_normalized = TransactionNormalized(**transaction_normalized_dict) transaction_normalized.save() # Create TransactionFABS transaction_fabs = TransactionFABS(transaction=transaction_normalized, **financial_assistance_data) transaction_fabs.save() # Update legal entity to map back to transaction legal_entity.transaction_unique_id = afa_generated_unique legal_entity.save() return update_award_ids
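# NOTE: a compact restatement of the FABS unique-award-key rule above: record
# type 1 keys on URI, record types 2 and 3 key on FAIN, anything else is
# rejected, and every missing part is rendered as '-NONE-'. The helper name is
# hypothetical.
def generate_fabs_unique_award_id(row):
    record_type = int(row['record_type'])  # non-numeric record types fail loudly, by design
    if record_type == 1:
        fain, uri = '-NONE-', row['uri'] or '-NONE-'
    elif record_type in (2, 3):
        fain, uri = row['fain'] or '-NONE-', '-NONE-'
    else:
        raise Exception('Invalid record type for record: {}'.format(row['afa_generated_unique']))
    astac = row['awarding_sub_tier_agency_c'] or '-NONE-'
    return 'ASST_AW_{}_{}_{}'.format(astac, fain, uri)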
def csv_tas_loader(self, file_path): field_map = { "treasury_account_identifier": "ACCT_NUM", "allocation_transfer_agency_id": "ATA", "agency_id": "AID", "beginning_period_of_availability": "BPOA", "ending_period_of_availability": "EPOA", "availability_type_code": "A", "main_account_code": "MAIN", "sub_account_code": "SUB", "account_title": "GWA_TAS_NAME", "reporting_agency_id": "Agency AID", "reporting_agency_name": "Agency Name", "budget_bureau_code": "ADMIN_ORG", "budget_bureau_name": "Admin Org Name", "fr_entity_code": "FR Entity Type", "fr_entity_description": "FR Entity Description", "budget_function_code": "Function Code", "budget_function_title": "Function Description", "budget_subfunction_code": "Sub Function Code", "budget_subfunction_title": "Sub Function Description", } value_map = { "data_source": "USA", "tas_rendering_label": self.generate_tas_rendering_label, "awarding_toptier_agency": None, "funding_toptier_agency": None, "internal_start_date": lambda row: datetime.strftime( datetime.strptime(row["DT_TM_ESTAB"], "%m/%d/%Y %H:%M:%S"), "%Y-%m-%d" ), "internal_end_date": lambda row: datetime.strftime( datetime.strptime(row["DT_END"], "%m/%d/%Y %H:%M:%S"), "%Y-%m-%d" ) if row["DT_END"] else None, } with RetrieveFileFromUri(file_path).get_file_object(True) as tas_list_file_object: # Get a total count for print out tas_list_reader = csv.DictReader(tas_list_file_object) total_count = len(list(tas_list_reader)) # Reset the reader back to the beginning of the file tas_list_file_object.seek(0) tas_list_reader = csv.DictReader(tas_list_file_object) for count, row in enumerate(tas_list_reader, 1): for key, value in row.items(): row[key] = value.strip() or None # Check to see if we need to update or create a TreasuryAppropriationAccount record current_record = TreasuryAppropriationAccount.objects.filter( treasury_account_identifier=row["ACCT_NUM"] ).first() taa_instance = current_record or TreasuryAppropriationAccount() # Don't load Financing TAS if row["financial_indicator_type2"] == "F": if taa_instance.treasury_account_identifier: taa_instance.delete() logger.info(" Row contains Financing TAS, Skipping...") continue load_data_into_model(taa_instance, row, field_map=field_map, value_map=value_map, save=True) if count % 1000 == 0: logger.info(" Loaded {} rows of {}".format(count, total_count))
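# NOTE: counting rows with len(list(csv.DictReader(...))) exhausts the
# underlying stream, which is why the loader above seeks back to zero and
# constructs a second DictReader before the real pass. The pattern in
# isolation:
import csv
import io

def count_then_iterate(file_object):
    total = len(list(csv.DictReader(file_object)))  # first pass: count only
    file_object.seek(0)                             # rewind the raw stream
    return total, csv.DictReader(file_object)       # fresh reader re-consumes the header row

total, reader = count_then_iterate(io.StringIO("A,B\n1,2\n3,4\n"))
assert total == 2 and [r['A'] for r in reader] == ['1', '3']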
def insert_new_fpds(self, to_insert, total_rows): place_of_performance_field_map = { "location_country_code": "place_of_perform_country_c", "country_name": "place_of_perf_country_desc", "state_code": "place_of_performance_state", "state_name": "place_of_perfor_state_desc", "city_name": "place_of_perform_city_name", "county_name": "place_of_perform_county_na", "county_code": "place_of_perform_county_co", "zip_4a": "place_of_performance_zip4a", "congressional_code": "place_of_performance_congr", "zip_last4": "place_of_perform_zip_last4", "zip5": "place_of_performance_zip5", } legal_entity_location_field_map = { "location_country_code": "legal_entity_country_code", "country_name": "legal_entity_country_name", "state_code": "legal_entity_state_code", "state_name": "legal_entity_state_descrip", "city_name": "legal_entity_city_name", "county_name": "legal_entity_county_name", "county_code": "legal_entity_county_code", "address_line1": "legal_entity_address_line1", "address_line2": "legal_entity_address_line2", "address_line3": "legal_entity_address_line3", "zip4": "legal_entity_zip4", "congressional_code": "legal_entity_congressional", "zip_last4": "legal_entity_zip_last4", "zip5": "legal_entity_zip5", } for index, row in enumerate(to_insert, 1): upper_case_dict_values(row) # Create new LegalEntityLocation and LegalEntity from the row data legal_entity_location = create_location( legal_entity_location_field_map, row, { "recipient_flag": True, "is_fpds": True }) recipient_name = row["awardee_or_recipient_legal"] legal_entity = LegalEntity.objects.create( recipient_unique_id=row["awardee_or_recipient_uniqu"], recipient_name=recipient_name if recipient_name is not None else "", ) legal_entity_value_map = { "location": legal_entity_location, "business_categories": get_business_categories(row=row, data_type="fpds"), "is_fpds": True, } set_legal_entity_boolean_fields(row) legal_entity = load_data_into_model( legal_entity, row, value_map=legal_entity_value_map, save=True) # Create the place of performance location pop_location = create_location(place_of_performance_field_map, row, {"place_of_performance_flag": True}) # Find the toptier awards from the subtier awards awarding_agency = Agency.get_by_subtier_only( row["awarding_sub_tier_agency_c"]) funding_agency = Agency.get_by_subtier_only( row["funding_sub_tier_agency_co"]) # Generate the unique Award ID # "CONT_AW_" + agency_id + referenced_idv_agency_iden + piid + parent_award_id generated_unique_id = ( "CONT_AW_" + (row["agency_id"] if row["agency_id"] else "-NONE-") + "_" + (row["referenced_idv_agency_iden"] if row["referenced_idv_agency_iden"] else "-NONE-") + "_" + (row["piid"] if row["piid"] else "-NONE-") + "_" + (row["parent_award_id"] if row["parent_award_id"] else "-NONE-")) # Create the summary Award (created, award) = Award.get_or_create_summary_award( generated_unique_award_id=generated_unique_id, piid=row["piid"]) award.parent_award_piid = row.get("parent_award_id") award.save() # Append row to list of Awards updated AWARD_UPDATE_ID_LIST.append(award.id) try: last_mod_date = datetime.strptime(str( row["last_modified"]), "%Y-%m-%d %H:%M:%S.%f").date() except ValueError: last_mod_date = datetime.strptime(str(row["last_modified"]), "%Y-%m-%d %H:%M:%S").date() parent_txn_value_map = { "award": award, "awarding_agency": awarding_agency, "funding_agency": funding_agency, "recipient": legal_entity, "place_of_performance": pop_location, "period_of_performance_start_date": format_date(row["period_of_performance_star"]), 
"period_of_performance_current_end_date": format_date(row["period_of_performance_curr"]), "action_date": format_date(row["action_date"]), "last_modified_date": last_mod_date, "transaction_unique_id": row["detached_award_proc_unique"], "generated_unique_award_id": generated_unique_id, "is_fpds": True, } contract_field_map = { "type": "contract_award_type", "type_description": "contract_award_type_desc", "description": "award_description", } transaction_normalized_dict = load_data_into_model( TransactionNormalized(), # thrown away row, field_map=contract_field_map, value_map=parent_txn_value_map, as_dict=True, ) contract_instance = load_data_into_model( TransactionFPDS(), row, as_dict=True) # thrown away detached_award_proc_unique = contract_instance[ "detached_award_proc_unique"] unique_fpds = TransactionFPDS.objects.filter( detached_award_proc_unique=detached_award_proc_unique) if unique_fpds.first(): transaction_normalized_dict["update_date"] = datetime.now( timezone.utc) transaction_normalized_dict["fiscal_year"] = fy( transaction_normalized_dict["action_date"]) # update TransactionNormalized TransactionNormalized.objects.filter( id=unique_fpds.first().transaction.id).update( **transaction_normalized_dict) # update TransactionFPDS unique_fpds.update(**contract_instance) else: # create TransactionNormalized transaction = TransactionNormalized( **transaction_normalized_dict) transaction.save() # create TransactionFPDS transaction_fpds = TransactionFPDS(transaction=transaction, **contract_instance) transaction_fpds.save() # Update legal entity to map back to transaction legal_entity.transaction_unique_id = detached_award_proc_unique legal_entity.save()
def load_file_c(submission_attributes, db_cursor, award_financial_frame): """ Process and load file C broker data. Note: this should run AFTER the D1 and D2 files are loaded because we try to join to those records to retrieve some additional information about the awarding sub-tier agency. """ # this matches the file b reverse directive, but am repeating it here to ensure that we don't overwrite it as we # change up the order of file loading if not award_financial_frame.size: logger.warning("No File C (award financial) data found, skipping...") return reverse = re.compile(r"(_(cpe|fyb)$)|^transaction_obligated_amount$") # dictionary to capture TAS that were skipped and some metadata # tas = top-level key # count = number of rows skipped # rows = row numbers skipped, corresponding to the original row numbers in the file that was submitted skipped_tas = {} award_financial_frame["object_class"] = award_financial_frame.apply( get_object_class_row, axis=1) award_financial_frame["program_activity"] = award_financial_frame.apply( get_or_create_program_activity, axis=1, submission_attributes=submission_attributes) total_rows = award_financial_frame.shape[0] start_time = datetime.now() awards_touched = [] # format award_financial_frame award_financial_frame = award_financial_frame.replace({np.nan: None}) for index, row in enumerate( award_financial_frame.to_dict(orient="records"), 1): if not (index % 100): logger.info("C File Load: Loading row {} of {} ({})".format( str(index), str(total_rows), datetime.now() - start_time)) upper_case_dict_values(row) # Check and see if there is an entry for this TAS treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup( row.get("tas_id"), db_cursor) if treasury_account is None: update_skipped_tas(row, tas_rendering_label, skipped_tas) continue # Find a matching transaction record, so we can use its subtier agency information to match to (or create) an # Award record. # Find the award that this award transaction belongs to. If it doesn't exist, create it. filters = {} if row.get("piid"): filters["piid"] = row.get("piid") filters["parent_piid"] = row.get("parent_award_id") else: if row.get("fain") and not row.get("uri"): filters["fain"] = row.get("fain") elif row.get("uri") and not row.get("fain"): filters["uri"] = row.get("uri") else: filters["fain"] = row.get("fain") filters["uri"] = row.get("uri") award = find_matching_award(**filters) if award: awards_touched += [award] award_financial_data = FinancialAccountsByAwards() value_map_faba = { "award": award, "submission": submission_attributes, "reporting_period_start": submission_attributes.reporting_period_start, "reporting_period_end": submission_attributes.reporting_period_end, "treasury_account": treasury_account, "object_class": row.get("object_class"), "program_activity": row.get("program_activity"), } # Still using the cpe|fyb regex compiled above for reverse load_data_into_model(award_financial_data, row, value_map=value_map_faba, save=True, reverse=reverse) for key in skipped_tas: logger.info("Skipped %d rows due to missing TAS: %s", skipped_tas[key]["count"], key) total_tas_skipped = 0 for key in skipped_tas: total_tas_skipped += skipped_tas[key]["count"] logger.info( "Skipped a total of {} TAS rows for File C".format(total_tas_skipped)) return [award.id for award in awards_touched if award]
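# NOTE: update_skipped_tas() is defined elsewhere. Based on the skipped_tas
# comments above (per-TAS count plus the original submitted row numbers), a
# plausible sketch is the following; the 'row_number' column name is an
# assumption.
def update_skipped_tas_sketch(row, tas_rendering_label, skipped_tas):
    entry = skipped_tas.setdefault(tas_rendering_label, {'count': 0, 'rows': []})
    entry['count'] += 1
    entry['rows'].append(row.get('row_number'))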
def load_file_b(submission_attributes, prg_act_obj_cls_data, db_cursor): """ Process and load file B broker data (aka TAS balances by program activity and object class). """ reverse = re.compile(r"(_(cpe|fyb)$)|^transaction_obligated_amount$") # dictionary to capture TAS that were skipped and some metadata # tas = top-level key # count = number of rows skipped # rows = row numbers skipped, corresponding to the original row numbers in the file that was submitted skipped_tas = {} bulk_treasury_appropriation_account_tas_lookup(prg_act_obj_cls_data, db_cursor) save_manager = BulkCreateManager( FinancialAccountsByProgramActivityObjectClass) for row in prg_act_obj_cls_data: # Check and see if there is an entry for this TAS treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup( row.get("tas_id")) if treasury_account is None: update_skipped_tas(row, tas_rendering_label, skipped_tas) continue # get the corresponding account balances row (aka "File A" record) account_balances = AppropriationAccountBalances.objects.get( treasury_account_identifier=treasury_account, submission_id=submission_attributes.submission_id) financial_by_prg_act_obj_cls = FinancialAccountsByProgramActivityObjectClass( ) value_map = { "submission": submission_attributes, "reporting_period_start": submission_attributes.reporting_period_start, "reporting_period_end": submission_attributes.reporting_period_end, "treasury_account": treasury_account, "appropriation_account_balances": account_balances, "object_class": get_object_class(row["object_class"], row["by_direct_reimbursable_fun"]), "program_activity": get_program_activity(row, submission_attributes), "disaster_emergency_fund": get_disaster_emergency_fund(row), } save_manager.append( load_data_into_model(financial_by_prg_act_obj_cls, row, value_map=value_map, save=False, reverse=reverse)) save_manager.save_stragglers() for key in skipped_tas: logger.info( f"Skipped {skipped_tas[key]['count']:,} rows due to missing TAS: {key}" ) total_tas_skipped = 0 for key in skipped_tas: total_tas_skipped += skipped_tas[key]["count"] logger.info( f"Skipped a total of {total_tas_skipped:,} TAS rows for File B")
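# NOTE: BulkCreateManager's implementation is not shown in this file. A minimal
# sketch consistent with the append()/save_stragglers() calls above: buffer
# instances, flush via bulk_create() whenever the buffer fills, then flush the
# remainder (the "stragglers") once at the end. The flush threshold is an
# assumption.
class BulkCreateManagerSketch:
    def __init__(self, model_cls, threshold=10000):
        self.model_cls = model_cls
        self.threshold = threshold
        self._buffer = []

    def append(self, instance):
        self._buffer.append(instance)
        if len(self._buffer) >= self.threshold:
            self._flush()

    def save_stragglers(self):
        self._flush()

    def _flush(self):
        if self._buffer:
            self.model_cls.objects.bulk_create(self._buffer)
            self._buffer = []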
def get_submission_attributes(broker_submission_id, submission_data):
    """
    For a specified broker submission, return the existing corresponding usaspending
    submission record or create and return a new one.
    """
    # check if we already have an entry for this broker submission id; if not, create one
    submission_attributes, created = SubmissionAttributes.objects.get_or_create(
        broker_submission_id=broker_submission_id)

    if created:
        # this is the first time we're loading this broker submission
        logger.info('Creating broker submission id {}'.format(broker_submission_id))
    else:
        # we've already loaded this broker submission, so delete it before reloading.
        # if another submission references this one as its "previous submission", do not proceed.
        # TODO: now that we're chaining submissions together, get clarification on
        # what should happen when a submission in the middle of the chain is deleted
        downstream_submission = SubmissionAttributes.objects.filter(
            previous_submission=submission_attributes).first()
        if downstream_submission is not None:
            message = (
                'Broker submission {} (API submission id = {}) has a downstream submission (id={}) and '
                'cannot be deleted'.format(
                    broker_submission_id,
                    submission_attributes.submission_id,
                    downstream_submission.submission_id))
            raise ValueError(message)
        logger.info('Broker submission id {} already exists. It will be deleted.'.format(broker_submission_id))
        call_command('rm_submission', broker_submission_id)

    # Find the previous submission for this CGAC and fiscal year (if there is one)
    previous_submission = get_previous_submission(
        submission_data['cgac_code'],
        submission_data['reporting_fiscal_year'],
        submission_data['reporting_fiscal_period'])

    # Update and save submission attributes
    field_map = {
        'reporting_period_start': 'reporting_start_date',
        'reporting_period_end': 'reporting_end_date',
        'quarter_format_flag': 'is_quarter_format',
    }

    # Create our value map - specific data to load
    value_map = {
        'broker_submission_id': broker_submission_id,
        'reporting_fiscal_quarter': get_fiscal_quarter(submission_data['reporting_fiscal_period']),
        'previous_submission': previous_submission,
        # pull in broker's last update date to use as certified date
        'certified_date': submission_data['updated_at'].date()
        if isinstance(submission_data['updated_at'], datetime) else None,
    }

    return load_data_into_model(submission_attributes, submission_data,
                                field_map=field_map, value_map=value_map, save=True)
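# NOTE: get_fiscal_quarter() maps a broker reporting period (fiscal period
# 1-12) onto its fiscal quarter. A minimal sketch of the arithmetic; the real
# helper may validate its input more strictly.
def fiscal_quarter_of(fiscal_period):
    return (int(fiscal_period) + 2) // 3  # periods 1-3 -> Q1, ..., 10-12 -> Q4

assert [fiscal_quarter_of(p) for p in (1, 3, 4, 9, 12)] == [1, 1, 2, 3, 4]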
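# NOTE: a hypothetical driver for the paginated Broker queries used by
# update_transaction_contract and update_transaction_assistance above: fetch
# one LIMIT/OFFSET window at a time until an empty page signals the end.
# Ordering by the primary key keeps page boundaries stable, though OFFSET scans
# slow down as the page number grows (keyset pagination on
# detached_award_procurement_id would avoid that).
def iter_broker_pages(db_cursor, fiscal_year, limit=500000):
    page = 1
    while True:
        db_cursor.execute(
            "SELECT * FROM detached_award_procurement"
            " WHERE action_date::Date BETWEEN %s AND %s"
            " ORDER BY detached_award_procurement_id LIMIT %s OFFSET %s",
            ['10/01/' + str(fiscal_year - 1), '09/30/' + str(fiscal_year),
             limit, (page - 1) * limit])
        rows = dictfetchall(db_cursor)
        if not rows:
            break
        yield rows
        page += 1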