def update_state_congr_table_census(census_file, sess):
    """ Update contents of state_congressional table to include districts from the census

        Args:
            census_file: file path/url to the census file to read
            sess: the database connection
    """
    logger.info("Adding congressional districts from census to the state_congressional table")

    data = pd.read_csv(census_file, dtype=str)
    model = StateCongressional

    data = clean_data(
        data,
        model,
        {"state_code": "state_code",
         "congressional_district_no": "congressional_district_no",
         "census_year": "census_year"},
        {'congressional_district_no': {"pad_to_length": 2}}
    )

    table_name = model.__table__.name
    insert_dataframe(data, table_name, sess.connection())
    sess.commit()
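
# `insert_dataframe` is used throughout this module but defined elsewhere. A minimal
# sketch of what it presumably does, assuming it simply wraps pandas' DataFrame.to_sql
# and reports the number of rows appended (illustration only, not the actual helper):
def insert_dataframe_sketch(df, table, connection):
    """ Append the dataframe's rows to the named table, returning the row count. """
    df.to_sql(table, connection, index=False, if_exists='append')
    return len(df.index)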
def create_temp_exec_comp_table(sess, table_name, data):
    """ Creates a temporary executive compensation table with the given name and data.

        Args:
            sess: database connection
            table_name: what to name the table being created
            data: pandas dataframe representing exec comp data
    """
    logger.info('Making {} table'.format(table_name))
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS {} (
            awardee_or_recipient_uniqu TEXT,
            high_comp_officer1_amount TEXT,
            high_comp_officer1_full_na TEXT,
            high_comp_officer2_amount TEXT,
            high_comp_officer2_full_na TEXT,
            high_comp_officer3_amount TEXT,
            high_comp_officer3_full_na TEXT,
            high_comp_officer4_amount TEXT,
            high_comp_officer4_full_na TEXT,
            high_comp_officer5_amount TEXT,
            high_comp_officer5_full_na TEXT,
            last_exec_comp_mod_date DATE
        );
    """.format(table_name)
    sess.execute(create_table_sql)
    # Truncating in case we didn't clear out this table after a failure in the script
    sess.execute('TRUNCATE TABLE {};'.format(table_name))
    insert_dataframe(data, table_name, sess.connection())
def create_temp_sam_recipient_table(sess, table_name, data):
    """ Creates a temporary SAM table with the given name and data.

        Args:
            sess: database connection
            table_name: what to name the table being created
            data: pandas dataframe representing SAM data
    """
    logger.info('Making {} table'.format(table_name))
    column_types = {
        'created_at': 'TIMESTAMP WITHOUT TIME ZONE',
        'updated_at': 'TIMESTAMP WITHOUT TIME ZONE',
        'uei': 'TEXT',
        'awardee_or_recipient_uniqu': 'TEXT',
        'activation_date': 'DATE',
        'expiration_date': 'DATE',
        'deactivation_date': 'DATE',
        'registration_date': 'DATE',
        'last_sam_mod_date': 'DATE',
        'legal_business_name': 'TEXT',
        'dba_name': 'TEXT',
        'ultimate_parent_uei': 'TEXT',
        'ultimate_parent_unique_ide': 'TEXT',
        'ultimate_parent_legal_enti': 'TEXT',
        'address_line_1': 'TEXT',
        'address_line_2': 'TEXT',
        'city': 'TEXT',
        'state': 'TEXT',
        'zip': 'TEXT',
        'zip4': 'TEXT',
        'country_code': 'TEXT',
        'congressional_district': 'TEXT',
        'business_types_codes': 'TEXT[]',
        'business_types': 'TEXT[]',
        'entity_structure': 'TEXT',
        'high_comp_officer1_amount': 'TEXT',
        'high_comp_officer1_full_na': 'TEXT',
        'high_comp_officer2_amount': 'TEXT',
        'high_comp_officer2_full_na': 'TEXT',
        'high_comp_officer3_amount': 'TEXT',
        'high_comp_officer3_full_na': 'TEXT',
        'high_comp_officer4_amount': 'TEXT',
        'high_comp_officer4_full_na': 'TEXT',
        'high_comp_officer5_amount': 'TEXT',
        'high_comp_officer5_full_na': 'TEXT',
        'last_exec_comp_mod_date': 'DATE'
    }
    columns = ', '.join(['{} {}'.format(column_name, column_type)
                         for column_name, column_type in column_types.items()
                         if column_name in list(data.columns)])
    create_table_sql = 'CREATE TABLE IF NOT EXISTS {} ({});'.format(table_name, columns)
    sess.execute(create_table_sql)
    # Truncating in case we didn't clear out this table after a failure in the script
    sess.execute('TRUNCATE TABLE {};'.format(table_name))
    insert_dataframe(data, table_name, sess.connection())
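
# For context, a hedged sketch of how the temp tables above are typically used: stage
# the data, run set-based SQL against the permanent table, then drop. The table name,
# dataframe, and SQL placeholder below are hypothetical.
def temp_table_usage_example(sess, sam_data):
    temp_table = 'temp_sam_recipient_update'  # hypothetical name
    create_temp_sam_recipient_table(sess, temp_table, sam_data)
    # ... set-based INSERT/UPDATE statements joining the permanent table to
    # temp_sam_recipient_update would run here ...
    sess.execute('DROP TABLE {};'.format(temp_table))
    sess.commit()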
def run_duns_batches(file, sess, client, block_size=10000):
    """ Updates DUNS table in chunks from csv file

        Args:
            file: path to the DUNS export file to use
            sess: the database connection
            client: the connection to the SAM service
            block_size: the size of the batches to read from the DUNS export file.
    """
    logger.info("Retrieving total rows from duns file")
    start = datetime.now()
    duns_reader_obj = pd.read_csv(file, skipinitialspace=True, header=None, quotechar='"', dtype=str,
                                  names=column_headers, iterator=True, chunksize=block_size, skiprows=1)
    duns_dfs = [duns_df for duns_df in duns_reader_obj]
    row_count = sum([len(duns_df.index) for duns_df in duns_dfs])
    logger.info("Retrieved row count of {} in {} s".format(row_count, (datetime.now() - start).total_seconds()))

    duns_added = 0
    for duns_df in duns_dfs:
        # Remove rows where awardee_or_recipient_uniqu is null
        duns_df = duns_df[duns_df['awardee_or_recipient_uniqu'].notnull()]
        # Ignore old DUNS we already have
        duns_to_load = remove_existing_duns(duns_df, sess)

        if not duns_to_load.empty:
            logger.info("Adding {} DUNS records from historic data".format(len(duns_to_load.index)))
            start = datetime.now()

            # get address info for incoming duns
            duns_to_load = update_duns_props(duns_to_load, client)
            duns_to_load = clean_data(duns_to_load, HistoricDUNS, column_mappings, {})
            duns_added += len(duns_to_load.index)

            insert_dataframe(duns_to_load, HistoricDUNS.__table__.name, sess.connection())
            sess.commit()

            logger.info("Finished updating {} DUNS rows in {} s".format(
                len(duns_to_load.index), (datetime.now() - start).total_seconds()))

    logger.info("Imported {} historical duns".format(duns_added))
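
# A minimal sketch of the `remove_existing_duns` helper used above, assuming it just
# filters out DUNS numbers already present in the historic_duns table (illustration
# only; the real helper may differ):
def remove_existing_duns_sketch(duns_df, sess):
    existing = {row.awardee_or_recipient_uniqu
                for row in sess.query(HistoricDUNS.awardee_or_recipient_uniqu)}
    return duns_df[~duns_df['awardee_or_recipient_uniqu'].isin(existing)]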
def parse_sam_file(file, sess):
    """ Parse a SAM extract file and add, update, or delete executive compensation records accordingly.

        Args:
            file: the SAM extract zip file to parse
            sess: the database connection
    """
    logger.info("starting file " + str(file.name))

    csv_file = os.path.splitext(os.path.basename(file.name))[0] + '.dat'
    zfile = zipfile.ZipFile(file.name)
    # can't use skipfooter; pandas' c engine doesn't work with skipfooter and the python engine doesn't work
    # with dtype
    nrows = 0
    with zfile.open(csv_file) as f:
        nrows = len(f.readlines()) - 2  # subtract the header and footer

    column_header_mapping = {
        "awardee_or_recipient_uniqu": 0,
        "sam_extract": 4,
        "expiration_date": 7,
        "activation_date": 9,
        "ultimate_parent_legal_enti": 10,
        "ultimate_parent_unique_ide": 48,
        "exec_comp_str": 89
    }
    column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))
    csv_data = pd.read_csv(zfile.open(csv_file), dtype=str, header=None, skiprows=1, nrows=nrows, sep='|',
                           usecols=column_header_mapping_ordered.values(),
                           names=column_header_mapping_ordered.keys())
    total_data = csv_data.copy()

    # skipping when sam_extract == '4' as it's expired
    total_data = total_data[total_data.sam_extract != '4']

    # parse out executive compensation from column 90
    lambda_func = (lambda ecs: pd.Series(list(parse_exec_comp(ecs).values())))
    parsed_data = total_data["exec_comp_str"].apply(lambda_func)
    parsed_data.columns = list(parse_exec_comp().keys())
    del total_data["exec_comp_str"]
    total_data = total_data.join(parsed_data)

    # split into 3 dataframes based on the sam_extract code ('1', '2', '3')
    delete_data = total_data[total_data.sam_extract == '1'].replace(np.nan, "", regex=True)
    add_data = total_data[total_data.sam_extract == '2'].replace(np.nan, "", regex=True)
    update_data = total_data[total_data.sam_extract == '3'].replace(np.nan, "", regex=True)
    for dataframe in [add_data, update_data, delete_data, total_data]:
        del dataframe["sam_extract"]

    table_name = ExecutiveCompensation.__table__.name
    insert_dataframe(add_data, table_name, sess.connection())

    for _, row in update_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            update(row, synchronize_session=False)

    for _, row in delete_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            delete(synchronize_session=False)

    sess.commit()
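
# A hedged sketch of `parse_exec_comp`, which the code above relies on in two ways:
# given a raw string it returns a dict of the ten officer name/amount fields, and
# called with no argument it returns the same dict with empty values (used to recover
# the column names). The '~' and '^' delimiters are assumptions for illustration.
def parse_exec_comp_sketch(exec_comp_str=None):
    exec_comp = {}
    for index in range(1, 6):
        exec_comp['high_comp_officer{}_full_na'.format(index)] = np.nan
        exec_comp['high_comp_officer{}_amount'.format(index)] = np.nan
    if isinstance(exec_comp_str, str):
        for index, entry in enumerate(exec_comp_str.split('~')[:5], start=1):
            parts = entry.split('^')  # assumed name^title^amount layout
            exec_comp['high_comp_officer{}_full_na'.format(index)] = parts[0]
            exec_comp['high_comp_officer{}_amount'.format(index)] = parts[-1]
    return exec_comp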
def load_zip_city_data(force_reload):
    """ Load data into the ZipCity table

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        citystate_file = s3_client.generate_presigned_url('get_object',
                                                          {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                           'Key': "ctystate.txt"}, ExpiresIn=600)
        zip_city_file = urllib.request.urlopen(citystate_file)
    else:
        citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
        zip_city_file = open(citystate_file)

    new_data = parse_zip_city_file(zip_city_file)

    diff_found = check_dataframe_diff(new_data, ZipCity, ['zip_city_id'], ['zip_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info('Differences found or reload forced, reloading zip_city table.')
        # delete any data in the ZipCity table
        sess.query(ZipCity).delete()

        # insert data into table
        num = insert_dataframe(new_data, ZipCity.__table__.name, sess.connection())
        logger.info('{} records inserted to zip_city'.format(num))
        sess.commit()
    else:
        logger.info('No differences found, skipping zip_city table reload.')
def load_state_data(force_reload):
    """ Load data into the States table

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    start_time = datetime.now()
    state_file_url = '{}/state_list.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
    with RetrieveFileFromUri(state_file_url, 'r').get_file_object() as state_file:
        new_data = parse_state_file(state_file)

    diff_found = check_dataframe_diff(new_data, States, ['states_id'], ['state_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info('Differences found or reload forced, reloading states table.')
        # delete any data in the States table
        sess.query(States).delete()

        # insert data into table
        num = insert_dataframe(new_data, States.__table__.name, sess.connection())
        logger.info('{} records inserted to states'.format(num))
        sess.commit()
        update_external_data_load_date(start_time, datetime.now(), 'state_code')
    else:
        logger.info('No differences found, skipping states table reload.')
def parse_county_file(county_file, sess):
    """ Parse the County file and insert all relevant rows into the database.

        Args:
            county_file: path/url to file to gather County data from
            sess: database session
    """
    # read the data and clean up the column names
    data = pd.read_csv(county_file, dtype=str, sep="|")
    data = clean_data(
        data,
        {"COUNTY_NUMERIC": "county_number",
         "COUNTY_NAME": "county_name",
         "STATE_ALPHA": "state_code"})

    # remove all blank county_number rows. Not much use in a county number table
    data = data[pd.notnull(data['county_number'])]

    # remove duplicates because we have no use for them (there may be none, this is a precaution)
    data = data[~data.duplicated(subset=['county_number', 'state_code'], keep='first')]

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # insert data into table
    num = insert_dataframe(data, CountyCode.__table__.name, sess.connection())
    logger.info('{} records inserted to county_code'.format(num))
    sess.commit()
def load_county_data(county_file, force_reload):
    """ Load data into the CountyCode table

        Args:
            county_file: path/url to file to gather County data from
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    new_data = parse_county_file(county_file)

    diff_found = check_dataframe_diff(new_data, CountyCode, 'county_code_id', ['county_number', 'state_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info('Differences found or reload forced, reloading county_code table.')
        # delete any data in the CountyCode table
        sess.query(CountyCode).delete()

        # insert data into table
        num = insert_dataframe(new_data, CountyCode.__table__.name, sess.connection())
        logger.info('{} records inserted to county_code'.format(num))
        sess.commit()
    else:
        logger.info('No differences found, skipping county_code table reload.')
def load_country_codes(base_path):
    """ Load Country Codes into the database.

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_country_codes.py',
        'start_time': str(now),
        'records_deleted': 0,
        'records_provided': 0,
        'duplicates_dropped': 0,
        'records_inserted': 0
    }

    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                   'Key': "country_codes.csv"}, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "country_codes.csv")

    logger.info('Loading country codes file: country_codes.csv')

    with create_app().app_context():
        sess = GlobalDB.db().session
        # for country codes, delete and replace values
        metrics_json['records_deleted'] = sess.query(CountryCode).delete()

        data = pd.read_csv(filename, dtype=str)
        metrics_json['records_provided'] = len(data.index)
        data = clean_data(
            data,
            CountryCode,
            {"country_code": "country_code",
             "country_name": "country_name"},
            {}
        )
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_provided'] - len(data.index)

        # flag territories or freely associated states
        data["territory_free_state"] = np.where(data["country_code"].isin(TERRITORIES_FREE_STATES), True, False)

        # insert to db
        table_name = CountryCode.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        metrics_json['records_inserted'] = num
        sess.commit()
        logger.info('{} records inserted to {}'.format(num, table_name))

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_country_codes_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)

    logger.info("Script complete")
def load_zip_city_data(zip_city_file, force_reload):
    """ Load data into the ZipCity table

        Args:
            zip_city_file: path/url to file to gather ZipCity data from
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    new_data = parse_zip_city_file(zip_city_file)

    diff_found = check_dataframe_diff(new_data, ZipCity, 'zip_city_id', ['zip_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info('Differences found or reload forced, reloading zip_city table.')
        # delete any data in the ZipCity table
        sess.query(ZipCity).delete()

        # insert data into table
        num = insert_dataframe(new_data, ZipCity.__table__.name, sess.connection())
        logger.info('{} records inserted to zip_city'.format(num))
        sess.commit()
    else:
        logger.info('No differences found, skipping zip_city table reload.')
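
# A simplified sketch of `check_dataframe_diff`, assuming it follows the same recipe
# the CFDA loader below spells out by hand: neutralize the primary key and audit
# columns on both sides, sort rows and columns, then compare. Illustration only.
def check_dataframe_diff_sketch(new_data, model, pk_cols, sort_cols):
    sess = GlobalDB.db().session
    current_data = pd.read_sql_table(model.__table__.name, sess.connection(), coerce_float=False)
    now = datetime.utcnow()
    new_data = new_data.assign(created_at=now, updated_at=now, **{pk: 1 for pk in pk_cols})
    current_data = current_data.assign(created_at=now, updated_at=now, **{pk: 1 for pk in pk_cols})
    new_data = new_data.sort_values(sort_cols)[sorted(new_data.columns)].reset_index(drop=True)
    current_data = current_data.sort_values(sort_cols)[sorted(current_data.columns)].reset_index(drop=True)
    return not new_data.equals(current_data)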
def load_object_class(base_path):
    """ This function loads Object classes into the database

        Args:
            base_path: directory that contains the domain values files.
    """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                   'Key': "object_class.csv"}, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "object_class.csv")

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code",
             "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
def load_quarterly_threshold():
    """ Loads the quarterly revalidation threshold data. """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        threshold_file = s3_client.generate_presigned_url('get_object',
                                                          {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                           'Key': "quarterly_submission_dates.csv"}, ExpiresIn=600)
    else:
        threshold_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config",
                                      "quarterly_submission_dates.csv")

    logger.info('Loading quarterly revalidation threshold data')
    with create_app().app_context():
        data = pd.read_csv(threshold_file, dtype=str)

        data = clean_data(
            data,
            QuarterlyRevalidationThreshold,
            {"year": "year", "quarter": "quarter", "window_start": "window_start", "window_end": "window_end"},
            {}
        )

        sess = GlobalDB.db().session
        # delete any data in the QuarterlyRevalidationThreshold table
        sess.query(QuarterlyRevalidationThreshold).delete()

        # insert data into table
        num = insert_dataframe(data, QuarterlyRevalidationThreshold.__table__.name, sess.connection())
        logger.info('{} records inserted to quarterly_revalidation_threshold'.format(num))
        sess.commit()
def load_sf133(sess, filename, fiscal_year, fiscal_period, force_sf133_load=False, metrics=None):
    """ Load SF 133 (budget execution report) lookup table.

        Args:
            sess: connection to database
            filename: name/path of the file to read in
            fiscal_year: fiscal year of the file being loaded
            fiscal_period: fiscal period of the file being loaded
            force_sf133_load: boolean to indicate whether to force a reload of the data
            metrics: an object containing information for the metrics file
    """
    if not metrics:
        metrics = {}

    existing_records = sess.query(SF133).filter(SF133.fiscal_year == fiscal_year, SF133.period == fiscal_period)
    if force_sf133_load:
        # force a reload of this period's current data
        logger.info('Force SF 133 load: deleting existing records for %s %s', fiscal_year, fiscal_period)
        delete_count = existing_records.delete()
        logger.info('%s records deleted', delete_count)
        metrics['records_deleted'] += delete_count
    elif existing_records.count():
        # if there's existing data & we're not forcing a load, skip
        logger.info('SF133 %s %s already in database (%s records). Skipping file.', fiscal_year, fiscal_period,
                    existing_records.count())
        return

    data = clean_sf133_data(filename, SF133)

    # Now that we've added zero lines for EVERY tas and SF 133 line number, get rid of the ones we don't actually
    # use in the validations. Arguably, it would be better just to include everything, but that drastically
    # increases the number of records we're inserting to the sf_133 table. If we ever decide that we need *all*
    # SF 133 lines that are zero value, remove the next two lines.
    sf_133_validation_lines = [
        '1000', '1010', '1011', '1012', '1013', '1020', '1021', '1022', '1023', '1024', '1025', '1026', '1029',
        '1030', '1031', '1032', '1033', '1040', '1041', '1042', '1160', '1180', '1260', '1280', '1340', '1440',
        '1540', '1640', '1750', '1850', '1910', '2190', '2490', '2500', '3020', '4801', '4802', '4881', '4882',
        '4901', '4902', '4908', '4981', '4982'
    ]
    data = data[(data.line.isin(sf_133_validation_lines)) | (data.amount != 0)]

    # we didn't use the 'keep_null' option when padding allocation transfer agency, because nulls in that column
    # break the pivot (see above comments). so, replace the ata '000' with an empty value before inserting to db
    data['allocation_transfer_agency'] = data['allocation_transfer_agency'].str.replace('000', '')
    # make a pass through the dataframe, changing any empty values to None, to ensure that those are represented
    # as NULL in the db.
    data = data.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

    # Keeping display_tas out here as it depends on empty allocation_transfer_agency being None and not 000
    data['display_tas'] = data.apply(lambda row: concat_display_tas_dict(row), axis=1)

    # insert to db
    table_name = SF133.__table__.name
    num = insert_dataframe(data, table_name, sess.connection())
    metrics['records_inserted'] += num

    update_account_num(int(fiscal_year), int(fiscal_period))
    sess.commit()
    logger.info('%s records inserted to %s', num, table_name)
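
# A hedged sketch of `concat_display_tas_dict`, assumed to build the human-readable
# TAS string from its components, skipping blanks. The component names and the
# dash-joined layout are assumptions based on the standard TAS format.
def concat_display_tas_dict_sketch(row):
    availability = row['availability_type_code'] or '{}/{}'.format(
        row['beginning_period_of_availa'], row['ending_period_of_availa'])
    components = [row['allocation_transfer_agency'], row['agency_identifier'], availability,
                  row['main_account_code'], row['sub_account_code']]
    return '-'.join([component for component in components if component])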
def insert_file(filename, submission_id, file_type_id, csv_schema, long_to_short_dict):
    """ Insert the data from the file into the corresponding Certified table.

        Args:
            filename: filename to load
            submission_id: Database ID for the submission being loaded
            file_type_id: Database file type ID for files A, B, or C
            csv_schema: Schema built for this file type
            long_to_short_dict: Dict to translate long column names to the column names used by the database
    """
    sess = GlobalDB.db().session

    logger.info('Copying "{}" into {} table'.format(filename, FTI_TABLENAME_DICT[file_type_id]))

    # If this is a file in S3, download to a local temp file first, then use the temp file as the local file
    if CONFIG_BROKER['use_aws']:
        (file, tmp_filename) = tempfile.mkstemp()
        s3 = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        s3.download_file(CONFIG_BROKER['certified_bucket'], filename, tmp_filename)
        filename = tmp_filename

    with open(filename) as file:
        # Get file delimiter and reset reader to start of file
        delim = '|' if file.readline().count('|') != 0 else ','
        file.seek(0)

        # Create dataframe from file
        data = pd.read_csv(file, dtype=str, delimiter=delim)

        # Only use the columns needed for the DB table
        data = data.rename(columns=lambda x: x.lower().strip())
        data = data.rename(index=str, columns=long_to_short_dict)
        data = data[list(csv_schema.keys())]

        # Clean rows
        if len(data.index) > 0:
            for col in long_to_short_dict.values():
                data[col] = data.apply(lambda x: clean_col(x, col, file_type_id, csv_schema), axis=1)

        # Populate columns that aren't in the file
        if len(data.index) > 0:
            data['tas'] = data.apply(lambda x: format_internal_tas(x), axis=1)
        now = datetime.datetime.now()
        data['created_at'] = now
        data['updated_at'] = now
        data['submission_id'] = submission_id
        job = sess.query(Job).filter_by(submission_id=submission_id, file_type_id=file_type_id,
                                        job_type_id=JOB_TYPE_DICT['csv_record_validation']).one()
        data['job_id'] = job.job_id
        data = data.reset_index()
        data['row_number'] = data.index + 2
        data = data.drop(['index'], axis=1)

        # Load dataframe into the DB table
        count = insert_dataframe(data, FTI_TABLE_DICT[file_type_id].__table__.name, sess.connection())
        sess.commit()
        logger.info('Loaded {} records into the {} table'.format(count, FTI_TABLENAME_DICT[file_type_id]))
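
# An illustrative stand-in for the `clean_col` helper referenced above, assuming it
# strips whitespace and quotes and nulls out empty values; any schema-driven padding
# the real helper performs is omitted here. Illustration only.
def clean_col_sketch(row, col, file_type_id, csv_schema):
    value = row[col]
    if value is None or not str(value).strip():
        return None
    return str(value).strip().strip('"')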
def load_object_class(base_path):
    """ This function loads Object classes into the database

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_object_class.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }

    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                   'Key': "object_class.csv"}, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "object_class.csv")

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        metrics_json['records_deleted'] = sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code",
             "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        metrics_json['records_received'] = len(data.index)
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_received'] - len(data.index)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['records_inserted'] = num

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_object_class_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def load_sf133(filename, fiscal_year, fiscal_period, force_sf133_load=False):
    """ Load SF 133 (budget execution report) lookup table. """
    with create_app().app_context():
        sess = GlobalDB.db().session

        existing_records = sess.query(SF133).filter(SF133.fiscal_year == fiscal_year,
                                                    SF133.period == fiscal_period)
        if force_sf133_load:
            # force a reload of this period's current data
            logger.info('Force SF 133 load: deleting existing records for %s %s', fiscal_year, fiscal_period)
            delete_count = existing_records.delete()
            logger.info('%s records deleted', delete_count)
        elif existing_records.count():
            # if there's existing data & we're not forcing a load, skip
            logger.info('SF133 %s %s already in database (%s records). Skipping file.', fiscal_year,
                        fiscal_period, existing_records.count())
            return

        data = clean_sf133_data(filename, SF133)

        # Now that we've added zero lines for EVERY tas and SF 133 line number, get rid of the ones we don't
        # actually use in the validations. Arguably, it would be better just to include everything, but that
        # drastically increases the number of records we're inserting to the sf_133 table. If we ever decide
        # that we need *all* SF 133 lines that are zero value, remove the next two lines.
        sf_133_validation_lines = [
            '1000', '1010', '1011', '1012', '1013', '1020', '1021', '1022', '1023', '1024', '1025', '1026',
            '1029', '1030', '1031', '1032', '1033', '1040', '1041', '1042', '1160', '1180', '1260', '1280',
            '1340', '1440', '1540', '1640', '1750', '1850', '1910', '2190', '2490', '2500', '3020', '4801',
            '4802', '4881', '4882', '4901', '4902', '4908', '4981', '4982'
        ]
        data = data[(data.line.isin(sf_133_validation_lines)) | (data.amount != 0)]

        # we didn't use the 'keep_null' option when padding allocation transfer agency, because nulls in that
        # column break the pivot (see above comments). so, replace the ata '000' with an empty value before
        # inserting to db
        data['allocation_transfer_agency'] = data['allocation_transfer_agency'].str.replace('000', '')
        # make a pass through the dataframe, changing any empty values to None, to ensure that those are
        # represented as NULL in the db.
        data = data.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

        # insert to db
        table_name = SF133.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())

        update_tas_id(int(fiscal_year), int(fiscal_period))
        sess.commit()

        logger.info('%s records inserted to %s', num, table_name)
def load_sql(cls, filename):
    """ Load SQL-based validation rules to db. """
    with create_app().app_context():
        sess = GlobalDB.db().session
        filename = os.path.join(cls.sql_rules_path, filename)

        # Initial load
        sql_data = pd.read_csv(filename, dtype=str, usecols=cls.headers)
        sql_data = clean_data(
            sql_data,
            RuleSql,
            {'rule_label': 'rule_label', 'rule_error_message': 'rule_error_message', 'query_name': 'query_name',
             'expected_value': 'expected_value', 'category': 'category', 'file_type': 'file_type',
             'target_file': 'target_file', 'rule_cross_file_flag': 'rule_cross_file_flag',
             'severity_name': 'severity_name'},
            {}
        )

        # Processing certain values
        sql_data['rule_sql'] = sql_data['query_name'].apply(lambda name: cls.read_sql_str(name))
        sql_data['file_id'] = sql_data['file_type'].apply(lambda type: FILE_TYPE_DICT.get(type, None))
        if sql_data['file_id'].isnull().values.any():
            raise Exception('Invalid file_type value found in sqlLoader. Must be one of the following: {}'
                            .format(', '.join(list(FILE_TYPE_DICT.keys()))))
        sql_data['target_file_id'] = sql_data['target_file'].apply(lambda type: FILE_TYPE_DICT.get(type, None))
        sql_data['rule_cross_file_flag'] = sql_data['rule_cross_file_flag'].apply(
            lambda flag: flag in ('true', 't', 'y', 'yes'))
        sql_data['rule_severity_id'] = sql_data['severity_name'].apply(
            lambda severity_name: RULE_SEVERITY_DICT.get(severity_name, None))
        if sql_data['rule_severity_id'].isnull().values.any():
            raise Exception('Invalid severity_name value found in sqlLoader. Must be one of the following: {}'
                            .format(', '.join(list(RULE_SEVERITY_DICT.keys()))))
        sql_data.drop(['file_type', 'severity_name', 'target_file'], axis=1, inplace=True)

        # Final check if we need to actually reload
        if check_dataframe_diff(sql_data, RuleSql, del_cols=['rule_sql_id', 'created_at', 'updated_at'],
                                sort_cols=['rule_label', 'file_id', 'target_file_id']):
            # Delete and reload all records currently in table
            logger.info('Detected changes in {}, deleting RuleSQL and reloading'.format(cls.sql_rules_path))
            sess.query(RuleSql).delete()
            insert_dataframe(sql_data, RuleSql.__table__.name, sess.connection())
            sess.commit()
        else:
            logger.info('No changes detected since last load. Skipping.')
def load_object_class(base_path):
    """ This function loads Object classes into the database

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_object_class.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }

    filename = os.path.join(base_path, 'object_class.csv')
    try:
        # Update file from public S3 bucket
        object_class_url = '{}/object_class.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(object_class_url, allow_redirects=True)
        open(filename, 'wb').write(r.content)
    except Exception:
        pass

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        metrics_json['records_deleted'] = sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code",
             "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        metrics_json['records_received'] = len(data.index)
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_received'] - len(data.index)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['records_inserted'] = num

    update_external_data_load_date(now, datetime.datetime.now(), 'object_class')

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_object_class_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def update_state_congr_table_census(census_file, sess):
    logger.info("Adding congressional districts from census to the state_congressional table")

    data = pd.read_csv(census_file, dtype=str)
    model = StateCongressional

    data = clean_data(
        data,
        model,
        {"state_code": "state_code",
         "congressional_district_no": "congressional_district_no",
         "census_year": "census_year"},
        {'congressional_district_no': {"pad_to_length": 2}}
    )

    table_name = model.__table__.name
    insert_dataframe(data, table_name, sess.connection())
    sess.commit()
def create_temp_duns_table(sess, table_name, data):
    """ Creates a temporary duns table with the given name and data.

        Args:
            sess: database connection
            table_name: what to name the table being created
            data: pandas dataframe representing duns data
    """
    logger.info('Making {} table'.format(table_name))
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS {} (
            created_at TIMESTAMP WITHOUT TIME ZONE,
            updated_at TIMESTAMP WITHOUT TIME ZONE,
            awardee_or_recipient_uniqu TEXT,
            activation_date DATE,
            expiration_date DATE,
            deactivation_date DATE,
            registration_date DATE,
            last_sam_mod_date DATE,
            legal_business_name TEXT,
            dba_name TEXT,
            ultimate_parent_unique_ide TEXT,
            ultimate_parent_legal_enti TEXT,
            address_line_1 TEXT,
            address_line_2 TEXT,
            city TEXT,
            state TEXT,
            zip TEXT,
            zip4 TEXT,
            country_code TEXT,
            congressional_district TEXT,
            business_types_codes TEXT[],
            business_types TEXT[],
            entity_structure TEXT
        );
    """.format(table_name)
    sess.execute(create_table_sql)
    # Truncating in case we didn't clear out this table after a failure in the script
    sess.execute('TRUNCATE TABLE {};'.format(table_name))
    insert_dataframe(data, table_name, sess.connection())
def parse_city_file(city_file, sess):
    """ Parse the City file and insert all relevant rows into the database.

        Args:
            city_file: path/url to file to gather City data from
            sess: database session
    """
    # read the data and clean up the column names
    data = pd.read_csv(city_file, dtype=str, sep="|")
    data = clean_data(
        data,
        {"FEATURE_NAME": "feature_name",
         "FEATURE_CLASS": "feature_class",
         "CENSUS_CODE": "city_code",
         "STATE_ALPHA": "state_code",
         "COUNTY_NUMERIC": "county_number",
         "COUNTY_NAME": "county_name",
         "PRIMARY_LATITUDE": "latitude",
         "PRIMARY_LONGITUDE": "longitude"})

    # add a sort column based on feature_class and remove anything with a different feature class or empty city_code
    feature_class_ranking = {"Populated Place": 1, "Locale": 2, "Civil": 3, "Census": 4}
    data = data[pd.notnull(data['city_code'])]
    data['sorting_col'] = data['feature_class'].map(feature_class_ranking)
    data = data[pd.notnull(data['sorting_col'])]

    # sort by feature_class then remove any duplicates within state/city code combo (we keep the first occurrence
    # because we've sorted by priority so the one that would overwrite the others is on top already)
    data = data.sort_values(by=['sorting_col'])
    data = data[~data.duplicated(subset=['state_code', 'city_code'], keep='first')]
    data = data.drop('sorting_col', axis=1)

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # just sorting it how it started out
    data = data.sort_values(by=['feature_name'])

    # insert data into table
    num = insert_dataframe(data, CityCode.__table__.name, sess.connection())
    logger.info('{} records inserted to city_code'.format(num))
    sess.commit()
def parse_fabs_file(f, sess, fips_state_list, state_code_list, sub_tier_list, county_code_list):
    """ Parse a historical FABS export and load the formatted rows into the
        published award financial assistance table.
    """
    logger.info("starting file " + str(f.name))

    csv_file = 'datafeeds\\' + os.path.splitext(os.path.basename(f.name))[0]
    zfile = zipfile.ZipFile(f.name)
    data = pd.read_csv(zfile.open(csv_file), dtype=str, usecols=[
        'cfda_program_num', 'sai_number', 'recipient_name', 'recipient_city_code', 'recipient_city_name',
        'recipient_county_code', 'recipient_county_name', 'recipient_zip', 'recipient_type', 'action_type',
        'agency_code', 'federal_award_id', 'federal_award_mod', 'fed_funding_amount', 'non_fed_funding_amount',
        'total_funding_amount', 'obligation_action_date', 'starting_date', 'ending_date', 'assistance_type',
        'record_type', 'correction_late_ind', 'fyq_correction', 'principal_place_code', 'principal_place_state',
        'principal_place_cc', 'principal_place_country_code', 'principal_place_zip', 'principal_place_cd',
        'cfda_program_title', 'project_description', 'duns_no', 'receip_addr1', 'receip_addr2', 'receip_addr3',
        'face_loan_guran', 'orig_sub_guran', 'recipient_cd', 'rec_flag', 'recipient_country_code', 'uri',
        'recipient_state_code', 'last_modified_date'
    ])

    clean_data = format_fabs_data(data, sess, fips_state_list, state_code_list, sub_tier_list, county_code_list)

    if clean_data is not None:
        logger.info("loading {} rows".format(len(clean_data.index)))
        insert_dataframe(clean_data, PublishedAwardFinancialAssistance.__table__.name, sess.connection())
        sess.commit()
def load_defc(force_reload=False):
    """ Loads the DEFC data.

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    start_time = datetime.now()
    defc_file = os.path.join(CONFIG_BROKER['path'], 'dataactvalidator', 'config', 'def_codes.csv')

    try:
        # Update file from public S3 bucket
        def_codes_url = '{}/def_codes.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(def_codes_url, allow_redirects=True)
        open(defc_file, 'wb').write(r.content)
    except Exception:
        pass

    logger.info('Loading defc data')
    with create_app().app_context():
        data = pd.read_csv(defc_file, dtype=str)

        # Remove all invalid DEFCs that have been left in the file so USAS can continue to display them correctly
        data = data[data['Is Valid'] == 'true']

        data = clean_data(
            data,
            DEFC,
            {'defc': 'code',
             'group_name': 'group'},
            {}
        )

        diff_found = check_dataframe_diff(data, DEFC, ['defc_id'], ['code'])

        if force_reload or diff_found:
            sess = GlobalDB.db().session
            # delete any data in the DEFC table
            sess.query(DEFC).delete()

            # insert data into table
            num = insert_dataframe(data, DEFC.__table__.name, sess.connection())
            logger.info('{} records inserted to defc'.format(num))
            sess.commit()
            update_external_data_load_date(start_time, datetime.now(), 'defc')
        else:
            logger.info('No differences found, skipping defc table reload.')
def load_submission_window_schedule():
    """ Loads the submission window schedule data. """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        sub_schedule_file = s3_client.generate_presigned_url('get_object',
                                                             {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                              'Key': "submission_window_schedule.csv"},
                                                             ExpiresIn=600)
    else:
        sub_schedule_file = os.path.join(CONFIG_BROKER['path'], 'dataactvalidator', 'config',
                                         'submission_window_schedule.csv')

    logger.info('Loading submission window schedule data')
    with create_app().app_context():
        data = pd.read_csv(sub_schedule_file, dtype=str)

        data = clean_data(
            data,
            SubmissionWindowSchedule,
            {'year': 'year',
             'period': 'period',
             'period_start': 'period_start',
             'publish_deadline': 'publish_deadline',
             'certification_deadline': 'certification_deadline'},
            {}
        )

        # Add a day to the deadlines because the dates in the file are supposed to be inclusive
        data['publish_deadline'] = data.apply(lambda x: add_day(x, 'publish_deadline'), axis=1)
        data['certification_deadline'] = data.apply(lambda x: add_day(x, 'certification_deadline'), axis=1)

        sess = GlobalDB.db().session
        # delete any data in the SubmissionWindowSchedule table
        sess.query(SubmissionWindowSchedule).delete()

        # insert data into table
        num = insert_dataframe(data, SubmissionWindowSchedule.__table__.name, sess.connection())
        logger.info('{} records inserted to submission_window_schedule'.format(num))
        sess.commit()
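
# A minimal sketch of the `add_day` helper used above, assuming the deadlines are
# date strings and the goal is to push each inclusive deadline one day forward.
# The '%m/%d/%Y' format is an assumption for illustration.
def add_day_sketch(row, col):
    # assumes `from datetime import datetime, timedelta`
    if not row[col]:
        return None
    return datetime.strptime(row[col], '%m/%d/%Y') + timedelta(days=1)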
def parse_state_file(state_file, sess):
    """ Parse the State file and insert all relevant rows into the database.

        Args:
            state_file: path/url to file to gather State data from
            sess: database session
    """
    # read the data. Cleaning is in there in case something changes, doesn't really do anything now
    data = pd.read_csv(state_file, dtype=str)
    data = clean_data(
        data,
        {"state_name": "state_name",
         "state_code": "state_code",
         "fips_code": "fips_code"})

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # insert data into table
    num = insert_dataframe(data, States.__table__.name, sess.connection())
    logger.info('{} records inserted to states'.format(num))
    sess.commit()
def load_country_codes(base_path):
    """ Load Country Codes into the database.

        Args:
            base_path: directory that contains the domain values files.
    """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                   'Key': "country_codes.csv"}, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "country_codes.csv")

    logger.info('Loading country codes file: country_codes.csv')

    with create_app().app_context():
        sess = GlobalDB.db().session
        # for country codes, delete and replace values
        sess.query(CountryCode).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            CountryCode,
            {"country_code": "country_code",
             "country_name": "country_name"},
            {}
        )
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        # insert to db
        table_name = CountryCode.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
def process_file_chunk(sess, data, certified_table, job, submission_id, file_type_id, rename_cols, col_mapping,
                       all_cols, row_offset, float_cols):
    """ Load in a chunk of award data from updated submissions

        Args:
            sess: the database connection
            data: the chunked dataframe
            certified_table: the certified table to copy to
            job: the certified validation job associated with the file type
            submission_id: the submission associated with the file
            file_type_id: the file type id associated with the file
            rename_cols: mapping of columns that have been renamed over time
            col_mapping: mapping of either daims name or long name to the short names
            all_cols: all the schema columns and deleted columns over time
            row_offset: with the chunking, indicates the row starting point in the file
            float_cols: columns that are floats (to remove the commas)

        Returns:
            updated row_offset to be reused
    """
    # Only use the columns needed for the DB table
    if data.empty:
        logger.info('Empty file for submission {}, {} file. Skipping'.format(
            submission_id, FILE_TYPE_DICT_ID[file_type_id]))
        return

    # Renaming columns to short db names regardless of how old the files are
    data = data.rename(columns=lambda x: x.lower().strip())
    data = data.rename(index=str, columns=rename_cols)
    data = data.rename(index=str, columns=col_mapping)

    # If the file is missing new columns added over time, just set them to None
    blank_cols = list(set(all_cols) - set(list(data.columns)))
    logger.info('The following fields were not found in this chunk: {}'.format(blank_cols))
    data = data.reindex(columns=list(data.columns) + blank_cols)

    # Keep only what we need from the schema + any deleted columns
    data = data[[col for col in all_cols if col in data.columns]]

    # Clean rows
    if len(data.index) > 0:
        data = data.applymap(clean_col)
        for field in [col for col in list(data.columns) if col in float_cols]:
            data[field] = data[field].apply(lambda x: x.replace(',', '') if x else None)

    # Populate columns that aren't in the file
    now = datetime.datetime.now()
    data['created_at'] = now
    data['updated_at'] = now
    data['submission_id'] = submission_id
    data['job_id'] = job.job_id
    data = data.reset_index()
    original_row_offset = row_offset
    data['row_number'] = row_offset + data.index + 2
    row_offset += CHUNK_SIZE
    data = data.drop(['index'], axis=1)

    logger.info('Moving chunk data for submission {}, {} file, starting from row {}'.format(
        submission_id, FILE_TYPE_DICT_ID[file_type_id], original_row_offset + 2))

    # Process and insert the data
    insert_dataframe(data, certified_table.__table__.name, sess.connection())
    sess.commit()

    return row_offset
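
# Illustrative driver for `process_file_chunk`: read the certified file in CHUNK_SIZE-row
# chunks and thread `row_offset` through each call so row numbers stay continuous across
# chunks. The function and parameter names here are hypothetical; note the `or row_offset`
# guard, since `process_file_chunk` returns None for an empty chunk.
def load_file_in_chunks(sess, filename, certified_table, job, submission_id, file_type_id,
                        rename_cols, col_mapping, all_cols, float_cols):
    row_offset = 0
    for chunk in pd.read_csv(filename, dtype=str, chunksize=CHUNK_SIZE):
        row_offset = process_file_chunk(sess, chunk, certified_table, job, submission_id,
                                        file_type_id, rename_cols, col_mapping, all_cols,
                                        row_offset, float_cols) or row_offset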
def parse_sam_file(file_path, sess, monthly=False, benchmarks=False, table=DUNS, year=None):
    """ Takes in a SAM file and adds the DUNS data to the database

        Args:
            file_path: the path to the SAM file
            sess: the database connection
            monthly: whether it's a monthly file
            benchmarks: whether to log times
            table: the table to work from (could be DUNS/HistoricParentDuns)
            year: the year associated with the data (primarily for HistoricParentDUNS loads)
    """
    parse_start_time = time.time()
    logger.info("Starting file " + str(file_path))

    dat_file_name = os.path.splitext(os.path.basename(file_path))[0] + '.dat'
    sam_file_type = "MONTHLY" if monthly else "DAILY"
    dat_file_date = re.findall(".*{}_(.*).dat".format(sam_file_type), dat_file_name)[0]

    with create_app().app_context():

        column_header_mapping = {
            "awardee_or_recipient_uniqu": 0,
            "sam_extract_code": 4,
            "registration_date": 6,
            "expiration_date": 7,
            "last_sam_mod_date": 8,
            "activation_date": 9,
            "legal_business_name": 10,
            "dba_name": 11,
            "address_line_1": 14,
            "address_line_2": 15,
            "city": 16,
            "state": 17,
            "zip": 18,
            "zip4": 19,
            "country_code": 20,
            "congressional_district": 21,
            "entity_structure": 27,
            "business_types_raw": 31,
            "ultimate_parent_legal_enti": 186,
            "ultimate_parent_unique_ide": 187
        }
        column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))

        # Initial sweep of the file to see rows and possibly what DUNS we're updating
        if benchmarks:
            initial_sweep = time.time()
        nrows = 0
        with zipfile.ZipFile(file_path) as zip_file:
            with zip_file.open(dat_file_name) as dat_file:
                nrows = len(dat_file.readlines())
        if benchmarks:
            logger.info("Initial sweep took {} seconds".format(time.time() - initial_sweep))

        block_size = 10000
        batches = (nrows - 1) // block_size
        # skip the first line again if the last batch is also the first batch
        skiplastrows = 2 if batches == 0 else 1
        last_block_size = ((nrows % block_size) or block_size) - skiplastrows
        batch = 0
        added_rows = 0
        while batch <= batches:
            skiprows = 1 if batch == 0 else (batch * block_size)
            nrows = (((batch + 1) * block_size) - skiprows) if (batch < batches) else last_block_size
            logger.info('Loading rows %s to %s', skiprows + 1, nrows + skiprows)

            with zipfile.ZipFile(file_path) as zip_file:
                with zip_file.open(dat_file_name) as dat_file:
                    csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=skiprows, nrows=nrows,
                                           sep='|', usecols=column_header_mapping_ordered.values(),
                                           names=column_header_mapping_ordered.keys(), quoting=3)

                    # add deactivation_date column for delete records
                    lambda_func = (lambda sam_extract: pd.Series([dat_file_date if sam_extract == "1"
                                                                  else np.nan]))
                    csv_data = csv_data.assign(deactivation_date=pd.Series([np.nan], name='deactivation_date')
                                               if monthly else csv_data["sam_extract_code"].apply(lambda_func))
                    # convert business types string to array
                    bt_func = (lambda bt_raw: pd.Series([[str(code) for code in str(bt_raw).split('~')
                                                          if isinstance(bt_raw, str)]]))
                    csv_data = csv_data.assign(business_types_codes=csv_data["business_types_raw"].apply(bt_func))
                    del csv_data["business_types_raw"]
                    # removing rows where DUNS number isn't even provided
                    csv_data = csv_data.where(csv_data["awardee_or_recipient_uniqu"].notnull())
                    # cleaning and replacing NaN/NaT with None's
                    csv_data = clean_sam_data(csv_data.where(pd.notnull(csv_data), None), table=table)

                    if monthly:
                        logger.info("Adding all monthly data with bulk load")
                        if benchmarks:
                            bulk_month_load = time.time()
                        del csv_data["sam_extract_code"]
                        if year:
                            csv_data['year'] = year
                        insert_dataframe(csv_data, table.__table__.name, sess.connection())
                        if benchmarks:
                            logger.info("Bulk month load took {} seconds".format(time.time() - bulk_month_load))
                    else:
                        add_data = csv_data[csv_data.sam_extract_code == '2']
                        update_delete_data = csv_data[(csv_data.sam_extract_code == '3') |
                                                      (csv_data.sam_extract_code == '1')]
                        for dataframe in [add_data, update_delete_data]:
                            del dataframe["sam_extract_code"]

                        if not add_data.empty:
                            try:
                                logger.info("Attempting to bulk load add data")
                                insert_dataframe(add_data, table.__table__.name, sess.connection())
                            except IntegrityError:
                                logger.info("Bulk loading add data failed, loading add data by row")
                                sess.rollback()
                                models, activated_models = get_relevant_models(add_data, sess,
                                                                               benchmarks=benchmarks)
                                logger.info("Loading add data ({} rows)".format(len(add_data.index)))
                                load_duns_by_row(add_data, sess, models, activated_models,
                                                 benchmarks=benchmarks, table=table)
                        if not update_delete_data.empty:
                            models, activated_models = get_relevant_models(update_delete_data, sess,
                                                                           benchmarks=benchmarks)
                            logger.info("Loading update_delete data ({} rows)".format(
                                len(update_delete_data.index)))
                            load_duns_by_row(update_delete_data, sess, models, activated_models,
                                             benchmarks=benchmarks, table=table)
                    sess.commit()

            added_rows += nrows
            batch += 1

        logger.info('%s DUNS records inserted', added_rows)

    if benchmarks:
        logger.info("Parsing {} took {} seconds with {} rows".format(dat_file_name,
                                                                     time.time() - parse_start_time, added_rows))
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        open(filename, 'wb').write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    model = CFDAProgram

    def fix_program_number(n, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(n * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        now = datetime.utcnow()
        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        import_dataframe = import_data.copy(deep=True)
        # To do the comparison, first we need to mock the pk column that postgres creates. We'll set it
        # universally to 1
        import_dataframe = import_dataframe.assign(cfda_program_id=1, created_at=now, updated_at=now)

        table_name = model.__table__.name
        current_data = pd.read_sql_table(table_name, sess.connection(), coerce_float=False)
        # Now we need to overwrite the db's audit dates in the created dataframe, and also set all the pks to 1,
        # so they match
        current_data = current_data.assign(cfda_program_id=1, created_at=now, updated_at=now)
        # pandas comparison requires everything to be in the same order
        current_data.sort_values('program_number', inplace=True)
        import_dataframe.sort_values('program_number', inplace=True)

        # columns too
        cols = import_dataframe.columns.tolist()
        cols.sort()
        import_dataframe = import_dataframe[cols]

        cols = current_data.columns.tolist()
        cols.sort()
        current_data = current_data[cols]

        # need to reset the indexes now that we've done all this sorting, so that they match
        import_dataframe.reset_index(drop=True, inplace=True)
        current_data.reset_index(drop=True, inplace=True)

        # My favorite part: When pandas pulls the data out of postgres, the program_number column is a Decimal.
        # However, in adding it to the dataframe, this column loses precision. So for example, a program number
        # of 10.001 imports into the dataframe as 10.000999999999999. It also needs to be cast to a string, and
        # padded with the right number of zeroes, as needed.
        current_data['program_number'] = current_data['program_number'].apply(lambda x: fix_program_number(x))

        # Finally, the comparison: equals() returns True if the data truly has not changed from the last time
        # the CSV was loaded, so new_data flips that.
        new_data = not import_dataframe.equals(current_data)

        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)
def load_program_activity_data(base_path, force_reload=False, export=False): """ Load program activity lookup table. Args: base_path: directory of domain config files force_reload: whether or not to force a reload export: whether or not to export a public copy of the file """ now = datetime.datetime.now() metrics_json = { 'script_name': 'load_program_activity.py', 'start_time': str(now), 'records_received': 0, 'duplicates_dropped': 0, 'invalid_records_dropped': 0, 'records_deleted': 0, 'records_inserted': 0 } dropped_count = 0 logger.info('Checking PA upload dates to see if we can skip.') last_upload = get_date_of_current_pa_upload(base_path) if not (last_upload > get_stored_pa_last_upload()) and not force_reload: logger.info('Skipping load as it\'s already been done') else: logger.info('Getting the progrma activity file') program_activity_file = get_program_activity_file(base_path) logger.info('Loading program activity: {}'.format(PA_FILE_NAME)) with create_app().app_context(): sess = GlobalDB.db().session try: raw_data = pd.read_csv(program_activity_file, dtype=str) except pd.io.common.EmptyDataError: log_blank_file() exit_if_nonlocal(4) # exit code chosen arbitrarily, to indicate distinct failure states return headers = set([header.upper() for header in list(raw_data)]) if not VALID_HEADERS.issubset(headers): logger.error('Missing required headers. Required headers include: %s' % str(VALID_HEADERS)) exit_if_nonlocal(4) return try: dropped_count, data = clean_data( raw_data, ProgramActivity, {'fyq': 'fiscal_year_period', 'agency_code': 'agency_id', 'allocation_id': 'allocation_transfer_id', 'account_code': 'account_number', 'pa_code': 'program_activity_code', 'pa_title': 'program_activity_name'}, {'program_activity_code': {'pad_to_length': 4}, 'agency_id': {'pad_to_length': 3}, 'allocation_transfer_id': {'pad_to_length': 3, 'keep_null': True}, 'account_number': {'pad_to_length': 4}}, ['agency_id', 'program_activity_code', 'account_number', 'program_activity_name'], True ) except FailureThresholdExceededException as e: if e.count == 0: log_blank_file() exit_if_nonlocal(4) return else: logger.error('Loading of program activity file failed due to exceeded failure threshold. ' 'Application tried to drop {} rows'.format(e.count)) exit_if_nonlocal(5) return metrics_json['records_deleted'] = sess.query(ProgramActivity).delete() metrics_json['invalid_records_dropped'] = dropped_count # Lowercase Program Activity Name data['program_activity_name'] = data['program_activity_name'].apply(lambda x: lowercase_or_notify(x)) # Convert FYQ to FYP data['fiscal_year_period'] = data['fiscal_year_period'].apply(lambda x: convert_fyq_to_fyp(x)) # because we're only loading a subset of program activity info, there will be duplicate records in the # dataframe. this is ok, but need to de-duped before the db load. We also need to log them. 
base_count = len(data.index) metrics_json['records_received'] = base_count data.drop_duplicates(inplace=True) dupe_count = base_count - len(data.index) logger.info('Dropped {} duplicate rows.'.format(dupe_count)) metrics_json['duplicates_dropped'] = dupe_count # insert to db table_name = ProgramActivity.__table__.name num = insert_dataframe(data, table_name, sess.connection()) sess.commit() if export: export_public_pa(raw_data) end_time = datetime.datetime.now() update_external_data_load_date(now, end_time, 'program_activity') update_external_data_load_date(last_upload, end_time, 'program_activity_upload') logger.info('{} records inserted to {}'.format(num, table_name)) metrics_json['records_inserted'] = num metrics_json['duration'] = str(end_time - now) with open('load_program_activity_metrics.json', 'w+') as metrics_file: json.dump(metrics_json, metrics_file) if dropped_count > 0: exit_if_nonlocal(3) return
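# A pared-down, standalone sketch of the duplicate accounting done above: the metric is just the
# difference in frame length before and after drop_duplicates(). The sample rows are hypothetical.
def _demo_duplicate_metrics():
    import pandas as pd

    data = pd.DataFrame({'agency_id': ['001', '001', '002'],
                         'program_activity_code': ['0001', '0001', '0002']})
    base_count = len(data.index)                 # 3 records received
    data.drop_duplicates(inplace=True)
    dupe_count = base_count - len(data.index)    # 1 duplicate dropped
    assert (base_count, dupe_count) == (3, 1)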
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"): """ Load cfda program. Args: base_path: directory that contains the cfda values files. load_local: boolean indicating whether to load from a local file or not local_file_name: the name of the file if loading locally """ local_now = datetime.now() if not load_local: logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE)) tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv" filename = os.path.join(base_path, tmp_name) r = requests.get(S3_CFDA_FILE, allow_redirects=True) open(filename, 'wb').write(r.content) else: filename = os.path.join(base_path, local_file_name) logger.info('Loading CFDA program file: ' + filename) model = CFDAProgram metrics_json = { 'script_name': 'load_cfda_data.py', 'start_time': str(local_now), 'new_records': 0 } def fix_program_number(row, decimals=3): multiplier = 10 ** decimals value = math.floor(row['program_number'] * multiplier + 0.5) / multiplier return str(value).ljust(6, '0') with create_app().app_context(): configure_logging() sess = GlobalDB.db().session import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False) import_data = clean_data( import_data, model, DATA_CLEANING_MAP, {} ) import_data["published_date"] = format_date(import_data["published_date"]) import_data["archived_date"] = format_date(import_data["archived_date"]) table_name = model.__table__.name # Check if there is new data to load new_data = check_dataframe_diff(import_data, model, ['cfda_program_id'], ['program_number'], lambda_funcs=[('program_number', fix_program_number)]) if new_data: # insert to db sess.query(model).delete() num = insert_dataframe(import_data, table_name, sess.connection()) sess.commit() # If we've updated the data at all, update the external data load date update_external_data_load_date(local_now, datetime.now(), 'cfda') if not load_local: os.remove(filename) if new_data: logger.info('{} records inserted to {}'.format(num, table_name)) metrics_json['new_records'] = num else: logger.info("Skipped cfda load, no new data.") sys.exit(3) metrics_json['duration'] = str(datetime.now() - local_now) with open('load_cfda_data_metrics.json', 'w+') as metrics_file: json.dump(metrics_json, metrics_file)
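# A conceptual sketch (not the real check_dataframe_diff) of the diff technique the refactored
# loader above relies on: neutralize the pk/audit columns on both frames, put rows and columns in
# the same order, then let DataFrame.equals() decide. The column names here are placeholders.
def _demo_dataframe_diff(new_df, db_df, ignored_cols, sort_cols):
    for col in ignored_cols:
        # mock out pk/audit columns so they can't cause a spurious mismatch
        new_df = new_df.assign(**{col: 1})
        db_df = db_df.assign(**{col: 1})
    # pandas comparison requires identical row order, column order, and indexes
    new_df = new_df.sort_values(sort_cols).sort_index(axis=1).reset_index(drop=True)
    db_df = db_df.sort_values(sort_cols).sort_index(axis=1).reset_index(drop=True)
    return not new_df.equals(db_df)  # True when there is new data to load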
def parse_sam_file(file, sess):
    """ Parse a SAM extract file and add, update, or delete the executive compensation records it contains.

        Args:
            file: the zipped SAM extract file to parse
            sess: the database connection
    """
    logger.info("starting file " + str(file.name))

    csv_file = os.path.splitext(os.path.basename(file.name))[0] + '.dat'
    zfile = zipfile.ZipFile(file.name)

    # can't use skipfooter; pandas' c engine doesn't work with skipfooter and the python engine doesn't work
    # with dtype
    nrows = 0
    with zfile.open(csv_file) as f:
        nrows = len(f.readlines()) - 2  # subtract the header and footer

    column_header_mapping = {
        "awardee_or_recipient_uniqu": 0,
        "sam_extract": 4,
        "expiration_date": 7,
        "activation_date": 9,
        "ultimate_parent_legal_enti": 10,
        "ultimate_parent_unique_ide": 48,
        "exec_comp_str": 89
    }
    column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))
    csv_data = pd.read_csv(zfile.open(csv_file), dtype=str, header=None, skiprows=1, nrows=nrows, sep='|',
                           usecols=column_header_mapping_ordered.values(),
                           names=column_header_mapping_ordered.keys())
    total_data = csv_data.copy()

    # skipping when sam_extract == '4' as it's expired
    total_data = total_data[total_data.sam_extract != '4']

    # parse out executive compensation from column 90 (index 89)
    lambda_func = (lambda ecs: pd.Series(list(parse_exec_comp(ecs).values())))
    parsed_data = total_data["exec_comp_str"].apply(lambda_func)
    parsed_data.columns = list(parse_exec_comp().keys())
    del total_data["exec_comp_str"]
    total_data = total_data.join(parsed_data)

    # split into 3 dataframes based on the sam_extract code: '1' = delete, '2' = add, '3' = update
    delete_data = total_data[total_data.sam_extract == '1'].replace(np.nan, "", regex=True)
    add_data = total_data[total_data.sam_extract == '2'].replace(np.nan, "", regex=True)
    update_data = total_data[total_data.sam_extract == '3'].replace(np.nan, "", regex=True)
    for dataframe in [add_data, update_data, delete_data, total_data]:
        del dataframe["sam_extract"]

    table_name = ExecutiveCompensation.__table__.name
    insert_dataframe(add_data, table_name, sess.connection())
    for _, row in update_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            update(row, synchronize_session=False)
    for _, row in delete_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            delete(synchronize_session=False)
    sess.commit()
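# A standalone sketch of the zip-read pattern used above, with a hypothetical archive path and
# member name: read the member once to count the data rows (the extract carries both a header row
# and a footer row, and skipfooter isn't available with the C engine), then re-open it for the
# positional usecols/names read.
def _demo_read_sam_extract(zip_path, member_name):
    import zipfile

    import pandas as pd

    zfile = zipfile.ZipFile(zip_path)
    with zfile.open(member_name) as f:
        nrows = len(f.readlines()) - 2  # subtract the header and footer

    # names line up positionally with usecols, mirroring the mapping used in parse_sam_file
    return pd.read_csv(zfile.open(member_name), dtype=str, header=None, skiprows=1, nrows=nrows,
                       sep='|', usecols=[0, 4], names=['awardee_or_recipient_uniqu', 'sam_extract'])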