def load_state_data(force_reload):
    """ Load data into the States table

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    start_time = datetime.now()
    state_file_url = '{}/state_list.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
    with RetrieveFileFromUri(state_file_url, 'r').get_file_object() as state_file:
        new_data = parse_state_file(state_file)

    diff_found = check_dataframe_diff(new_data, States, ['states_id'], ['state_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info('Differences found or reload forced, reloading states table.')

        # delete any data in the States table
        sess.query(States).delete()

        # insert data into table
        num = insert_dataframe(new_data, States.__table__.name, sess.connection())
        logger.info('{} records inserted to states'.format(num))
        sess.commit()

        update_external_data_load_date(start_time, datetime.now(), 'state_code')
    else:
        logger.info('No differences found, skipping states table reload.')
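# Usage sketch (an assumption for illustration, not part of the original module): like the
# other loaders in this repo, load_state_data gets its session from GlobalDB, so it is
# expected to run inside a broker application context, e.g.:
#
#     with create_app().app_context():
#         load_state_data(force_reload=False)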
def load_agency_data(base_path, force_reload=False):
    """ Load agency data into the database

        Args:
            base_path: directory that contains the agency files
            force_reload: whether to force a reload regardless of whether differences are found
    """
    start_time = datetime.now()
    agency_codes_url = '{}/agency_codes.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
    logger.info('Loading agency codes file from {}'.format(agency_codes_url))

    # Get data from public S3 bucket
    r = requests.get(agency_codes_url, allow_redirects=True)
    agency_codes_file = os.path.join(base_path, 'agency_codes.csv')
    open(agency_codes_file, 'wb').write(r.content)

    # Parse data
    with create_app().app_context():
        logger.info('Loading CGAC')
        new_cgac = load_cgac(agency_codes_file, force_reload=force_reload)
        logger.info('Loading FREC')
        new_frec = load_frec(agency_codes_file, force_reload=force_reload)
        logger.info('Loading Sub Tier Agencies')
        new_sub_tier = load_sub_tier_agencies(agency_codes_file, force_reload=force_reload)

        # If we've updated the data at all, update the external data load date
        if new_cgac or new_frec or new_sub_tier:
            update_external_data_load_date(start_time, datetime.now(), 'agency')

    # Delete file once we're done
    os.remove(agency_codes_file)
def test_set_get_pa_last_upload_existing(monkeypatch, database):
    """ Test the last upload date/time retrieval """
    monkeypatch.setattr(load_program_activity, 'CONFIG_BROKER', {'use_aws': False})

    add_relevant_data_types(database.session)

    # test epoch timing
    stored_date = load_program_activity.get_stored_pa_last_upload()
    expected_date = datetime.datetime(1970, 1, 1, 0, 0, 0)
    assert stored_date == expected_date

    update_external_data_load_date(datetime.datetime(2017, 12, 31, 0, 0, 0),
                                   datetime.datetime(2017, 12, 31, 0, 0, 0), 'program_activity_upload')
    stored_date = load_program_activity.get_stored_pa_last_upload()
    expected_date = datetime.datetime(2017, 12, 31, 0, 0, 0)
    assert stored_date == expected_date

    # repeat this, because the first time, there is no stored object, but now test with one that already exists.
    update_external_data_load_date(datetime.datetime(2016, 12, 31, 0, 0, 0),
                                   datetime.datetime(2016, 12, 31, 0, 0, 0), 'program_activity_upload')
    stored_date = load_program_activity.get_stored_pa_last_upload()
    expected_date = datetime.datetime(2016, 12, 31, 0, 0, 0)
    assert stored_date == expected_date

    remove_metrics_file()
def load_object_class(base_path):
    """ Load object classes into the database

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_object_class.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }

    filename = os.path.join(base_path, 'object_class.csv')
    try:
        # Update file from public S3 bucket
        object_class_url = '{}/object_class.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(object_class_url, allow_redirects=True)
        open(filename, 'wb').write(r.content)
    except Exception:
        # If the download fails, fall back to whatever local copy of the file already exists
        pass

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        metrics_json['records_deleted'] = sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code",
             "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        metrics_json['records_received'] = len(data.index)

        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_received'] - len(data.index)

        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['records_inserted'] = num

    update_external_data_load_date(now, datetime.datetime.now(), 'object_class')

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_object_class_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def load_all_sf133(sf133_path=None, force_sf133_load=False, aws_prefix='sf_133', fix_links=True,
                   update_tas_fields=True):
    """ Load any SF-133 files that are not yet in the database and fix any broken links

        Args:
            sf133_path: path to the SF133 files
            force_sf133_load: boolean to indicate whether to force a reload of the data
            aws_prefix: prefix to filter which files to pull from AWS
            fix_links: fix any SF133 records not linked to TAS data
            update_tas_fields: rederive SF133 records if the associated TAS record has been updated
    """
    now = datetime.now()
    metrics_json = {
        'script_name': 'load_sf133.py',
        'start_time': str(now),
        'records_deleted': 0,
        'records_inserted': 0
    }

    with create_app().app_context():
        sess = GlobalDB.db().session

        # get a list of SF 133 files to load
        sf133_list = get_sf133_list(sf133_path, aws_prefix=aws_prefix)
        sf_re = re.compile(r'sf_133_(?P<year>\d{4})_(?P<period>\d{2})\.csv')
        for sf133 in sf133_list:
            # for each SF file, parse out fiscal year and period and call the SF 133 loader
            file_match = sf_re.match(sf133.file)
            if not file_match:
                logger.info('Skipping SF 133 file with invalid name: %s', sf133.full_file)
                continue
            logger.info('Starting %s...', sf133.full_file)
            load_sf133(sess, sf133.full_file, file_match.group('year'), file_match.group('period'),
                       force_sf133_load=force_sf133_load, metrics=metrics_json)

        if update_tas_fields:
            rederive_tas_fields(sess, metrics=metrics_json)

        if fix_links:
            fix_broken_links(sess, metrics=metrics_json)

        update_external_data_load_date(now, datetime.now(), 'gtas')

    metrics_json['duration'] = str(datetime.now() - now)

    with open('load_sf133_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
    logger.info("Script complete")
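# Example of the file-name convention load_all_sf133 expects, derived from the sf_re pattern
# above (the concrete file name is illustrative only):
#
#     sf_re = re.compile(r'sf_133_(?P<year>\d{4})_(?P<period>\d{2})\.csv')
#     match = sf_re.match('sf_133_2021_06.csv')
#     match.group('year'), match.group('period')   # -> ('2021', '06')
#
# Files that do not match the pattern (e.g. 'sf133_2021_6.csv') are skipped with a log message.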
def load_defc(force_reload=False):
    """ Loads the DEFC data.

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    start_time = datetime.now()
    defc_file = os.path.join(CONFIG_BROKER['path'], 'dataactvalidator', 'config', 'def_codes.csv')

    try:
        # Update file from public S3 bucket
        def_codes_url = '{}/def_codes.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(def_codes_url, allow_redirects=True)
        open(defc_file, 'wb').write(r.content)
    except Exception:
        pass

    logger.info('Loading defc data')
    with create_app().app_context():
        data = pd.read_csv(defc_file, dtype=str)

        # Remove all invalid DEFCs that have been left in the file so USAS can continue to display them correctly
        data = data[data['Is Valid'] == 'true']

        data = clean_data(
            data,
            DEFC,
            {'defc': 'code',
             'group_name': 'group'},
            {}
        )

        diff_found = check_dataframe_diff(data, DEFC, ['defc_id'], ['code'])

        if force_reload or diff_found:
            sess = GlobalDB.db().session

            # delete any data in the DEFC table
            sess.query(DEFC).delete()

            # insert data into table
            num = insert_dataframe(data, DEFC.__table__.name, sess.connection())
            logger.info('{} records inserted to defc'.format(num))
            sess.commit()

            update_external_data_load_date(start_time, datetime.now(), 'defc')
        else:
            logger.info('No differences found, skipping defc table reload.')
def main():
    """ Pull office data from the Federal Hierarchy API and load it into the broker database. """
    now = datetime.now()
    parser = argparse.ArgumentParser(description='Pull data from the Federal Hierarchy API.')
    parser.add_argument('-a', '--all', help='Clear out the database and get historical data', action='store_true')
    parser.add_argument('-f', '--filename', help='Generate a local CSV file from the data.', nargs=1, type=str)
    parser.add_argument('-o', '--export_office', help='Export the current office table. '
                                                      'Please provide the file name/path.', nargs=1, type=str)
    parser.add_argument('-d', '--pull_date', help='Date from which to start the pull', nargs=1, type=str)
    parser.add_argument('-i', '--ignore_db', help='Do not update the DB tables', action='store_true')
    args = parser.parse_args()

    if args.all and args.pull_date:
        logger.error('The -a and -d flags conflict, cannot use both at once.')
        sys.exit(1)

    metrics_json = {
        'script_name': 'load_federal_hierarchy.py',
        'start_time': str(now),
        'level_1_records': 0,
        'level_2_records': 0,
        'level_3_records': 0,
        'level_4_records': 0,
        'level_5_records': 0,
        'level_6_records': 0,
        'level_7_records': 0,
        'missing_cgacs': [],
        'missing_subtier_codes': []
    }

    # Handle the pull_date parameter
    updated_date_from = None
    if args.pull_date:
        try:
            updated_date_from = args.pull_date[0]
            datetime.strptime(updated_date_from, '%Y-%m-%d')
        except ValueError:
            logger.error('The date given to the -d flag was not parseable.')
            sys.exit(1)

    # Get or create the start date
    sess = GlobalDB.db().session
    if not args.all and not updated_date_from:
        last_pull_date = sess.query(ExternalDataLoadDate.last_load_date_start).\
            filter_by(external_data_type_id=EXTERNAL_DATA_TYPE_DICT['office']).one_or_none()
        if not last_pull_date:
            logger.error('The -a or -d flag must be set when there is no latest run in the database.')
            sys.exit(1)
        # We want to make the date one day earlier to account for any timing weirdness between the two systems
        updated_date_from = last_pull_date[0].date() - timedelta(days=1)

    # Handle the filename parameter
    filename = args.filename[0] if args.filename else None

    # Handle the export office parameter
    export_office = args.export_office[0] if args.export_office else None

    # Handle a complete data reload
    if args.all and not args.ignore_db:
        logger.info('Emptying out the Office table for a complete reload.')
        sess.execute('''TRUNCATE TABLE office RESTART IDENTITY''')

    try:
        pull_offices(sess, filename, not args.ignore_db, args.all, updated_date_from, export_office, metrics_json)
    except Exception as e:
        logger.exception(e)
        sys.exit(1)

    # find if there were any new cgacs/subtiers added
    all_cgacs = [cgac.cgac_code for cgac in sess.query(CGAC.cgac_code)]
    all_frecs = [frec.frec_code for frec in sess.query(FREC.frec_code)]
    all_subtiers = [subtier.sub_tier_agency_code for subtier in sess.query(SubTierAgency.sub_tier_agency_code)]
    metrics_json['missing_cgacs'] = list(set(metrics_json['missing_cgacs']) - set(all_cgacs + all_frecs))
    metrics_json['missing_subtier_codes'] = list(set(metrics_json['missing_subtier_codes']) - set(all_subtiers))

    update_external_data_load_date(now, datetime.now(), 'office')

    metrics_json['duration'] = str(datetime.now() - now)

    with open('load_federal_hierarchy_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
    logger.info('Script complete')
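# Example invocations, derived from the argparse flags above (the script path and CSV file
# name are illustrative assumptions):
#
#     python load_federal_hierarchy.py -a                  # clear the office table and reload everything
#     python load_federal_hierarchy.py -d 2021-01-15       # pull offices updated since the given date
#     python load_federal_hierarchy.py -f offices.csv -i   # write a local CSV without touching the DB
#
# -a and -d are mutually exclusive; with neither flag, the script resumes from the last
# recorded office load date (minus one day) stored in external_data_load_date.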
def load_program_activity_data(base_path, force_reload=False, export=False):
    """ Load program activity lookup table.

        Args:
            base_path: directory of domain config files
            force_reload: whether or not to force a reload
            export: whether or not to export a public copy of the file
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_program_activity.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'invalid_records_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }
    dropped_count = 0

    logger.info('Checking PA upload dates to see if we can skip.')
    last_upload = get_date_of_current_pa_upload(base_path)
    if not (last_upload > get_stored_pa_last_upload()) and not force_reload:
        logger.info('Skipping load as it\'s already been done')
    else:
        logger.info('Getting the program activity file')
        program_activity_file = get_program_activity_file(base_path)

        logger.info('Loading program activity: {}'.format(PA_FILE_NAME))

        with create_app().app_context():
            sess = GlobalDB.db().session
            try:
                raw_data = pd.read_csv(program_activity_file, dtype=str)
            except pd.io.common.EmptyDataError:
                log_blank_file()
                exit_if_nonlocal(4)  # exit code chosen arbitrarily, to indicate distinct failure states
                return
            headers = set([header.upper() for header in list(raw_data)])

            if not VALID_HEADERS.issubset(headers):
                logger.error('Missing required headers. Required headers include: %s' % str(VALID_HEADERS))
                exit_if_nonlocal(4)
                return

            try:
                dropped_count, data = clean_data(
                    raw_data,
                    ProgramActivity,
                    {'fyq': 'fiscal_year_period', 'agency_code': 'agency_id',
                     'allocation_id': 'allocation_transfer_id', 'account_code': 'account_number',
                     'pa_code': 'program_activity_code', 'pa_title': 'program_activity_name'},
                    {'program_activity_code': {'pad_to_length': 4}, 'agency_id': {'pad_to_length': 3},
                     'allocation_transfer_id': {'pad_to_length': 3, 'keep_null': True},
                     'account_number': {'pad_to_length': 4}},
                    ['agency_id', 'program_activity_code', 'account_number', 'program_activity_name'],
                    True
                )
            except FailureThresholdExceededException as e:
                if e.count == 0:
                    log_blank_file()
                    exit_if_nonlocal(4)
                    return
                else:
                    logger.error('Loading of program activity file failed due to exceeded failure threshold. '
                                 'Application tried to drop {} rows'.format(e.count))
                    exit_if_nonlocal(5)
                    return

            metrics_json['records_deleted'] = sess.query(ProgramActivity).delete()
            metrics_json['invalid_records_dropped'] = dropped_count

            # Lowercase Program Activity Name
            data['program_activity_name'] = data['program_activity_name'].apply(lambda x: lowercase_or_notify(x))
            # Convert FYQ to FYP
            data['fiscal_year_period'] = data['fiscal_year_period'].apply(lambda x: convert_fyq_to_fyp(x))

            # because we're only loading a subset of program activity info, there will be duplicate records in the
            # dataframe. this is ok, but they need to be de-duped before the db load. We also need to log them.
            base_count = len(data.index)
            metrics_json['records_received'] = base_count
            data.drop_duplicates(inplace=True)

            dupe_count = base_count - len(data.index)
            logger.info('Dropped {} duplicate rows.'.format(dupe_count))
            metrics_json['duplicates_dropped'] = dupe_count

            # insert to db
            table_name = ProgramActivity.__table__.name
            num = insert_dataframe(data, table_name, sess.connection())
            sess.commit()

            if export:
                export_public_pa(raw_data)

            end_time = datetime.datetime.now()
            update_external_data_load_date(now, end_time, 'program_activity')
            update_external_data_load_date(last_upload, end_time, 'program_activity_upload')
            logger.info('{} records inserted to {}'.format(num, table_name))
            metrics_json['records_inserted'] = num
            metrics_json['duration'] = str(end_time - now)

    with open('load_program_activity_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)

    if dropped_count > 0:
        exit_if_nonlocal(3)
        return
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    local_now = datetime.now()
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        open(filename, 'wb').write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)

    model = CFDAProgram

    metrics_json = {
        'script_name': 'load_cfda_data.py',
        'start_time': str(local_now),
        'new_records': 0
    }

    def fix_program_number(row, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(row['program_number'] * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        table_name = model.__table__.name

        # Check if there is new data to load
        new_data = check_dataframe_diff(import_data, model, ['cfda_program_id'], ['program_number'],
                                        lambda_funcs=[('program_number', fix_program_number)])
        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

            # If we've updated the data at all, update the external data load date
            update_external_data_load_date(local_now, datetime.now(), 'cfda')

    if not load_local:
        os.remove(filename)

    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['new_records'] = num
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)

    metrics_json['duration'] = str(datetime.now() - local_now)

    with open('load_cfda_data_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
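# Worked example of the fix_program_number helper above (illustrative only, using a plain
# dict in place of a dataframe row): the value is rounded to three decimal places and
# right-padded with zeros to six characters so it compares cleanly against the stored text.
#
#     fix_program_number({'program_number': 10.1})     # -> '10.100'
#     fix_program_number({'program_number': 93.048})   # -> '93.048'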
def load_tas(backfill_historic=False):
    """ Load TAS file into broker database.

        Args:
            backfill_historic: if set to true, this will only update certain columns if budget_function_code is null
    """
    # read TAS file to dataframe, to make sure all is well with the file before firing up a db transaction
    tas_files = []
    file_loaded = False

    now = datetime.now()
    metrics_json = {
        'script_name': 'load_tas.py',
        'start_time': str(now),
        'records_updated': 0,
        'records_added': 0,
        'records_provided': 0,
        'duplicates_dropped': 0
    }

    if CONFIG_BROKER["use_aws"]:
        # Storing version dictionaries in the list to prevent getting all the links at once and possibly work with
        # expired AWS links
        s3connection = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        # list_object_versions returns the versions in reverse chronological order
        if not backfill_historic:
            # get the latest tas_file
            tas_files = [s3connection.generate_presigned_url(
                ClientMethod='get_object',
                Params={'Bucket': CONFIG_BROKER['sf_133_bucket'], 'Key': 'cars_tas.csv'},
                ExpiresIn=600)]
        else:
            tas_files = s3connection.list_object_versions(Bucket=CONFIG_BROKER['sf_133_bucket'],
                                                          Prefix='cars_tas.csv')['Versions']
            # getting the latest file (see the reversed) from each day for performance and accuracy
            tas_files_grouped = {tas_file['LastModified'].date(): tas_file for tas_file in reversed(tas_files)}
            # sorting them back to chronological order
            tas_files = sorted([tas_file for date, tas_file in tas_files_grouped.items()],
                               key=lambda k: k['LastModified'])
    elif backfill_historic:
        logger.error('Unable to attain historical versions of cars_tas without aws access.')
        return
    else:
        tas_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "cars_tas.csv")
        tas_files.append(tas_file)

    sess = GlobalDB.db().session
    for tas_file in reversed(tas_files):
        update_missing = missing_records(sess) if backfill_historic else []
        if backfill_historic and not update_missing:
            # no more missing, we're done here
            break
        if CONFIG_BROKER["use_aws"] and backfill_historic:
            # generate url from the version dict
            logger.info('Working with remote cars_tas.csv (from {})'.format(tas_file['LastModified']))
            tas_file = s3connection.generate_presigned_url(
                ClientMethod='get_object',
                Params={'Bucket': CONFIG_BROKER['sf_133_bucket'], 'Key': 'cars_tas.csv',
                        'VersionId': tas_file['VersionId']},
                ExpiresIn=600)
        elif CONFIG_BROKER["use_aws"]:
            logger.info('Working with latest remote cars_tas.csv')
        else:
            logger.info('Working with local cars_tas.csv')
        update_tas_lookups(sess, tas_file, update_missing=update_missing, metrics=metrics_json)
        file_loaded = True

    metrics_json['duration'] = str(datetime.now() - now)

    if file_loaded:
        update_external_data_load_date(now, datetime.now(), 'tas')

    with open('load_tas_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def load_country_codes(base_path, force_reload=False):
    """ Load Country Codes into the database.

        Args:
            base_path: directory that contains the domain values files.
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_country_codes.py',
        'start_time': str(now),
        'records_deleted': 0,
        'records_provided': 0,
        'duplicates_dropped': 0,
        'records_inserted': 0
    }

    with create_app().app_context():
        sess = GlobalDB.db().session

        country_code_file = '{}/country_codes.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
        logger.info('Loading country codes file from {}'.format(country_code_file))

        # Get data from public S3 bucket
        r = requests.get(country_code_file, allow_redirects=True)
        filename = os.path.join(base_path, 'country_codes.csv')
        open(filename, 'wb').write(r.content)

        # Parse data
        data = pd.read_csv(filename, dtype=str)
        metrics_json['records_provided'] = len(data.index)

        data = clean_data(
            data,
            CountryCode,
            {'country_code': 'country_code',
             'country_name': 'country_name',
             '2_char_country_code': 'country_code_2_char',
             'territory_or_freely_associated_state': 'territory_free_state'},
            {}
        )

        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_provided'] - len(data.index)

        # compare to existing content in table
        diff_found = check_dataframe_diff(data, CountryCode, ['country_code_id'], ['country_code'],
                                          lambda_funcs=[('territory_free_state', convert_territory_bool_to_str)])

        # insert to db if reload required
        if force_reload or diff_found:
            logger.info('Differences found or reload forced, reloading country_code table.')

            # if there's a difference, clear out the old data before adding the new stuff
            metrics_json['records_deleted'] = sess.query(CountryCode).delete()

            num = insert_dataframe(data, CountryCode.__table__.name, sess.connection())
            metrics_json['records_inserted'] = num
            sess.commit()

            # Updating data load dates if the load successfully added new country codes
            update_external_data_load_date(now, datetime.datetime.now(), 'country_code')

            logger.info('{} records inserted to country_code table'.format(num))
        else:
            logger.info('No differences found, skipping country_code table reload.')

        # Delete file once we're done
        os.remove(filename)

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_country_codes_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)

    logger.info('Script complete')
logger.info('Populating subaward table based off new data')
new_procurements = (SERVICE_MODEL[PROCUREMENT].next_id(sess) > original_min_procurement_id)
new_grants = (SERVICE_MODEL[GRANT].next_id(sess) > original_min_grant_id)
proc_ids = list(set(updated_proc_internal_ids))
grant_ids = list(set(updated_grant_internal_ids))
if len(sys.argv) <= 1:
    if new_procurements:
        sess.query(Subaward).filter(Subaward.internal_id.in_(proc_ids)).delete(synchronize_session=False)
        populate_subaward_table(sess, PROCUREMENT, min_id=original_min_procurement_id)
    if new_grants:
        sess.query(Subaward).filter(Subaward.internal_id.in_(grant_ids)).delete(synchronize_session=False)
        populate_subaward_table(sess, GRANT, min_id=original_min_grant_id)
elif args.procurement and new_procurements and args.ids:
    sess.query(Subaward).filter(Subaward.internal_id.in_(proc_ids)).delete(synchronize_session=False)
    populate_subaward_table(sess, PROCUREMENT, ids=args.ids)
elif args.grants and new_grants and args.ids:
    sess.query(Subaward).filter(Subaward.internal_id.in_(grant_ids)).delete(synchronize_session=False)
    populate_subaward_table(sess, GRANT, ids=args.ids)

# Deletes state mapping variable
config_state_mappings()

# Only change the last run date if we aren't doing specific ids to backfill
if not args.ids:
    update_external_data_load_date(now, datetime.datetime.now(), 'subaward')

metrics_json['duration'] = str(datetime.datetime.now() - now)

with open('load_fsrs_metrics.json', 'w+') as metrics_file:
    json.dump(metrics_json, metrics_file)
    'updates_received': 0,
    'deletes_received': 0,
    'added_uei': [],
    'updated_uei': [],
    'records_added': 0,
    'records_updated': 0,
    'parent_rows_updated': 0,
    'parent_update_date': None
}

with create_app().app_context():
    sess = GlobalDB.db().session

    if data_type in ('duns', 'both'):
        start_time = datetime.datetime.now()
        load_from_sam('DUNS', sess, historic, local, metrics=metrics, reload_date=reload_date)
        update_external_data_load_date(start_time, datetime.datetime.now(), 'recipient')

    if data_type in ('exec_comp', 'both'):
        start_time = datetime.datetime.now()
        load_from_sam('Executive Compensation', sess, historic, local, metrics=metrics, reload_date=reload_date)
        update_external_data_load_date(start_time, datetime.datetime.now(), 'executive_compensation')

    sess.close()

metrics['records_added'] = len(set(metrics['added_uei']))
metrics['records_updated'] = len(set(metrics['updated_uei']) - set(metrics['added_uei']))
del metrics['added_uei']
del metrics['updated_uei']

logger.info('Added {} records and updated {} records'.format(metrics['records_added'], metrics['records_updated']))

metrics['duration'] = str(datetime.datetime.now() - now)

with open('load_duns_exec_comp_metrics.json', 'w+') as metrics_file:
    json.dump(metrics, metrics_file)
def read_zips():
    """ Update zip codes in the zips table. """
    with create_app().app_context():
        start_time = datetime.now()
        sess = GlobalDB.db().session

        # Create temporary table to do work in so we don't disrupt the site for too long by altering the actual table
        sess.execute('CREATE TABLE IF NOT EXISTS temp_zips (LIKE zips INCLUDING ALL);')
        sess.execute('CREATE TABLE IF NOT EXISTS temp_zips_grouped (LIKE zips_grouped INCLUDING ALL);')
        # Truncating in case we didn't clear out this table after a failure in the script
        sess.execute('TRUNCATE TABLE temp_zips;')
        sess.execute('TRUNCATE TABLE temp_zips_grouped;')
        # Resetting the pk sequence
        sess.execute('SELECT setval(\'zips_zips_id_seq\', 1, false);')
        sess.execute('SELECT setval(\'zips_grouped_zips_grouped_id_seq\', 1, false);')
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
            response = s3_client.list_objects_v2(Bucket=CONFIG_BROKER['sf_133_bucket'], Prefix=zip_folder)
            for obj in response.get('Contents', []):
                if obj['Key'] != zip_folder:
                    zip_4_file_path = s3_client.generate_presigned_url('get_object',
                                                                       {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                        'Key': obj['Key']}, ExpiresIn=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3_client.generate_presigned_url('get_object',
                                                              {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                               'Key': "ctystate.txt"}, ExpiresIn=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config",
                                     CONFIG_BROKER["zip_folder"])
            # creating the list while ignoring hidden files on mac
            file_list = [f for f in os.listdir(base_path) if not re.match(r'^\.', f)]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

        group_zips(sess)
        hot_swap_zip_tables(sess)
        update_external_data_load_date(start_time, datetime.now(), 'zip_code')

        update_state_congr_table_current(sess)
        if CONFIG_BROKER["use_aws"]:
            census_file = s3_client.generate_presigned_url('get_object',
                                                           {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                            'Key': "census_congressional_districts.csv"},
                                                           ExpiresIn=600)
        else:
            census_file = os.path.join(base_path, "census_congressional_districts.csv")
        update_state_congr_table_census(census_file, sess)
        if CONFIG_BROKER['use_aws']:
            export_state_congr_table(sess)
        update_external_data_load_date(start_time, datetime.now(), 'congressional_district')

    logger.info("Zipcode script complete")