def generate_d_file(self):
    """ Write file D1 or D2 to an appropriate CSV. """
    log_data = {'message': 'Starting file {} generation'.format(self.job.file_type.letter_name),
                'message_type': 'ValidatorInfo', 'job_id': self.job.job_id, 'agency_code': self.agency_code,
                'file_type': self.job.file_type.letter_name, 'start_date': self.job.start_date,
                'end_date': self.job.end_date, 'filename': self.job.original_filename}
    if self.job.submission_id:
        log_data['submission_id'] = self.job.submission_id
    logger.info(log_data)

    # Get or create a FileRequest for this generation
    current_date = datetime.now().date()
    file_request_params = {"job_id": self.job.job_id, "is_cached_file": True, "start_date": self.job.start_date,
                           "end_date": self.job.end_date, "agency_code": self.agency_code,
                           "file_type": self.job.file_type.letter_name, "agency_type": self.agency_type}
    file_request = self.sess.query(FileRequest).filter_by(**file_request_params).one_or_none()
    if not file_request:
        file_request_params["request_date"] = current_date
        file_request = FileRequest(**file_request_params)
        self.sess.add(file_request)
        self.sess.commit()

    # Mark this Job as not from-cache, and mark the FileRequest as the cached version (requested today)
    self.job.from_cached = False
    file_request.is_cached_file = True
    file_request.request_date = current_date
    self.sess.commit()

    # Prepare file data
    file_utils = fileD1 if self.job.file_type.letter_name == 'D1' else fileD2
    local_file = "".join([CONFIG_BROKER['d_file_storage_path'], self.job.original_filename])
    headers = [key for key in file_utils.mapping]
    query_utils = {"file_utils": file_utils, "agency_code": self.agency_code, "agency_type": self.agency_type,
                   "start": self.job.start_date, "end": self.job.end_date, "sess": self.sess}

    # Generate the file and put in S3
    write_query_to_file(local_file, self.job.filename, headers, self.job.file_type.letter_name, self.is_local,
                        d_file_query, query_utils)

    log_data['message'] = 'Finished writing to file: {}'.format(self.job.original_filename)
    logger.info(log_data)
def generate_a_file(self):
    """ Write file A to an appropriate CSV. """
    log_data = {'message': 'Starting file A generation', 'message_type': 'ValidatorInfo', 'job_id': self.job.job_id,
                'agency_code': self.agency_code, 'file_type': self.job.file_type.letter_name,
                'start_date': self.job.start_date, 'end_date': self.job.end_date,
                'filename': self.job.original_filename}
    logger.info(log_data)

    local_file = "".join([CONFIG_BROKER['d_file_storage_path'], self.job.original_filename])
    headers = [key for key in fileA.mapping]
    # add 3 months to account for fiscal year
    period_date = self.job.end_date + relativedelta(months=3)

    query_utils = {"agency_code": self.agency_code, "period": period_date.month, "year": period_date.year,
                   "sess": self.sess}

    # Generate the file and put in S3
    write_query_to_file(local_file, self.job.filename, headers, self.job.file_type.letter_name, self.is_local,
                        a_file_query, query_utils)

    log_data['message'] = 'Finished writing to file: {}'.format(self.job.original_filename)
    logger.info(log_data)
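# A minimal sketch (not part of the generator itself) of why generate_a_file shifts the end date by
# three months: the federal fiscal year starts in October, so adding relativedelta(months=3) to a
# calendar date yields its fiscal year and period directly. The example date is illustrative.
from datetime import date
from dateutil.relativedelta import relativedelta

example_end_date = date(2018, 12, 31)                      # calendar Q4 2018 falls in FY2019, period 3
example_period = example_end_date + relativedelta(months=3)
assert (example_period.year, example_period.month) == (2019, 3)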
def generate_f_file(self):
    """ Write rows from fileF.generate_f_file_queries to an appropriate CSV. """
    log_data = {'message': 'Starting file F generation', 'message_type': 'ValidatorInfo', 'job_id': self.job.job_id,
                'submission_id': self.job.submission_id, 'file_type': 'sub_award'}
    logger.info(log_data)

    f_file_contracts_query, f_file_grants_query = fileF.generate_f_file_queries(self.job.submission_id)

    # writing locally first without uploading
    log_data['message'] = 'Writing F file contracts to CSV: {}'.format(self.job.original_filename)
    logger.info(log_data)
    local_f_file = self.job.filename if self.is_local else self.job.original_filename
    write_query_to_file(self.sess, f_file_contracts_query, local_f_file, generate_headers=True,
                        generate_string=False)

    # writing locally again but then uploading
    log_data['message'] = 'Writing F file grants to CSV: {}'.format(self.job.original_filename)
    logger.info(log_data)
    write_stream_query(self.sess, f_file_grants_query, self.job.original_filename, self.job.filename, self.is_local,
                       generate_headers=False, generate_string=False)

    log_data['message'] = 'Finished writing F file CSV: {}'.format(self.job.original_filename)
    logger.info(log_data)
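# Sketch of the header handling generate_f_file relies on: the contracts query is written with
# headers and the grants query is appended without them, so the two result sets share one CSV.
# This standalone version uses the csv module and hypothetical row iterables rather than the
# broker's write_query_to_file/write_stream_query helpers.
import csv

def write_two_result_sets(path, header, contract_rows, grant_rows):
    """Illustrative only: write contracts with a header row, then append grants without repeating it."""
    with open(path, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(header)                 # mirrors generate_headers=True on the first write
        writer.writerows(contract_rows)
    with open(path, 'a', newline='') as out:
        csv.writer(out).writerows(grant_rows)   # mirrors generate_headers=False on the second write

# write_two_result_sets('file_F.csv', ['subaward_number'], [['C-1']], [['G-1']])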
def export_state_congr_table(sess):
    """ Export the current state of the state congressional table to a file and upload to the public S3 bucket

        Args:
            sess: the database connection
    """
    state_congr_filename = 'state_congressional.csv'
    logger.info("Exporting state_congressional table to {}".format(state_congr_filename))
    query = sess.query(StateCongressional.state_code, StateCongressional.congressional_district_no,
                       StateCongressional.census_year).\
        filter(StateCongressional.congressional_district_no.isnot(None))
    write_query_to_file(sess, query, state_congr_filename)

    logger.info("Uploading {} to {}".format(state_congr_filename, CONFIG_BROKER["public_files_bucket"]))
    s3 = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
    s3.upload_file(state_congr_filename, CONFIG_BROKER["public_files_bucket"],
                   'broker_reference_data/state_congressional.csv')
    os.remove(state_congr_filename)
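# Hypothetical invocation sketch for export_state_congr_table, assuming a standard SQLAlchemy
# session pointed at the Broker database; the connection URL and session setup are placeholders,
# not the application's actual wiring.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

def run_state_congressional_export():
    engine = create_engine('postgresql://user:pass@localhost/data_broker')   # placeholder URL
    sess = sessionmaker(bind=engine)()
    try:
        export_state_congr_table(sess)
    finally:
        sess.close()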
def pull_offices(sess, filename, update_db, pull_all, updated_date_from, export_office, metrics):
    """ Pull Office data from the Federal Hierarchy API and update the DB, return it as a file, or both.

        Args:
            sess: Current DB session.
            filename: Name of the file to be generated with the API data. If None, no file will be created.
            update_db: Boolean; update the DB tables with the new data from the API.
            pull_all: Boolean; pull all historical data, instead of just the latest.
            updated_date_from: Date to pull data from. Defaults to the date of the most recently updated Office.
            export_office: when provided, name of the file to export the office list to
            metrics: an object containing information for the metrics file
    """
    logger.info('Starting feed: %s', API_URL.replace(CONFIG_BROKER['sam']['api_key'], '[API_KEY]'))
    top_sub_levels = ['1', '2']
    office_levels = ['3', '4', '5', '6', '7']
    levels = top_sub_levels + office_levels if filename else office_levels

    if filename:
        logger.info('Creating a file ({}) with the data from this pull'.format(filename))
        # Write headers to file
        file_headers = [
            'fhorgid', 'fhorgname', 'fhorgtype', 'description', 'level', 'status', 'region', 'categoryid',
            'effectivestartdate', 'effectiveenddate', 'createdby', 'createddate', 'updatedby', 'lastupdateddate',
            'fhdeptindagencyorgid', 'fhagencyorgname', 'agencycode', 'oldfpdsofficecode', 'aacofficecode',
            'cgaclist_0_cgac', 'cgaclist_1_cgac', 'cgaclist_2_cgac', 'cgaclist_3_cgac', 'cgaclist_4_cgac',
            'fhorgofficetypelist_0_officetype', 'fhorgofficetypelist_0_officetypestartdate',
            'fhorgofficetypelist_0_officetypeenddate', 'fhorgofficetypelist_1_officetype',
            'fhorgofficetypelist_1_officetypestartdate', 'fhorgofficetypelist_1_officetypeenddate',
            'fhorgofficetypelist_2_officetype', 'fhorgofficetypelist_2_officetypestartdate',
            'fhorgofficetypelist_2_officetypeenddate', 'fhorgofficetypelist_3_officetype',
            'fhorgofficetypelist_3_officetypeenddate', 'fhorgofficetypelist_3_officetypestartdate',
            'fhorgaddresslist_0_city', 'fhorgaddresslist_0_state', 'fhorgaddresslist_0_country_code',
            'fhorgaddresslist_0_addresstype', 'fhorgnamehistory_0_fhorgname', 'fhorgnamehistory_0_effectivedate',
            'fhorgparenthistory_0_fhfullparentpathid', 'fhorgparenthistory_0_fhfullparentpathname',
            'fhorgparenthistory_0_effectivedate', 'links_0_href', 'links_0_rel', 'links_1_href', 'links_1_rel',
            'links_2_href', 'links_2_rel']
        with open(filename, 'w+') as f:
            csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            csv_writer.writerow(file_headers)

    empty_pull_count = 0
    for level in levels:
        # Create URL with the level parameter
        url_with_params = '{}&level={}'.format(API_URL, level)

        # Add updateddatefrom and status parameters to the URL
        if not pull_all:
            url_with_params += '&updateddatefrom={}&status=all'.format(updated_date_from)

        # Retrieve the total count of expected records for this pull
        total_expected_records = get_with_exception_hand(url_with_params)['totalrecords']
        metrics['level_{}_records'.format(str(level))] = total_expected_records
        logger.info('{} level-{} record(s) expected'.format(str(total_expected_records), str(level)))
        if total_expected_records == 0:
            empty_pull_count += 1
            continue

        limit = 100
        entries_processed = 0
        while True:
            async def _fed_hierarchy_async_get(entries_already_processed):
                response_list = []
                loop = asyncio.get_event_loop()
                futures = [
                    loop.run_in_executor(
                        None,
                        get_with_exception_hand,
                        '{}&limit={}&offset={}'.format(url_with_params, str(limit),
                                                       str(entries_already_processed + (start_offset * limit)))
                    )
                    for start_offset in range(REQUESTS_AT_ONCE)
                ]
                for response in await asyncio.gather(*futures):
                    response_list.append(response)
                return response_list
            # End async get requests def

            # Retrieve limit*REQUESTS_AT_ONCE records from the API
            loop = asyncio.get_event_loop()
            full_response = loop.run_until_complete(_fed_hierarchy_async_get(entries_processed))

            # Create an object with all the data from the API
            dataframe = pd.DataFrame()
            offices = {}
            inactive_offices = []
            start = entries_processed + 1
            for response_dict in full_response:
                # Process the entry if it isn't an error
                for org in response_dict.get('orglist', []):
                    entries_processed += 1

                    # Add to the file data structure
                    if filename:
                        row = json_normalize(flatten_json(org))
                        dataframe = dataframe.append(row)

                    # Don't process the top_sub_levels, but store them in the fed hierarchy export
                    if level in top_sub_levels:
                        continue

                    # Add to the list of DB objects
                    if update_db:
                        # trim incoming values
                        org = trim_nested_obj(org)

                        # If it's inactive, we don't need all that craziness below, we just need to know which code
                        # to delete
                        if org['status'] == 'INACTIVE':
                            inactive_offices.append(org.get('aacofficecode'))
                            continue

                        agency_code = get_normalized_agency_code(org.get('cgaclist', [{'cgac': None}])[0]['cgac'],
                                                                 org.get('agencycode'))
                        # TEMPORARILY REPLACE Navy, Army, AND Air Force WITH DOD
                        if agency_code in ['017', '021', '057']:
                            agency_code = '097'
                        if not org.get('aacofficecode') or not org.get('agencycode') or not agency_code:
                            # Item from Fed Hierarchy is missing necessary data, ignore it
                            continue

                        # store all the cgacs/subtiers loaded in from this run, to be filtered later
                        metrics['missing_cgacs'].append(agency_code)
                        metrics['missing_subtier_codes'].append(org.get('agencycode'))

                        new_office = Office(office_code=org.get('aacofficecode'), office_name=org.get('fhorgname'),
                                            sub_tier_code=org.get('agencycode'), agency_code=agency_code,
                                            contract_funding_office=False, contract_awards_office=False,
                                            financial_assistance_awards_office=False,
                                            financial_assistance_funding_office=False)

                        for off_type in org.get('fhorgofficetypelist', []):
                            office_type = off_type['officetype'].lower().replace(' ', '_')
                            if office_type in ['contract_funding', 'contract_awards', 'financial_assistance_awards',
                                               'financial_assistance_funding']:
                                setattr(new_office, office_type + '_office', True)
                        offices[org.get('aacofficecode')] = new_office

            if filename and len(dataframe.index) > 0:
                # Ensure headers are handled correctly
                for header in list(dataframe.columns.values):
                    if header not in file_headers:
                        file_headers.append(header)
                        logger.info('Headers missing column: %s', header)

                # Write to file
                with open(filename, 'a') as f:
                    dataframe.to_csv(f, index=False, header=False, columns=file_headers)

            if update_db:
                # combine both lists of offices to determine what offices to delete, only active ones will be re-added
                office_codes = set(offices.keys()).union(set(inactive_offices))
                sess.query(Office).filter(Office.office_code.in_(office_codes)).delete(synchronize_session=False)
                sess.add_all(offices.values())

            logger.info('Processed rows %s-%s', start, entries_processed)
            if entries_processed == total_expected_records:
                # Feed has finished
                break
            if entries_processed > total_expected_records:
                # We have somehow retrieved more records than existed at the beginning of the pull
                logger.error('Total expected records: {}, Number of records retrieved: {}'.format(
                    total_expected_records, entries_processed))
                sys.exit(2)

    if update_db:
        sess.commit()

    if export_office:
        logger.info('Creating a file ({}) with the data from the database'.format(export_office))
        all_offices = sess.query(Office)
        write_query_to_file(sess, all_offices, export_office, generate_headers=True)

    if empty_pull_count == len(levels):
        logger.error('No records retrieved from the Federal Hierarchy API')
        sys.exit(3)

    logger.info('Finished')
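# The export headers above (e.g. 'cgaclist_0_cgac', 'fhorgofficetypelist_1_officetype') imply that
# flatten_json collapses nested dicts and lists into underscore-joined keys before json_normalize
# builds a one-row frame. The helper below is a sketch written from that naming convention, not the
# repo's actual implementation.
def flatten_json_sketch(nested):
    """Illustrative: flatten {'cgaclist': [{'cgac': '097'}]} into {'cgaclist_0_cgac': '097'}."""
    flat = {}

    def _flatten(value, prefix):
        if isinstance(value, dict):
            for key, sub_value in value.items():
                _flatten(sub_value, '{}{}_'.format(prefix, key))
        elif isinstance(value, list):
            for index, item in enumerate(value):
                _flatten(item, '{}{}_'.format(prefix, index))
        else:
            flat[prefix[:-1]] = value

    _flatten(nested, '')
    return flat

assert flatten_json_sketch({'cgaclist': [{'cgac': '097'}]}) == {'cgaclist_0_cgac': '097'}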
def generate_d_file(sess, job, agency_code, is_local=True, old_filename=None):
    """ Write file D1 or D2 to an appropriate CSV.

        Args:
            sess: Current database session
            job: Upload Job
            agency_code: FREC or CGAC code for generation
            is_local: True if in local development, False otherwise
            old_filename: Previous version of filename, in cases where reverting to old file is necessary
    """
    log_data = {'message_type': 'ValidatorInfo', 'job_id': job.job_id, 'file_type': job.file_type.letter_name,
                'agency_code': agency_code, 'start_date': job.start_date, 'end_date': job.end_date}
    if job.submission_id:
        log_data['submission_id'] = job.submission_id

    # find current date and date of last FPDS pull
    current_date = datetime.now().date()
    last_update = sess.query(FPDSUpdate).one_or_none()
    fpds_date = last_update.update_date if last_update else current_date

    # check if FileRequest already exists with this job_id, if not, create one
    file_request = sess.query(FileRequest).filter(FileRequest.job_id == job.job_id).one_or_none()
    if not file_request:
        file_request = FileRequest(request_date=current_date, job_id=job.job_id, start_date=job.start_date,
                                   end_date=job.end_date, agency_code=agency_code, is_cached_file=False,
                                   file_type=job.file_type.letter_name)
        sess.add(file_request)

    # determine if anything needs to be done at all
    exists = file_request.is_cached_file
    if exists and not (job.file_type.letter_name == 'D1' and file_request.request_date < fpds_date):
        # this is the up-to-date cached version of the generated file
        # reset the file names on the upload Job
        log_data['message'] = '{} file has already been generated by this job'.format(job.file_type.letter_name)
        logger.info(log_data)

        filepath = CONFIG_BROKER['broker_files'] if is_local else "".join([str(job.submission_id), "/"])
        job.filename = "".join([filepath, old_filename])
        job.original_filename = old_filename
        job.from_cached = False

        if job.submission_id:
            # reset the file names on the validation job
            val_job = sess.query(Job).filter(Job.submission_id == job.submission_id,
                                             Job.file_type_id == job.file_type_id,
                                             Job.job_type_id == JOB_TYPE_DICT['csv_record_validation']).one_or_none()
            if val_job:
                val_job.filename = "".join([filepath, old_filename])
                val_job.original_filename = old_filename
        sess.commit()
    else:
        # search for potential parent FileRequests
        parent_file_request = None
        if not exists:
            # attempt to retrieve a parent request
            parent_query = sess.query(FileRequest).\
                filter(FileRequest.file_type == job.file_type.letter_name, FileRequest.start_date == job.start_date,
                       FileRequest.end_date == job.end_date, FileRequest.agency_code == agency_code,
                       FileRequest.is_cached_file.is_(True))

            # filter D1 FileRequests by the date of the last FPDS pull
            if job.file_type.letter_name == 'D1':
                parent_query = parent_query.filter(FileRequest.request_date >= fpds_date)

            # mark FileRequest with parent job_id
            parent_file_request = parent_query.one_or_none()
            file_request.parent_job_id = parent_file_request.job_id if parent_file_request else None
        sess.commit()

        if parent_file_request:
            # parent exists; copy parent data to this job
            copy_parent_file_request_data(sess, file_request.job, parent_file_request.job, is_local)
        else:
            # no cached file, or cached file is out-of-date
            log_data['message'] = 'Starting file {} generation'.format(job.file_type.letter_name)
            log_data['file_name'] = job.original_filename
            logger.info(log_data)

            # mark this Job as not from-cache, and mark the FileRequest as the cached version (requested today)
            job.from_cached = False
            file_request.is_cached_file = True
            file_request.request_date = current_date
            sess.commit()

            # actually generate the file
            file_utils = fileD1 if job.file_type.letter_name == 'D1' else fileD2
            local_file = "".join([CONFIG_BROKER['d_file_storage_path'], job.original_filename])
            headers = [key for key in file_utils.mapping]
            query_utils = {"file_utils": file_utils, "agency_code": agency_code, "start": job.start_date,
                           "end": job.end_date, "sess": sess}
            write_query_to_file(local_file, job.filename, headers, job.file_type.letter_name, is_local, d_file_query,
                                query_utils)

            log_data['message'] = 'Finished writing to file: {}'.format(job.original_filename)
            logger.info(log_data)

    log_data['message'] = 'Finished file {} generation'.format(job.file_type.letter_name)
    logger.info(log_data)
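# The cache test above ('exists and not (D1 and request_date < fpds_date)') reduces to a small
# predicate: a cached FileRequest is reusable unless it is a D1 request made before the latest
# FPDS pull. Restated standalone for clarity; the helper name is illustrative.
from datetime import date

def cached_request_is_current(file_type, request_date, fpds_date):
    """Illustrative restatement of the reuse check in generate_d_file."""
    if file_type == 'D1':
        # D1 is built from FPDS data, so a cache entry older than the last FPDS load is stale
        return request_date >= fpds_date
    # D2 has no FPDS dependency in this check
    return True

assert cached_request_is_current('D1', date(2018, 5, 1), date(2018, 5, 2)) is False
assert cached_request_is_current('D2', date(2018, 5, 1), date(2018, 5, 2)) is True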
def pull_offices(sess, filename, update_db, pull_all, updated_date_from, export_office, metrics):
    """ Pull Office data from the Federal Hierarchy API and update the DB, return it as a file, or both.

        Args:
            sess: Current DB session.
            filename: Name of the file to be generated with the API data. If None, no file will be created.
            update_db: Boolean; update the DB tables with the new data from the API.
            pull_all: Boolean; pull all historical data, instead of just the latest.
            updated_date_from: Date to pull data from. Defaults to the date of the most recently updated Office.
            export_office: when provided, name of the file to export the office list to
            metrics: an object containing information for the metrics file
    """
    logger.info('Starting feed: %s', API_URL.replace(CONFIG_BROKER['sam']['federal_hierarchy_api_key'], "[API_KEY]"))
    top_sub_levels = ["1", "2"]
    office_levels = ["3", "4", "5", "6", "7"]
    levels = top_sub_levels + office_levels if filename else office_levels

    if filename:
        logger.info("Creating a file ({}) with the data from this pull".format(filename))
        # Write headers to file
        file_headers = [
            "fhorgid", "fhorgname", "fhorgtype", "description", "level", "status", "region", "categoryid",
            "effectivestartdate", "effectiveenddate", "createdby", "createddate", "updatedby", "lastupdateddate",
            "fhdeptindagencyorgid", "fhagencyorgname", "agencycode", "oldfpdsofficecode", "aacofficecode",
            "cgaclist_0_cgac", "fhorgofficetypelist_0_officetype", "fhorgofficetypelist_0_officetypestartdate",
            "fhorgofficetypelist_0_officetypeenddate", "fhorgofficetypelist_1_officetype",
            "fhorgofficetypelist_1_officetypestartdate", "fhorgofficetypelist_1_officetypeenddate",
            "fhorgofficetypelist_2_officetype", "fhorgofficetypelist_2_officetypestartdate",
            "fhorgofficetypelist_2_officetypeenddate", "fhorgaddresslist_0_city", "fhorgaddresslist_0_state",
            "fhorgaddresslist_0_country_code", "fhorgaddresslist_0_addresstype", "fhorgnamehistory_0_fhorgname",
            "fhorgnamehistory_0_effectivedate", "fhorgparenthistory_0_fhfullparentpathid",
            "fhorgparenthistory_0_fhfullparentpathname", "fhorgparenthistory_0_effectivedate", "links_0_href",
            "links_0_rel", "links_1_href", "links_1_rel", "links_2_href", "links_2_rel"]
        with open(filename, 'w+') as f:
            csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            csv_writer.writerow(file_headers)

    empty_pull_count = 0
    for level in levels:
        # Create URL with the level parameter
        url_with_params = "{}&level={}".format(API_URL, level)

        # Add updateddatefrom parameter to the URL
        if not pull_all:
            url_with_params += "&updateddatefrom={}".format(updated_date_from)

        # Retrieve the total count of expected records for this pull
        total_expected_records = json.loads(requests.get(url_with_params, timeout=60).text)['totalrecords']
        metrics['level_{}_records'.format(str(level))] = total_expected_records
        logger.info('{} level-{} record(s) expected'.format(str(total_expected_records), str(level)))
        if total_expected_records == 0:
            empty_pull_count += 1
            continue

        limit = 100
        entries_processed = 0
        while True:
            async def _fed_hierarchy_async_get(entries_already_processed):
                response_list = []
                loop = asyncio.get_event_loop()
                futures = [
                    loop.run_in_executor(
                        None,
                        get_with_exception_hand,
                        "{}&limit={}&offset={}".format(url_with_params, str(limit),
                                                       str(entries_already_processed + (start_offset * limit)))
                    )
                    for start_offset in range(REQUESTS_AT_ONCE)
                ]
                for response in await asyncio.gather(*futures):
                    response_list.append(response.text)
                return response_list
            # End async get requests def

            # Retrieve limit*REQUESTS_AT_ONCE records from the API
            loop = asyncio.get_event_loop()
            full_response = loop.run_until_complete(_fed_hierarchy_async_get(entries_processed))

            # Create an object with all the data from the API
            dataframe = pd.DataFrame()
            offices = {}
            start = entries_processed + 1
            for next_resp in full_response:
                response_dict = json.loads(next_resp)

                # We get errors back as regular JSON, need to catch them somewhere
                if response_dict.get('error'):
                    err = response_dict.get('error')
                    logger.error("An error of type {} occurred. Message: {}".format(err['code'], err['message']))
                    sys.exit(2)

                # Process the entry if it isn't an error
                for org in response_dict.get('orglist', []):
                    entries_processed += 1

                    # Add to the file data structure
                    if filename:
                        row = json_normalize(flatten_json(org))
                        dataframe = dataframe.append(row)

                    # Don't process the top_sub_levels, but store them in the fed hierarchy export
                    if level in top_sub_levels:
                        continue

                    # Add to the list of DB objects
                    if update_db:
                        agency_code = get_normalized_agency_code(org.get('cgaclist', [{'cgac': None}])[0]['cgac'],
                                                                 org.get('agencycode'))
                        # TEMPORARILY REPLACE Navy, Army, AND Air Force WITH DOD
                        if agency_code in ['017', '021', '057']:
                            agency_code = '097'
                        if not org.get('aacofficecode') or not org.get('agencycode') or not agency_code:
                            # Item from Fed Hierarchy is missing necessary data, ignore it
                            continue
                        new_office = Office(office_code=org.get('aacofficecode'), office_name=org.get('fhorgname'),
                                            sub_tier_code=org.get('agencycode'), agency_code=agency_code,
                                            contract_funding_office=False, contract_awards_office=False,
                                            financial_assistance_awards_office=False,
                                            financial_assistance_funding_office=False)

                        for off_type in org.get('fhorgofficetypelist', []):
                            office_type = off_type['officetype'].lower().replace(" ", "_")
                            if office_type in ['contract_funding', 'contract_awards', 'financial_assistance_awards',
                                               'financial_assistance_funding']:
                                setattr(new_office, office_type + '_office', True)
                        offices[org.get('aacofficecode')] = new_office

            if filename and len(dataframe.index) > 0:
                # Ensure headers are handled correctly
                for header in list(dataframe.columns.values):
                    if header not in file_headers:
                        file_headers.append(header)
                        logger.info("Headers missing column: %s", header)

                # Write to file
                with open(filename, 'a') as f:
                    dataframe.to_csv(f, index=False, header=False, columns=file_headers)

            if update_db:
                office_codes = set(offices.keys())
                sess.query(Office).filter(Office.office_code.in_(office_codes)).delete(synchronize_session=False)
                sess.add_all(offices.values())

            logger.info("Processed rows %s-%s", start, entries_processed)
            if entries_processed == total_expected_records:
                # Feed has finished
                break
            if entries_processed > total_expected_records:
                # We have somehow retrieved more records than existed at the beginning of the pull
                logger.error("Total expected records: {}, Number of records retrieved: {}".format(
                    total_expected_records, entries_processed))
                sys.exit(2)

    if update_db:
        sess.commit()

    if export_office:
        logger.info("Creating a file ({}) with the data from the database".format(export_office))
        all_offices = sess.query(Office)
        write_query_to_file(sess, all_offices, export_office, generate_headers=True)

    if empty_pull_count == len(levels):
        logger.error("No records retrieved from the Federal Hierarchy API")
        sys.exit(3)

    logger.info("Finished")
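# The paging loop above fans REQUESTS_AT_ONCE offset windows out to the default thread pool via
# run_in_executor and gathers the results. A stripped-down version of that pattern, using a plain
# requests call in place of get_with_exception_hand; the constant's value here is illustrative.
import asyncio
import requests

REQUESTS_AT_ONCE_EXAMPLE = 5

def fetch_page(url):
    """Blocking GET used as the executor target."""
    return requests.get(url, timeout=60).json()

async def fetch_batch(base_url, already_processed, limit=100):
    loop = asyncio.get_event_loop()
    futures = [
        loop.run_in_executor(None, fetch_page,
                             '{}&limit={}&offset={}'.format(base_url, limit, already_processed + i * limit))
        for i in range(REQUESTS_AT_ONCE_EXAMPLE)
    ]
    return await asyncio.gather(*futures)

# Usage mirrors the loader: pull limit * REQUESTS_AT_ONCE records per iteration of the while loop.
# responses = asyncio.get_event_loop().run_until_complete(fetch_batch(url_with_params, entries_processed))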
def generate_d_file(file_type, agency_code, start, end, job_id, upload_name, is_local, submission_id=None):
    """ Write file D1 or D2 to an appropriate CSV.

        Args:
            file_type - File type as either "D1" or "D2"
            agency_code - FREC or CGAC code for generation
            start - Beginning of period for D file
            end - End of period for D file
            job_id - Job ID for upload job
            upload_name - File key to use on S3
            is_local - True if in local development, False otherwise
            submission_id - ID of the submission the upload job belongs to, if any
    """
    log_data = {'message_type': 'BrokerInfo', 'job_id': job_id, 'file_type': FILE_TYPE_DICT_LETTER_NAME[file_type],
                'agency_code': agency_code, 'start_date': start, 'end_date': end}
    if submission_id:
        log_data['submission_id'] = submission_id

    with job_context(job_id, is_local) as sess:
        current_date = datetime.now().date()

        # check if FileRequest already exists with this job_id, if not, create one
        file_request = sess.query(FileRequest).filter(FileRequest.job_id == job_id).one_or_none()
        if not file_request:
            file_request = FileRequest(request_date=current_date, job_id=job_id, start_date=start, end_date=end,
                                       agency_code=agency_code, file_type=file_type, is_cached_file=False)
            sess.add(file_request)

        # search for potential parent FileRequests
        parent_file_request = None
        if not file_request.is_cached_file:
            parent_request_query = sess.query(FileRequest).\
                filter(FileRequest.file_type == file_type, FileRequest.start_date == start,
                       FileRequest.end_date == end, FileRequest.agency_code == agency_code,
                       FileRequest.is_cached_file.is_(True))

            # filter D1 FileRequests by the date of the last FPDS pull
            if file_type == 'D1':
                last_update = sess.query(FPDSUpdate).one_or_none()
                fpds_date = last_update.update_date if last_update else current_date
                parent_request_query = parent_request_query.filter(FileRequest.request_date >= fpds_date)

            # mark FileRequest with parent job_id
            parent_file_request = parent_request_query.one_or_none()
            file_request.parent_job_id = parent_file_request.job_id if parent_file_request else None
        sess.commit()

        if file_request.is_cached_file:
            # this is the cached file, no need to do anything
            log_data['message'] = '{} file has already been generated by this job'.format(file_type)
            logger.info(log_data)
        elif parent_file_request:
            # copy parent data to this job if parent is not still running
            if parent_file_request.job.job_status_id != JOB_STATUS_DICT['running']:
                copy_parent_file_request_data(sess, file_request.job, parent_file_request.job, file_type, is_local)
        else:
            # no cached file
            file_name = upload_name.split('/')[-1]
            log_data['message'] = 'Starting file {} generation'.format(file_type)
            log_data['file_name'] = file_name
            logger.info(log_data)

            file_utils = fileD1 if file_type == 'D1' else fileD2
            local_filename = "".join([CONFIG_BROKER['d_file_storage_path'], file_name])
            headers = [key for key in file_utils.mapping]

            # actually generate the file
            query_utils = {"file_utils": file_utils, "agency_code": agency_code, "start": start, "end": end,
                           "sess": sess}
            write_query_to_file(local_filename, upload_name, headers, file_type, is_local, d_file_query, query_utils)

            # mark this FileRequest as the cached version
            file_request.is_cached_file = True
            sess.commit()

            log_data['message'] = 'Finished writing to file: {}'.format(file_name)
            logger.info(log_data)

    log_data['message'] = 'Finished file {} generation'.format(file_type)
    logger.info(log_data)
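# generate_d_file above depends on a job_context manager that yields a DB session scoped to the
# upload job. Its implementation is not shown in this section; the sketch below illustrates what
# such a manager typically does (open a session, record a failure, clean up). The engine URL,
# session factory, and failure handling are placeholders, not the broker's actual code.
from contextlib import contextmanager
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

_example_engine = create_engine('postgresql://user:pass@localhost/data_broker')   # placeholder URL
_ExampleSession = sessionmaker(bind=_example_engine)

@contextmanager
def job_context_sketch(job_id, is_local=True):
    """Illustrative only; the real job_context may differ."""
    sess = _ExampleSession()
    try:
        yield sess
    except Exception:
        logger.exception('Job %s failed during file generation', job_id)
        # a real implementation would mark the upload job as failed here before re-raising
        raise
    finally:
        sess.close()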
def pull_offices(sess, filename, update_db, pull_all, updated_date_from, export_office):
    """ Pull Office data from the Federal Hierarchy API and update the DB, return it as a file, or both.

        Args:
            sess: Current DB session.
            filename: Name of the file to be generated with the API data. If None, no file will be created.
            update_db: Boolean; update the DB tables with the new data from the API.
            pull_all: Boolean; pull all historical data, instead of just the latest.
            updated_date_from: Date to pull data from. Defaults to the date of the most recently updated Office.
            export_office: when provided, name of the file to export the office list to
    """
    logger.info('Starting feed: %s', API_URL.replace(CONFIG_BROKER['sam']['federal_hierarchy_api_key'], "[API_KEY]"))
    top_sub_levels = ["1", "2"]
    office_levels = ["3", "4", "5", "6", "7"]
    levels = top_sub_levels + office_levels if filename else office_levels

    if filename:
        logger.info("Creating a file ({}) with the data from this pull".format(filename))
        # Write headers to file
        file_headers = [
            "fhorgid", "fhorgname", "fhorgtype", "description", "level", "status", "region", "categoryid",
            "effectivestartdate", "effectiveenddate", "createdby", "createddate", "updatedby", "lastupdateddate",
            "fhdeptindagencyorgid", "fhagencyorgname", "agencycode", "oldfpdsofficecode", "aacofficecode",
            "cgaclist_0_cgac", "fhorgofficetypelist_0_officetype", "fhorgofficetypelist_0_officetypestartdate",
            "fhorgofficetypelist_0_officetypeenddate", "fhorgofficetypelist_1_officetype",
            "fhorgofficetypelist_1_officetypestartdate", "fhorgofficetypelist_1_officetypeenddate",
            "fhorgofficetypelist_2_officetype", "fhorgofficetypelist_2_officetypestartdate",
            "fhorgofficetypelist_2_officetypeenddate", "fhorgaddresslist_0_city", "fhorgaddresslist_0_state",
            "fhorgaddresslist_0_country_code", "fhorgaddresslist_0_addresstype", "fhorgnamehistory_0_fhorgname",
            "fhorgnamehistory_0_effectivedate", "fhorgparenthistory_0_fhfullparentpathid",
            "fhorgparenthistory_0_fhfullparentpathname", "fhorgparenthistory_0_effectivedate", "links_0_href",
            "links_0_rel", "links_1_href", "links_1_rel", "links_2_href", "links_2_rel"]
        with open(filename, 'w+') as f:
            csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            csv_writer.writerow(file_headers)

    empty_pull_count = 0
    for level in levels:
        # Create URL with the level parameter
        url_with_params = "{}&level={}".format(API_URL, level)

        # Add updateddatefrom parameter to the URL
        if not pull_all:
            url_with_params += "&updateddatefrom={}".format(updated_date_from)

        # Retrieve the total count of expected records for this pull
        total_expected_records = json.loads(requests.get(url_with_params, timeout=60).text)['totalrecords']
        logger.info('{} level-{} record(s) expected'.format(str(total_expected_records), str(level)))
        if total_expected_records == 0:
            empty_pull_count += 1
            continue

        limit = 100
        entries_processed = 0
        while True:
            async def _fed_hierarchy_async_get(entries_already_processed):
                response_list = []
                loop = asyncio.get_event_loop()
                futures = [
                    loop.run_in_executor(
                        None,
                        get_with_exception_hand,
                        "{}&limit={}&offset={}".format(url_with_params, str(limit),
                                                       str(entries_already_processed + (start_offset * limit)))
                    )
                    for start_offset in range(REQUESTS_AT_ONCE)
                ]
                for response in await asyncio.gather(*futures):
                    response_list.append(response.text)
                return response_list
            # End async get requests def

            # Retrieve limit*REQUESTS_AT_ONCE records from the API
            loop = asyncio.get_event_loop()
            full_response = loop.run_until_complete(_fed_hierarchy_async_get(entries_processed))

            # Create an object with all the data from the API
            dataframe = pd.DataFrame()
            offices = {}
            start = entries_processed + 1
            for next_resp in full_response:
                response_dict = json.loads(next_resp)
                for org in response_dict.get('orglist', []):
                    entries_processed += 1

                    # Add to the file data structure
                    if filename:
                        row = json_normalize(flatten_json(org))
                        dataframe = dataframe.append(row)

                    # Don't process the top_sub_levels, but store them in the fed hierarchy export
                    if level in top_sub_levels:
                        continue

                    # Add to the list of DB objects
                    if update_db:
                        agency_code = get_normalized_agency_code(org.get('cgaclist', [{'cgac': None}])[0]['cgac'],
                                                                 org.get('agencycode'))
                        # TEMPORARILY REPLACE Navy, Army, AND Air Force WITH DOD
                        if agency_code in ['017', '021', '057']:
                            agency_code = '097'
                        if not org.get('aacofficecode') or not org.get('agencycode') or not agency_code:
                            # Item from Fed Hierarchy is missing necessary data, ignore it
                            continue
                        new_office = Office(office_code=org.get('aacofficecode'), office_name=org.get('fhorgname'),
                                            sub_tier_code=org.get('agencycode'), agency_code=agency_code,
                                            funding_office=False, contracting_office=False, grant_office=False)

                        for off_type in org.get('fhorgofficetypelist', []):
                            office_type = off_type['officetype'].lower()
                            if office_type == 'financial assistance':
                                office_type = 'grant'
                            if office_type in ['contracting', 'funding', 'grant']:
                                setattr(new_office, office_type + '_office', True)
                        offices[org.get('aacofficecode')] = new_office

            if filename and len(dataframe.index) > 0:
                # Ensure headers are handled correctly
                for header in list(dataframe.columns.values):
                    if header not in file_headers:
                        file_headers.append(header)
                        logger.info("Headers missing column: %s", header)

                # Write to file
                with open(filename, 'a') as f:
                    dataframe.to_csv(f, index=False, header=False, columns=file_headers)

            if update_db:
                office_codes = set(offices.keys())
                sess.query(Office).filter(Office.office_code.in_(office_codes)).delete(synchronize_session=False)
                sess.add_all(offices.values())

            logger.info("Processed rows %s-%s", start, entries_processed)
            if entries_processed == total_expected_records:
                # Feed has finished
                break
            if entries_processed > total_expected_records:
                # We have somehow retrieved more records than existed at the beginning of the pull
                logger.error("Total expected records: {}, Number of records retrieved: {}".format(
                    total_expected_records, entries_processed))
                sys.exit(2)

    if update_db:
        sess.commit()

    if export_office:
        logger.info("Creating a file ({}) with the data from the database".format(export_office))
        all_offices = sess.query(Office)
        write_query_to_file(sess, all_offices, export_office, generate_headers=True)

    if empty_pull_count == len(levels):
        logger.error("No records retrieved from the Federal Hierarchy API")
        sys.exit(3)

    logger.info("Finished")
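# The Office flags set above boil down to a small normalization step: lowercase the API's
# officetype, treat 'financial assistance' as 'grant', and only honor the three known types.
# Restated standalone below; the helper name is illustrative.
def office_type_flags(fhorgofficetypelist):
    """Illustrative: derive the three boolean office flags from an fhorgofficetypelist entry list."""
    flags = {'funding_office': False, 'contracting_office': False, 'grant_office': False}
    for off_type in fhorgofficetypelist or []:
        office_type = off_type['officetype'].lower()
        if office_type == 'financial assistance':
            office_type = 'grant'
        if office_type in ('contracting', 'funding', 'grant'):
            flags[office_type + '_office'] = True
    return flags

assert office_type_flags([{'officetype': 'Financial Assistance'}]) == \
    {'funding_office': False, 'contracting_office': False, 'grant_office': True}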