def setUpClass(cls):
    """Set up resources to be shared within a test class"""
    # TODO: refactor into pytest class fixtures and inject as necessary

    # Update the application's db config options so unit tests
    # run against test databases.
    configure_logging()
    suite = cls.__name__.lower()

    config = dataactcore.config.CONFIG_DB
    cls.num = randint(1, 9999)
    config['db_name'] = 'unittest{}_{}_data_broker'.format(cls.num, suite)
    dataactcore.config.CONFIG_DB = config
    create_database(CONFIG_DB['db_name'])
    run_migrations()

    app = create_app()
    app.config['TESTING'] = True
    app.config['DEBUG'] = False
    cls.app = TestApp(app)

    # Allow us to augment the default test failure message with more detail
    cls.longMessage = True
    # Upload files to S3 (False = skip re-uploading on subsequent runs)
    cls.uploadFiles = True
    # Run tests for the local broker or not
    cls.local = CONFIG_BROKER['local']
    # This needs to be set to the local directory for error reports if local is True
    cls.local_file_directory = CONFIG_SERVICES['error_report_path']

    # Drop and re-create test job db/tables
    setup_job_tracker_db()
    # Drop and re-create test error db/tables
    setup_error_db()
    # Drop and re-create test validation db
    setup_validation_db()

    cls.userId = None
    # Constants to use for default submission start and end dates
    cls.SUBMISSION_START_DEFAULT = datetime(2015, 10, 1)
    cls.SUBMISSION_END_DEFAULT = datetime(2015, 10, 31)
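
# Each class creates a randomly named unittest database, so a matching tearDownClass
# should drop it once the class finishes. A minimal sketch, assuming drop_database is
# the sqlalchemy_utils counterpart of the create_database call above and that GlobalDB
# exposes a close() helper for the shared session:
@classmethod
def tearDownClass(cls):
    """Drop the per-class test database created in setUpClass"""
    GlobalDB.close()                      # assumed helper: release pooled connections first
    drop_database(CONFIG_DB['db_name'])   # sqlalchemy_utils.drop_database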
# insert status types
for s in lookups.JOB_STATUS:
    status = JobStatus(job_status_id=s.id, name=s.name, description=s.desc)
    sess.merge(status)

# insert job types
for t in lookups.JOB_TYPE:
    this_type = JobType(job_type_id=t.id, name=t.name, description=t.desc)
    sess.merge(this_type)

# insert publish statuses
for ps in lookups.PUBLISH_STATUS:
    status = PublishStatus(publish_status_id=ps.id, name=ps.name, description=ps.desc)
    sess.merge(status)

# insert file types
for ft in lookups.FILE_TYPE:
    file_type = FileType(
        file_type_id=ft.id, name=ft.name, description=ft.desc,
        letter_name=ft.letter, file_order=ft.order
    )
    sess.merge(file_type)


if __name__ == '__main__':
    configure_logging()
    setup_job_tracker_db()
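
# Because merge() performs an upsert keyed on the primary key, re-running this seeding
# script is idempotent: existing lookup rows are updated in place rather than tripping
# duplicate-key errors. A minimal sketch of how setup_job_tracker_db could obtain the
# session the loops above use, assuming the GlobalDB/create_app helpers seen in the
# other snippets:
def setup_job_tracker_db():
    """Seed the job tracker lookup tables; safe to run repeatedly thanks to merge()"""
    with create_app().app_context():
        sess = GlobalDB.db().session
        # ... the merge loops above run here against this session ...
        sess.commit()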
    'award_procurement': {
        'staging_table': AwardProcurement,
        'certified_table': CertifiedAwardProcurement,
        'staging_id': 'award_procurement_id',
        'certified_id': 'certified_award_procurement_id',
        'file_type_id': FILE_TYPE_DICT['award_procurement']
    },
    'award_financial_assistance': {
        'staging_table': AwardFinancialAssistance,
        'certified_table': CertifiedAwardFinancialAssistance,
        'staging_id': 'award_financial_assistance_id',
        'certified_id': 'certified_award_financial_assistance_id',
        'file_type_id': FILE_TYPE_DICT['award']
    }
}

for award_type, award_dict in aw_data_map.items():
    copy_certified_submission_award_data(award_dict['staging_table'], award_dict['certified_table'],
                                         award_dict['staging_id'])
    load_updated_award_data(award_dict['staging_table'], award_dict['certified_table'],
                            award_dict['file_type_id'],
                            shared_internal_cols + [award_dict['certified_id']])


if __name__ == '__main__':
    configure_logging()
    with create_app().app_context():
        main()
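
# For context, copy_certified_submission_award_data presumably copies staging rows into
# the matching certified table. A hypothetical sketch of that pattern using SQLAlchemy's
# insert-from-select; copy_award_rows and its signature are illustrative only, not the
# project's actual helper:
from sqlalchemy import insert, select

def copy_award_rows(sess, staging_table, certified_table, staging_id):
    """Copy every staging row into the certified table, excluding the staging pk"""
    cols = [c.name for c in staging_table.__table__.columns if c.name != staging_id]
    stmt = insert(certified_table.__table__).from_select(
        cols, select([getattr(staging_table, c) for c in cols]))
    sess.execute(stmt)
    sess.commit()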
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    local_now = datetime.now()
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        # Use a context manager so the file handle is closed after writing
        with open(filename, 'wb') as f:
            f.write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)

    model = CFDAProgram
    metrics_json = {
        'script_name': 'load_cfda_data.py',
        'start_time': str(local_now),
        'new_records': 0
    }

    def fix_program_number(row, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(row['program_number'] * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(import_data, model, DATA_CLEANING_MAP, {})
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        table_name = model.__table__.name

        # Check if there is new data to load
        new_data = check_dataframe_diff(import_data, model, ['cfda_program_id'], ['program_number'],
                                        lambda_funcs=[('program_number', fix_program_number)])

        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

            # If we've updated the data at all, update the external data load date
            update_external_data_load_date(local_now, datetime.now(), 'cfda')

    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['new_records'] = num
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)

    metrics_json['duration'] = str(datetime.now() - local_now)

    with open('load_cfda_data_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
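
# Why program_number needs the lambda above: postgres returns it as a Decimal that loses
# precision in the dataframe (e.g. 10.001 comes back as 10.000999999999999), so both sides
# are normalized to a zero-padded string before the diff. A worked example of the
# rounding-and-padding behavior, assuming the nested helper were lifted to module level:
assert fix_program_number({'program_number': 10.000999999999999}) == '10.001'
assert fix_program_number({'program_number': 10.0}) == '10.000'   # right-padded to 6 chars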
def load_cfda_program(base_path):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files
    """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url(
            'get_object',
            {'Bucket': CONFIG_BROKER['sf_133_bucket'], 'Key': "cfda_program.csv"},
            ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "cfda_program.csv")
    logger.info('Loading CFDA program file: cfda_program.csv')

    model = CFDAProgram

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session
        # Delete and replace the existing CFDA program values
        sess.query(model).delete()

        data = pd.read_csv(filename, dtype=str, encoding='latin1')
        data = clean_data(
            data,
            model,
            {
                "program_title": "program_title",
                "program_number": "program_number",
                "popular_name_(020)": "popular_name",
                "federal_agency_(030)": "federal_agency",
                "authorization_(040)": "authorization",
                "objectives_(050)": "objectives",
                "types_of_assistance_(060)": "types_of_assistance",
                "uses_and_use_restrictions_(070)": "uses_and_use_restrictions",
                "applicant_eligibility_(081)": "applicant_eligibility",
                "beneficiary_eligibility_(082)": "beneficiary_eligibility",
                "credentials/documentation_(083)": "credentials_documentation",
                "preapplication_coordination_(091)": "preapplication_coordination",
                "application_procedures_(092)": "application_procedures",
                "award_procedure_(093)": "award_procedure",
                "deadlines_(094)": "deadlines",
                "range_of_approval/disapproval_time_(095)": "range_of_approval_disapproval_time",
                "appeals_(096)": "appeals",
                "renewals_(097)": "renewals",
                "formula_and_matching_requirements_(101)": "formula_and_matching_requirements",
                "length_and_time_phasing_of_assistance_(102)": "length_and_time_phasing_of_assistance",
                "reports_(111)": "reports",
                "audits_(112)": "audits",
                "records_(113)": "records",
                "account_identification_(121)": "account_identification",
                "obligations_(122)": "obligations",
                "range_and_average_of_financial_assistance_(123)": "range_and_average_of_financial_assistance",
                "program_accomplishments_(130)": "program_accomplishments",
                "regulations__guidelines__and_literature_(140)": "regulations_guidelines_and_literature",
                "regional_or__local_office_(151)": "regional_or_local_office",
                "headquarters_office_(152)": "headquarters_office",
                "website_address_(153)": "website_address",
                "related_programs_(160)": "related_programs",
                "examples_of_funded_projects_(170)": "examples_of_funded_projects",
                "criteria_for_selecting_proposals_(180)": "criteria_for_selecting_proposals",
                "url": "url",
                "recovery": "recovery",
                "omb_agency_code": "omb_agency_code",
                "omb_bureau_code": "omb_bureau_code",
                "published_date": "published_date",
                "archived_date": "archived_date"
            },
            {})
        data["published_date"] = format_date(data["published_date"])
        data["archived_date"] = format_date(data["archived_date"])

        # insert to db
        table_name = model.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
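
# A hedged sketch of invoking this loader as a script, mirroring the __main__ blocks in
# the other snippets; the CONFIG_BROKER['path'] config layout is an assumption:
if __name__ == '__main__':
    configure_logging()
    load_cfda_program(os.path.join(CONFIG_BROKER['path'], 'dataactvalidator', 'config'))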
def setUpClass(cls):
    """Set up resources to be shared within a test class"""
    # TODO: refactor into pytest class fixtures and inject as necessary

    # Update the application's db config options so unit tests
    # run against test databases.
    configure_logging()
    suite = cls.__name__.lower()

    config = dataactcore.config.CONFIG_DB
    cls.num = randint(1, 9999)
    config['db_name'] = 'unittest{}_{}_data_broker'.format(cls.num, suite)
    dataactcore.config.CONFIG_DB = config
    create_database(CONFIG_DB['db_name'])
    run_migrations()

    app = create_app()
    app.config['TESTING'] = True
    app.config['DEBUG'] = False
    cls.app = TestApp(app)
    sess = GlobalDB.db().session

    # Set up default e-mails for tests (credential values elided in this snippet)
    test_users = {
        'admin_user': '******',
        'agency_user': '******',
        'agency_user_2': '*****@*****.**',
        'no_permissions_user': '******',
        'editfabs_user': '******'
    }
    admin_password = '******'

    cgac = CGAC(cgac_code='000', agency_name='Example Agency')
    sess.add(cgac)
    sess.commit()

    # Allow us to augment the default test failure message with more detail
    cls.longMessage = True
    # Upload files to S3 (False = skip re-uploading on subsequent runs)
    cls.uploadFiles = True
    # Run tests for the local broker or not
    cls.local = CONFIG_BROKER['local']
    # This needs to be set to the local directory for error reports if local is True
    cls.local_file_directory = CONFIG_SERVICES['error_report_path']

    # Drop and re-create test job db/tables
    setup_job_tracker_db()
    # Drop and re-create test error db/tables
    setup_error_db()
    # Drop and re-create test validation db
    setup_validation_db()

    # Set up the schema and load the SQL-based validation rules
    SchemaLoader.load_all_from_path(validator_config_path)
    load_sql_rules()

    create_user_with_password(test_users["admin_user"], admin_password, Bcrypt(), website_admin=True)

    cls.userId = None
    cls.test_users = test_users
    # Constants to use for default submission start and end dates
    cls.SUBMISSION_START_DEFAULT = datetime(2015, 10, 1)
    cls.SUBMISSION_END_DEFAULT = datetime(2015, 10, 31)
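
# A minimal sketch of a test method that leans on these fixtures; the query shape is an
# assumption, but the seeded values come straight from setUpClass above:
def test_cgac_seeded(self):
    sess = GlobalDB.db().session
    cgac = sess.query(CGAC).filter_by(cgac_code='000').one_or_none()
    self.assertIsNotNone(cgac, 'Example Agency should have been seeded in setUpClass')
    self.assertEqual(cgac.agency_name, 'Example Agency')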
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        # Use a context manager so the file handle is closed after writing
        with open(filename, 'wb') as f:
            f.write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)

    model = CFDAProgram

    def fix_program_number(n, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(n * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session
        now = datetime.utcnow()

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(import_data, model, DATA_CLEANING_MAP, {})
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        import_dataframe = import_data.copy(deep=True)
        # To do the comparison, first we need to mock the pk column that postgres creates.
        # We'll set it universally to 1.
        import_dataframe = import_dataframe.assign(cfda_program_id=1, created_at=now, updated_at=now)

        table_name = model.__table__.name
        current_data = pd.read_sql_table(table_name, sess.connection(), coerce_float=False)
        # Now we need to overwrite the db's audit dates in the created dataframe, and
        # also set all the pks to 1, so they match.
        current_data = current_data.assign(cfda_program_id=1, created_at=now, updated_at=now)

        # pandas comparison requires everything to be in the same order
        current_data.sort_values('program_number', inplace=True)
        import_dataframe.sort_values('program_number', inplace=True)

        # columns too
        cols = import_dataframe.columns.tolist()
        cols.sort()
        import_dataframe = import_dataframe[cols]

        cols = current_data.columns.tolist()
        cols.sort()
        current_data = current_data[cols]

        # Reset the indexes now that we've done all this sorting, so that they match
        import_dataframe.reset_index(drop=True, inplace=True)
        current_data.reset_index(drop=True, inplace=True)

        # My favorite part: when pandas pulls the data out of postgres, the program_number
        # column is a Decimal. However, in adding it to the dataframe, this column loses
        # precision. For example, a program number of 10.001 imports into the dataframe as
        # 10.000999999999999. It also needs to be cast to a string and padded with the right
        # number of zeroes, as needed.
        current_data['program_number'] = current_data['program_number'].apply(lambda x: fix_program_number(x))

        # Finally, equals() returns True only if the data truly has not changed since the
        # last time the CSV was loaded, so new_data is its negation.
        new_data = not import_dataframe.equals(current_data)

        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)
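
# Why all the sorting and reset_index above: DataFrame.equals is both order- and
# index-sensitive, so identical content in a different row order compares unequal.
# A tiny standalone demo:
import pandas as pd

a = pd.DataFrame({'program_number': ['10.001', '10.002']})
b = a.iloc[::-1]                  # same rows, reversed order (and reversed index)
assert not a.equals(b)            # unequal purely because of ordering
b = b.sort_values('program_number').reset_index(drop=True)
assert a.equals(b)                # equal again after sorting and resetting the index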
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        # Use a context manager so the file handle is closed after writing
        with open(filename, 'wb') as f:
            f.write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)

    model = CFDAProgram

    def fix_program_number(n, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(n * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session
        now = datetime.utcnow()

        import_data = pd.read_csv(filename, dtype=str, encoding='latin1', na_filter=False)
        import_data = clean_data(import_data, model, DATA_CLEANING_MAP, {})
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        import_dataframe = import_data.copy(deep=True)
        # To do the comparison, first we need to mock the pk column that postgres creates.
        # We'll set it universally to 1.
        import_dataframe = import_dataframe.assign(cfda_program_id=1, created_at=now, updated_at=now)

        table_name = model.__table__.name
        current_data = pd.read_sql_table(table_name, sess.connection(), coerce_float=False)
        # Now we need to overwrite the db's audit dates in the created dataframe, and
        # also set all the pks to 1, so they match.
        current_data = current_data.assign(cfda_program_id=1, created_at=now, updated_at=now)

        # pandas comparison requires everything to be in the same order
        current_data.sort_values('program_number', inplace=True)
        import_dataframe.sort_values('program_number', inplace=True)

        # columns too
        cols = import_dataframe.columns.tolist()
        cols.sort()
        import_dataframe = import_dataframe[cols]

        cols = current_data.columns.tolist()
        cols.sort()
        current_data = current_data[cols]

        # Reset the indexes now that we've done all this sorting, so that they match
        import_dataframe.reset_index(drop=True, inplace=True)
        current_data.reset_index(drop=True, inplace=True)

        # My favorite part: when pandas pulls the data out of postgres, the program_number
        # column is a Decimal. However, in adding it to the dataframe, this column loses
        # precision. For example, a program number of 10.001 imports into the dataframe as
        # 10.000999999999999. It also needs to be cast to a string and padded with the right
        # number of zeroes, as needed.
        current_data['program_number'] = current_data['program_number'].apply(lambda x: fix_program_number(x))

        # Finally, equals() returns True only if the data truly has not changed since the
        # last time the CSV was loaded, so new_data is its negation.
        new_data = not import_dataframe.equals(current_data)

        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)
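
# A later revision of this loader (see the check_dataframe_diff variant earlier in this
# section) factors the comparison into a reusable helper. A minimal sketch of that idea;
# dataframes_differ, its signature, and the normalizers parameter are illustrative only,
# not the project's actual helper:
def dataframes_differ(import_df, current_df, pk_col, normalizers=None):
    """Return True when freshly imported data differs from the table's current contents"""
    now = datetime.utcnow()
    # Neutralize the pk and audit columns on both sides so they can't cause false diffs
    left = import_df.assign(**{pk_col: 1}, created_at=now, updated_at=now)
    right = current_df.assign(**{pk_col: 1}, created_at=now, updated_at=now)
    # Apply per-column normalizers (e.g. the program_number rounding) to the db side
    for col, func in (normalizers or []):
        right[col] = right[col].apply(func)
    # Align column order, row order, and indexes before the element-wise comparison
    left = left[sorted(left.columns)].sort_values(sorted(left.columns)).reset_index(drop=True)
    right = right[sorted(right.columns)].sort_values(sorted(right.columns)).reset_index(drop=True)
    return not left.equals(right)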