def retrieve_cached_file_generation(job, agency_type, agency_code):
    """ Retrieves a cached FileGeneration for the D file request, if there is one.

        Args:
            job: Upload Job for the generation file
            agency_type: Type of Agency to generate files by: "awarding" or "funding"
            agency_code: Agency code to generate file for

        Returns:
            FileGeneration object matching the criteria, or None
    """
    sess = GlobalDB.db().session
    logger.info({'message': 'Checking for a cached FileGeneration to pull file from', 'message_type': 'BrokerInfo',
                 'submission_id': job.submission_id, 'job_id': job.job_id, 'file_type': job.file_type.letter_name})

    # find current date and date of last FPDS pull
    current_date = datetime.now().date()
    last_update = sess.query(FPDSUpdate).one_or_none()
    fpds_date = last_update.update_date if last_update else current_date

    # check if a cached FileGeneration already exists using these criteria
    file_generation = None
    file_gen = sess.query(FileGeneration).filter(
        FileGeneration.start_date == job.start_date, FileGeneration.end_date == job.end_date,
        FileGeneration.agency_code == agency_code, FileGeneration.agency_type == agency_type,
        FileGeneration.file_type == job.file_type.letter_name, FileGeneration.is_cached_file.is_(True)).one_or_none()

    if file_gen and (file_gen.file_type == 'D1' and file_gen.request_date < fpds_date):
        # Uncache expired D1 FileGeneration
        file_gen.is_cached_file = False
        sess.commit()
    elif file_gen:
        file_generation = file_gen

    return file_generation
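# Caller-side sketch (hedged): 'awarding' and '000' are placeholder arguments. A
# cache hit means the previously generated file can be reused; None means the
# caller should fall through to a fresh generation.
def generate_or_reuse(job):
    cached = retrieve_cached_file_generation(job, 'awarding', '000')
    if cached is None:
        pass  # start a new generation here and cache the result
    return cached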
def check_generation_prereqs(submission_id, file_type):
    """ Make sure the prerequisite jobs for this file type are complete without errors.

        Args:
            submission_id: the submission id for which we're checking file generation prerequisites
            file_type: the type of file being generated

        Returns:
            A boolean indicating if the job has no incomplete prerequisites (True if the job is clear to start)
    """
    sess = GlobalDB.db().session
    prereq_query = sess.query(Job).filter(Job.submission_id == submission_id,
                                          or_(Job.job_status_id != lookups.JOB_STATUS_DICT['finished'],
                                              Job.number_of_errors > 0))

    # Check cross-file validation if generating E or F
    if file_type in ['E', 'F']:
        unfinished_prereqs = prereq_query.filter(Job.job_type_id == lookups.JOB_TYPE_DICT['validation']).count()
    # Check A, B, C files if generating a D file
    elif file_type in ['D1', 'D2']:
        unfinished_prereqs = prereq_query.filter(Job.file_type_id.in_(
            [lookups.FILE_TYPE_DICT['appropriations'], lookups.FILE_TYPE_DICT['program_activity'],
             lookups.FILE_TYPE_DICT['award_financial']])).count()
    else:
        raise ResponseException('Invalid type for file generation', StatusCode.CLIENT_ERROR)

    return unfinished_prereqs == 0
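# Hedged usage sketch: gate a generation request on the helper above. Assumes a
# Flask app context with GlobalDB configured; submission_id is a placeholder, and
# the error message mirrors the one used by the generate_file handler further down.
def safe_to_generate(submission_id, file_type):
    if not check_generation_prereqs(submission_id, file_type):
        # prerequisite validation jobs are unfinished or contain errors
        raise ResponseException('Must wait for completion of prerequisite validation job',
                                StatusCode.CLIENT_ERROR)
    return True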
def create_generation_job(file_type_name, start_date, end_date):
    """ Add details to jobs for generating files

        Args:
            file_type_name: the name of the file type being generated
            start_date: the start date for the generation job, only used for detached files
            end_date: the end date for the generation job, only used for detached files

        Returns:
            the file generation job
    """
    sess = GlobalDB.db().session

    # Create a new job for a detached generation
    job = Job(job_type_id=lookups.JOB_TYPE_DICT['file_upload'], user_id=g.user.user_id,
              file_type_id=lookups.FILE_TYPE_DICT[file_type_name], start_date=start_date, end_date=end_date)
    sess.add(job)

    # Update the job details
    job.message = None
    job.job_status_id = lookups.JOB_STATUS_DICT["ready"]
    sess.commit()
    sess.refresh(job)

    return job
def set_skip_guide(self):
    """ Set current user's skip guide parameter

        Returns:
            JsonResponse object containing results of setting the skip guide or details of the error that occurred.
            Possible errors include the request not containing a skip_guide parameter or it not being a boolean value
    """
    sess = GlobalDB.db().session
    request_dict = RequestDictionary.derive(self.request)
    try:
        if 'skip_guide' not in request_dict:
            raise ResponseException("Must include skip_guide parameter", StatusCode.CLIENT_ERROR)

        skip_guide = str(request_dict['skip_guide']).lower()
        if skip_guide not in ("true", "false"):
            raise ResponseException("skip_guide must be true or false", StatusCode.CLIENT_ERROR)

        g.user.skip_guide = skip_guide == "true"
    except ResponseException as exc:
        return JsonResponse.error(exc, exc.status)

    sess.commit()
    return JsonResponse.create(StatusCode.OK, {"message": "skip_guide set successfully", "skip_guide": skip_guide})
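# Illustrative request/response pair for the handler above (shape derived from the
# validation logic; the route path is an assumption, not defined here):
#
#   POST /v1/set_skip_guide    {"skip_guide": "true"}
#   200 -> {"message": "skip_guide set successfully", "skip_guide": "true"}
#
# Any other value (e.g. "yes" or 1) trips the "must be true or false" CLIENT_ERROR.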
def main():
    sess = GlobalDB.db().session

    # delete any data in the PublishedAwardFinancialAssistance table
    logger.info('deleting PublishedAwardFinancialAssistance data')
    sess.query(PublishedAwardFinancialAssistance).delete(synchronize_session=False)
    sess.commit()

    # raw string keeps the \d escapes intact; literal dots in the extension are escaped
    fabs_re = r'^\d{4}_All_(DirectPayments|Grants|Insurance|Loans|Other)_Full_\d{8}\.csv\.zip'
    if CONFIG_BROKER["use_aws"]:
        s3connection = boto.s3.connect_to_region(CONFIG_BROKER['aws_region'])
        s3bucket = s3connection.lookup(CONFIG_BROKER['archive_bucket'])
        for key in s3bucket.list():
            if re.match(fabs_re, key.name):
                file_path = key.generate_url(expires_in=600)
                parse_fabs_file(urllib.request.urlopen(file_path), sess)
    else:
        base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "fabs")
        file_list = [f for f in os.listdir(base_path)]
        for file in file_list:
            if re.match(fabs_re, file):
                parse_fabs_file(open(os.path.join(base_path, file)), sess)

    set_active_rows(sess)
    logger.info("Historical FABS script complete")
def read_zips():
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete old values in case something changed and one is now invalid
        sess.query(Zips).delete(synchronize_session=False)
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            s3connection = boto.s3.connect_to_region(CONFIG_BROKER['aws_region'])
            s3bucket = s3connection.lookup(CONFIG_BROKER['sf_133_bucket'])
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            for key in s3bucket.list(prefix=zip_folder):
                if key.name != zip_folder:
                    zip_4_file_path = key.generate_url(expires_in=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3bucket.get_key("ctystate.txt").generate_url(expires_in=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config",
                                     CONFIG_BROKER["zip_folder"])
            # creating the list while ignoring hidden files on mac
            file_list = [f for f in os.listdir(base_path) if not re.match(r'^\.', f)]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

        logger.info("Zipcode script complete")
def load_frec(file_name):
    """ Load FREC (high-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
    """
    sess = GlobalDB.db().session
    models = {frec.frec_code: frec for frec in sess.query(FREC)}

    # read FREC values from csv
    data = pd.read_csv(file_name, dtype=str)

    # clean data
    data = clean_data(
        data,
        FREC,
        {"frec": "frec_code", "cgac_agency_code": "cgac_code", "frec_entity_description": "agency_name",
         "agency_abbreviation": "agency_abbreviation"},
        {"frec": {"keep_null": False}, "cgac_code": {"pad_to_length": 3}, "frec_code": {"pad_to_length": 4}}
    )
    # de-dupe
    data.drop_duplicates(subset=['frec_code'], inplace=True)
    # create foreign key dicts
    cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for
                 cgac in sess.query(CGAC).filter(CGAC.cgac_code.in_(data["cgac_code"])).all()}

    # insert to db
    delete_missing_frecs(models, data)
    update_frecs(models, data, cgac_dict)

    sess.add_all(models.values())
    sess.commit()

    logger.info('%s FREC records inserted', len(models))
def setUpClass(cls):
    """Set up class-wide resources like submissions and jobs."""
    super(UserTests, cls).setUpClass()

    with create_app().app_context():
        sess = GlobalDB.db().session

        # Add submissions for agency user
        sess.query(Submission).filter(Submission.user_id == cls.agency_user_id).delete()
        sess.commit()
        for i in range(0, 6):
            sub = Submission(user_id=cls.agency_user_id)
            sub.reporting_start_date = datetime(2015, 10, 1)
            sub.reporting_end_date = datetime(2015, 12, 31)
            sub.cgac_code = cls.admin_cgac_code
            sess.add(sub)
            sess.commit()
            if i == 0:
                cls.submission_id = sub.submission_id

        # Add job to first submission
        job = Job(
            submission_id=cls.submission_id,
            job_status_id=JOB_STATUS_DICT['running'],
            job_type_id=JOB_TYPE_DICT['file_upload'],
            file_type_id=FILE_TYPE_DICT['appropriations']
        )
        sess.add(job)
        sess.commit()
        cls.uploadId = job.job_id
def setup_emails():
    """Create email templates from model metadata."""
    with create_app().app_context():
        sess = GlobalDB.db().session

        # insert email template types
        type_list = [
            ('review_submission', '')
        ]
        for t in type_list:
            email_id = sess.query(
                EmailTemplateType.email_template_type_id).filter(
                EmailTemplateType.name == t[0]).one_or_none()
            if not email_id:
                email_type = EmailTemplateType(name=t[0], description=t[1])
                sess.add(email_type)
        sess.commit()

        # insert email templates

        # Submission Review
        template = ("[REV_USER_NAME] has shared a DATA Act broker submission with you from [REV_AGENCY]. Click "
                    "<a href='[REV_URL]'>here</a> to review their submission. For questions or comments, please "
                    "visit the Service Desk at https://servicedesk.usaspending.gov/ or e-mail [email protected].")
        load_email_template(sess, "DATA Act Broker - Submission Ready for Review", template, "review_submission")
def update_offices(csv_path):
    """ Load office data from the provided CSV and replace/insert any office lookups.

        Args:
            csv_path: path/url to the file to read from
    """
    sess = GlobalDB.db().session
    data = clean_office(csv_path)
    add_existing_id(data)

    old_data = data[data['existing_id'].notnull()]
    del old_data['existing_id']

    new_data = data[data['existing_id'].isnull()]
    del new_data['existing_id']

    # instead of using the pandas to_sql dataframe method like some of the other domain load processes, iterate
    # through the dataframe rows so we can load using the orm model (note: toyed with the SQLAlchemy bulk load
    # options but ultimately decided not to go outside the unit of work for the sake of a performance gain)
    for _, row in old_data.iterrows():
        sess.query(FPDSContractingOffice).filter_by(contracting_office_code=row['contracting_office_code'])\
            .update(row, synchronize_session=False)

    for _, row in new_data.iterrows():
        sess.add(FPDSContractingOffice(**row))

    sess.commit()
    logger.info('%s records in CSV, %s existing', len(data.index), sum(data['existing_id'].notnull()))
def submission_procurements(submission_id):
    """Fetch procurements and subcontracts"""
    sess = GlobalDB.db().session

    logger.debug('Starting submission procurements')

    award_proc_sub = sess.query(AwardProcurement.piid, AwardProcurement.parent_award_id,
                                AwardProcurement.naics_description, AwardProcurement.awarding_sub_tier_agency_c,
                                AwardProcurement.submission_id).\
        filter(AwardProcurement.submission_id == submission_id).distinct().cte("award_proc_sub")

    results = sess.query(award_proc_sub, FSRSProcurement, FSRSSubcontract).\
        filter(FSRSProcurement.contract_number == award_proc_sub.c.piid).\
        filter(FSRSProcurement.idv_reference_number.isnot_distinct_from(award_proc_sub.c.parent_award_id)).\
        filter(FSRSProcurement.contracting_office_aid == award_proc_sub.c.awarding_sub_tier_agency_c).\
        filter(FSRSSubcontract.parent_id == FSRSProcurement.id)

    # The cte returns a set of columns, not an AwardProcurement object, so we have to unpack each column
    for award_piid, award_parent_id, award_naics_desc, award_sub_tier, award_sub_id, proc, sub in results:
        # need to combine those columns again here so we can get a proper ModelRow
        award = AwardProcurement(piid=award_piid, parent_award_id=award_parent_id,
                                 naics_description=award_naics_desc, awarding_sub_tier_agency_c=award_sub_tier,
                                 submission_id=award_sub_id)
        yield ModelRow(award, proc, sub, naics_desc=award.naics_description)

    logger.debug('Finished submission procurements')
def main():
    sess = GlobalDB.db().session

    start = datetime.datetime.now()
    logger.info("FPDS IDV delete started")

    # get and read the file
    del_file = get_delete_file()
    data = pd.read_csv(del_file, dtype=str, encoding='utf_8_sig')

    # Clean up the data so it's usable
    data = clean_delete_data(data)

    # Gather list of records to delete
    gather_start = datetime.datetime.now()
    logger.info("Starting gathering of records to delete.")
    delete_list, delete_dict = get_deletes(sess, data)
    gather_end = datetime.datetime.now()
    # use total_seconds() so the log message matches its wording
    logger.info("Finished gathering records in {} seconds. Total records to delete: {}"
                .format((gather_end - gather_start).total_seconds(), len(delete_list)))

    # Delete records
    logger.info("Deleting records")
    delete_records(sess, delete_list, delete_dict)
    sess.commit()

    end = datetime.datetime.now()
    logger.info("FPDS IDV delete finished in %s seconds", (end - start).total_seconds())
def insert_from_table(file_type_id, submission_id):
    """ Insert the data from the base staging table into the corresponding Certified table.

        Params:
            file_type_id: Database file type ID for files A, B, or C
            submission_id: Database ID for the submission being loaded
    """
    sess = GlobalDB.db().session
    logger.info('Copying submission {} data from base table into {}'.format(
        submission_id, FTI_TABLENAME_DICT[file_type_id]))

    table_type = FTI_BASETABLE_DICT[file_type_id].__table__.name
    column_list = [col.key for col in FTI_BASETABLE_DICT[file_type_id].__table__.columns]
    column_list.remove('created_at')
    column_list.remove('updated_at')
    column_list.remove(table_type + '_id')
    col_string = ", ".join(column_list)

    executed = sess.execute(
        "INSERT INTO certified_{} (created_at, updated_at, {}) "
        "SELECT NOW() AS created_at, NOW() AS updated_at, {} "
        "FROM {} "
        "WHERE submission_id={}".format(table_type, col_string, col_string, table_type, submission_id))
    sess.commit()

    logger.info('Loaded {} records into the {} table'.format(executed.rowcount, FTI_TABLENAME_DICT[file_type_id]))
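# For file A, the statement rendered above comes out roughly as below; the
# "appropriation" base-table name is an assumption about FTI_BASETABLE_DICT:
#
#   INSERT INTO certified_appropriation (created_at, updated_at, <columns>)
#   SELECT NOW() AS created_at, NOW() AS updated_at, <columns>
#   FROM appropriation
#   WHERE submission_id=<submission_id>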
def list_all_agencies():
    sess = GlobalDB.db().session

    cgacs = sess.query(CGAC).all()
    agency_list = [{'agency_name': cgac.agency_name, 'cgac_code': cgac.cgac_code} for cgac in cgacs]

    frecs = sess.query(FREC).all()
    shared_list = [{'agency_name': frec.agency_name, 'frec_code': frec.frec_code} for frec in frecs]

    return JsonResponse.create(StatusCode.OK, {'agency_list': agency_list, 'shared_agency_list': shared_list})
def start_a_generation(job, start_date, end_date, agency_code):
    """ Validates the start and end dates of the generation and sends the job information to SQS.

        Args:
            job: File generation job to start
            start_date: String to parse as the start date of the generation
            end_date: String to parse as the end date of the generation
            agency_code: Agency code for A file generations
    """
    if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
        raise ResponseException("Start or end date cannot be parsed into a date of format MM/DD/YYYY",
                                StatusCode.CLIENT_ERROR)

    # Update the Job's start and end dates
    sess = GlobalDB.db().session
    job.start_date = start_date
    job.end_date = end_date
    sess.commit()

    mark_job_status(job.job_id, "waiting")

    file_type = job.file_type.letter_name
    log_data = {'message': 'Sending {} file generation job {} to Validator in SQS'.format(file_type, job.job_id),
                'message_type': 'BrokerInfo', 'job_id': job.job_id, 'file_type': file_type}
    logger.info(log_data)

    # Set SQS message attributes
    message_attr = {'agency_code': {'DataType': 'String', 'StringValue': agency_code}}

    # Add job_id to the SQS job queue
    queue = sqs_queue()
    msg_response = queue.send_message(MessageBody=str(job.job_id), MessageAttributes=message_attr)

    log_data['message'] = 'SQS message response: {}'.format(msg_response)
    logger.debug(log_data)
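# Consumer-side sketch (hedged): assumes sqs_queue() returns a boto3 Queue
# resource, as the send_message() call above implies, and that the validator
# polls roughly like this.
def poll_generation_queue():
    queue = sqs_queue()
    for message in queue.receive_messages(WaitTimeSeconds=10, MessageAttributeNames=['agency_code']):
        job_id = int(message.body)  # the producer sends str(job.job_id) as the body
        attrs = message.message_attributes or {}
        agency_code = attrs.get('agency_code', {}).get('StringValue')
        # ... hand job_id/agency_code to the file generator, then drop the message
        message.delete()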
def write_file_error(job_id, filename, error_type, extra_info=None):
    """ Write a file-level error to the file table

        Args:
            job_id: ID of job in job tracker
            filename: name of error report in S3
            error_type: type of error, value will be mapped to ValidationError class
            extra_info: list of extra information to be included in file
    """
    sess = GlobalDB.db().session
    try:
        int(job_id)
    except (TypeError, ValueError):
        raise ValueError("".join(["Bad jobId: ", str(job_id)]))

    # Get File object for this job ID or create it if it doesn't exist
    file_rec = create_file_if_needed(job_id, filename)

    # Mark error type and add header info if present
    file_rec.file_status_id = FILE_STATUS_DICT[ValidationError.get_error_type_string(error_type)]
    if extra_info is not None:
        if "missing_headers" in extra_info:
            file_rec.headers_missing = extra_info["missing_headers"]
        if "duplicated_headers" in extra_info:
            file_rec.headers_duplicated = extra_info["duplicated_headers"]

    sess.add(file_rec)
    sess.commit()
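# Hedged example call: the job id and filename are placeholders, the error-type
# constant name is an assumption, and extra_info keys mirror the ones handled above.
write_file_error(1234, 'submission_1234_appropriations_error_report.csv',
                 ValidationError.headerError,  # assumed attribute name
                 extra_info={'missing_headers': 'allocation_transfer_agency'})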
def main():
    parser = argparse.ArgumentParser(description='Update contract transaction and/or fabs rows based on updates to '
                                                 'the agency list')
    parser.add_argument('-a', '--missing_agency', help='Perform an update on 999 agency codes', action='store_true',
                        required=False, default=False)
    parser.add_argument('-s', '--subtier_code', help='Select specific subtier to update. Must be a 4-digit code',
                        type=str, required=False)
    parser.add_argument('-t', '--tables', help='Which tables (fabs, fpds, or both) to update. Defaults to both.',
                        required=False, default='both', choices=['fabs', 'fpds', 'both'])
    args = parser.parse_args()

    if not args.subtier_code and not args.missing_agency:
        logger.error('Missing either subtier_code or missing_agency argument')
    elif args.subtier_code and len(args.subtier_code) != 4:
        logger.error('Subtier not a correct format, must be 4 digits')
    else:
        sess = GlobalDB.db().session

        if args.tables in ('fpds', 'both'):
            update_table(sess, 'awarding', 'detached_award_procurement', args)
            update_table(sess, 'funding', 'detached_award_procurement', args)
            logger.info("Procurement Update Complete")

        if args.tables in ('fabs', 'both'):
            update_table(sess, 'awarding', 'published_award_financial_assistance', args)
            update_table(sess, 'funding', 'published_award_financial_assistance', args)
            logger.info("Award Financial Assistance Update Complete")
def delete_submission(submission):
    """ Deletes all data associated with the specified submission

        NOTE: THERE IS NO WAY TO UNDO THIS
    """
    if submission.publish_status_id != PUBLISH_STATUS_DICT['unpublished']:
        return JsonResponse.error(ValueError("Submissions that have been certified cannot be deleted"),
                                  StatusCode.CLIENT_ERROR)

    sess = GlobalDB.db().session

    # Check if the submission has any jobs that are currently running, if so, do not allow deletion
    jobs = sess.query(Job).filter(Job.submission_id == submission.submission_id,
                                  Job.job_status_id == JOB_STATUS_DICT['running']).all()
    if jobs:
        return JsonResponse.error(ValueError("Submissions with running jobs cannot be deleted"),
                                  StatusCode.CLIENT_ERROR)

    sess.query(SubmissionSubTierAffiliation).filter(
        SubmissionSubTierAffiliation.submission_id == submission.submission_id).delete(
        synchronize_session=False)
    sess.query(Submission).filter(Submission.submission_id == submission.submission_id).delete(
        synchronize_session=False)

    # bulk deletes bypass the unit of work, so commit before expiring cached state
    sess.commit()
    sess.expire_all()

    return JsonResponse.create(StatusCode.OK, {"message": "Success"})
def load_cgac(file_name):
    """ Load CGAC (high-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
    """
    sess = GlobalDB.db().session
    models = {cgac.cgac_code: cgac for cgac in sess.query(CGAC)}

    # read CGAC values from csv
    data = pd.read_csv(file_name, dtype=str)

    # clean data
    data = clean_data(
        data,
        CGAC,
        {"cgac_agency_code": "cgac_code", "agency_name": "agency_name",
         "agency_abbreviation": "agency_abbreviation"},
        {"cgac_code": {"pad_to_length": 3}}
    )
    # de-dupe
    data.drop_duplicates(subset=['cgac_code'], inplace=True)

    delete_missing_cgacs(models, data)
    update_cgacs(models, data)
    sess.add_all(models.values())
    sess.commit()

    logger.info('%s CGAC records inserted', len(models))
def submit_files():
    file_manager = FileHandler(request, is_local=is_local, server_path=server_path)

    sess = GlobalDB.db().session
    start_date = request.json.get('reporting_period_start_date')
    end_date = request.json.get('reporting_period_end_date')
    is_quarter = request.json.get('is_quarter_format', False)
    if not (start_date is None or end_date is None):
        formatted_start_date, formatted_end_date = FileHandler.check_submission_dates(start_date, end_date,
                                                                                      is_quarter)

        # the request key is 'is_quarter_format', already captured above as is_quarter
        submissions = sess.query(Submission).filter(
            Submission.cgac_code == request.json.get('cgac_code'),
            Submission.frec_code == request.json.get('frec_code'),
            Submission.reporting_start_date == formatted_start_date,
            Submission.reporting_end_date == formatted_end_date,
            Submission.is_quarter_format == is_quarter,
            Submission.publish_status_id != PUBLISH_STATUS_DICT['unpublished'])

        if 'existing_submission_id' in request.json:
            # filter() returns a new query object, so the result must be reassigned
            submissions = submissions.filter(Submission.submission_id != request.json['existing_submission_id'])

        submissions = submissions.order_by(desc(Submission.created_at))

        if submissions.count() > 0:
            data = {
                "message": "A submission with the same period already exists.",
                "submissionId": submissions[0].submission_id
            }
            return JsonResponse.create(StatusCode.CLIENT_ERROR, data)

    return file_manager.submit(create_credentials)
def main():
    sess = GlobalDB.db().session
    parser = argparse.ArgumentParser(description='Update business categories for existing transaction data')
    parser.add_argument('-fpds', help='Update only FPDS business categories', action='store_true')
    parser.add_argument('-fabs', help='Update only FABS business categories', action='store_true')
    args = parser.parse_args()

    overall_start = datetime.now()
    logger.info("Starting business categories updates")

    start = datetime.now()
    logger.info("Recreating business category SQL functions")
    sess.execute(BUSINESS_CATEGORY_FUNCTION_SQL)
    sess.commit()
    logger.info("Finished recreating business category SQL functions in %s seconds" % str(datetime.now() - start))

    # run both updates when neither flag is given ('or', not 'and'), otherwise only the requested one
    if not (args.fpds or args.fabs):
        update_fpds_business_categories(sess)
        update_fabs_business_categories(sess)
    elif args.fpds:
        update_fpds_business_categories(sess)
    elif args.fabs:
        update_fabs_business_categories(sess)

    logger.info("Completed business categories updates in %s seconds" % str(datetime.now() - overall_start))
def main():
    """ Loads DUNS from the DUNS export file (comprised of DUNS pre-2014) """
    parser = argparse.ArgumentParser(description='Adding historical DUNS to Broker.')
    parser.add_argument('-size', '--block_size', help='Number of rows to batch load', type=int, default=10000)
    args = parser.parse_args()

    sess = GlobalDB.db().session
    client = sam_config_is_valid()

    logger.info('Retrieving historical DUNS file')
    start = datetime.now()
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        duns_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['archive_bucket'],
                                                                    'Key': "DUNS_export_deduped.csv"},
                                                     ExpiresIn=10000)
    else:
        duns_file = os.path.join(CONFIG_BROKER["broker_files"], "DUNS_export_deduped.csv")
        # a joined path is always truthy, so check that the file actually exists
        if not os.path.isfile(duns_file):
            raise OSError("No DUNS_export_deduped.csv found.")
    logger.info("Retrieved historical DUNS file in {} s".format((datetime.now() - start).total_seconds()))

    try:
        run_duns_batches(duns_file, sess, client, args.block_size)
    except Exception as e:
        logger.exception(e)
        sess.rollback()

    logger.info("Updating historical DUNS complete")
    sess.close()
def get_submission_data(submission, file_type=''):
    """ Get data for the submission specified

        Args:
            submission: submission to retrieve metadata for
            file_type: the type of job to retrieve metadata for

        Returns:
            JsonResponse containing the error information or the object containing metadata for all relevant
            file types
    """
    sess = GlobalDB.db().session
    file_type = file_type.lower()

    # Make sure the file type provided is valid
    if file_type and file_type not in FILE_TYPE_DICT and file_type != 'cross':
        return JsonResponse.error(ValueError(file_type + ' is not a valid file type'), StatusCode.CLIENT_ERROR)

    # Make sure the file type provided is valid for the submission type
    is_fabs = submission.d2_submission
    if file_type and ((is_fabs and file_type != 'fabs') or (not is_fabs and file_type == 'fabs')):
        return JsonResponse.error(ValueError(file_type + ' is not a valid file type for this submission'),
                                  StatusCode.CLIENT_ERROR)

    job_query = sess.query(Job).filter(Job.submission_id == submission.submission_id)
    if not file_type:
        relevant_job_types = (JOB_TYPE_DICT['csv_record_validation'], JOB_TYPE_DICT['validation'])
        job_query = job_query.filter(Job.job_type_id.in_(relevant_job_types))
    elif file_type == 'cross':
        job_query = job_query.filter(Job.job_type_id == JOB_TYPE_DICT['validation'])
    else:
        job_query = job_query.filter(Job.file_type_id == FILE_TYPE_DICT[file_type])

    job_dict = {'jobs': [job_to_dict(job) for job in job_query]}
    return JsonResponse.create(StatusCode.OK, job_dict)
def get_fabs_meta(submission_id):
    """Return the total rows, valid rows, publish date, and publish file for FABS submissions"""
    sess = GlobalDB.db().session

    # get row counts from the DetachedAwardFinancialAssistance table
    dafa = DetachedAwardFinancialAssistance
    total_rows = sess.query(dafa).filter(dafa.submission_id == submission_id)
    valid_rows = total_rows.filter(dafa.is_valid)

    # retrieve the published data and file
    submission = sess.query(Submission).filter(Submission.submission_id == submission_id).one()
    publish_date, published_file = None, None
    certify_data = get_lastest_certified_date(submission, is_fabs=True)

    try:
        iter(certify_data)
    except TypeError:
        publish_date = certify_data
    else:
        publish_date, file_path = certify_data
        if CONFIG_BROKER["use_aws"] and file_path:
            path, file_name = file_path.rsplit('/', 1)  # split by last instance of /
            published_file = S3Handler().get_signed_url(path=path, file_name=file_name,
                                                        bucket_route=CONFIG_BROKER['certified_bucket'],
                                                        url_mapping=CONFIG_BROKER["certified_bucket_mapping"],
                                                        method="get_object")
        elif file_path:
            published_file = file_path

    return {
        'valid_rows': valid_rows.count(),
        'total_rows': total_rows.count(),
        'publish_date': publish_date.strftime('%-I:%M%p %m/%d/%Y') if publish_date else None,
        'published_file': published_file
    }
def before_request():
    # Set global value for local
    g.is_local = local

    sess = GlobalDB.db().session

    # setup user
    g.user = None
    if session.get('name') is not None:
        g.user = sess.query(User).filter_by(user_id=session['name']).one_or_none()
def update_tas_ids(model_class, submission_id):
    sess = GlobalDB.db().session
    submission = sess.query(Submission).filter_by(submission_id=submission_id).one()

    subquery = matching_cars_subquery(sess, model_class, submission.reporting_start_date,
                                      submission.reporting_end_date)
    sess.query(model_class).filter_by(submission_id=submission_id).\
        update({getattr(model_class, 'tas_id'): subquery}, synchronize_session=False)
    sess.commit()
def test_w_perms_broker_user(create_session_mock, max_dict_mock, database, monkeypatch):
    ah = max_login_func(create_session_mock, max_dict_mock, monkeypatch, MAX_RESPONSE_W_PERMS)

    res = ah.max_login({})
    sess = GlobalDB.db().session
    # This is to prevent an integrity error with other tests that create users.
    # query() takes the mapped class; the predicate belongs in filter()
    sess.query(User).filter(func.lower(User.email) == func.lower("*****@*****.**")).delete(
        synchronize_session=False)
    sess.commit()
    assert res is True
def get_window():
    sess = GlobalDB.db().session

    curr_date = datetime.now().date()

    return sess.query(SubmissionWindow).filter(SubmissionWindow.start_date <= curr_date,
                                               SubmissionWindow.end_date >= curr_date)
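# get_window returns an un-executed query; callers materialize it as needed, e.g.:
#
#   windows = get_window().all()    # every submission window spanning today
#   is_window_open = len(windows) > 0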
def check_year_and_quarter(cgac_code, frec_code, reporting_fiscal_year, reporting_fiscal_period):
    """ Check if cgac (or frec) code, year, and quarter already has a published submission """
    if not cgac_code and not frec_code:
        return JsonResponse.error(ValueError("CGAC or FR Entity Code required"), StatusCode.CLIENT_ERROR)

    sess = GlobalDB.db().session
    return find_existing_submissions_in_period(sess, cgac_code, frec_code, reporting_fiscal_year,
                                               reporting_fiscal_period)
def load_tas(backfill_historic=False):
    """ Load TAS file into broker database.

        Args:
            backfill_historic: if set to true, this will only update certain columns if budget_function_code is null
    """
    # read TAS file to dataframe, to make sure all is well with the file before firing up a db transaction
    tas_files = []

    if CONFIG_BROKER["use_aws"]:
        # Storing version dictionaries in the list to prevent getting all the links at once and possibly work with
        # expired AWS links
        s3connection = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        # list_object_versions returns the versions in reverse chronological order
        if not backfill_historic:
            # get the latest tas_file
            tas_files = [s3connection.generate_presigned_url(ClientMethod='get_object',
                                                             Params={'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                     'Key': 'cars_tas.csv'}, ExpiresIn=600)]
        else:
            tas_files = s3connection.list_object_versions(Bucket=CONFIG_BROKER['sf_133_bucket'],
                                                          Prefix='cars_tas.csv')['Versions']
            # getting the latest file (see the reversed) from each day for performance and accuracy
            tas_files_grouped = {tas_file['LastModified'].date(): tas_file for tas_file in reversed(tas_files)}
            # sorting them back to chronological order
            tas_files = sorted([tas_file for date, tas_file in tas_files_grouped.items()],
                               key=lambda k: k['LastModified'])
    elif backfill_historic:
        logger.error('Unable to attain historical versions of cars_tas without aws access.')
        return
    else:
        tas_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "cars_tas.csv")
        tas_files.append(tas_file)

    sess = GlobalDB.db().session
    for tas_file in reversed(tas_files):
        update_missing = missing_records(sess) if backfill_historic else []
        if backfill_historic and not update_missing:
            # no more missing, we're done here
            break
        if CONFIG_BROKER["use_aws"] and backfill_historic:
            # generate url from the version dict
            logger.info('Working with remote cars_tas.csv (from {})'.format(tas_file['LastModified']))
            tas_file = s3connection.generate_presigned_url(
                ClientMethod='get_object',
                Params={
                    'Bucket': CONFIG_BROKER['sf_133_bucket'],
                    'Key': 'cars_tas.csv',
                    'VersionId': tas_file['VersionId']
                },
                ExpiresIn=600
            )
        elif CONFIG_BROKER["use_aws"]:
            logger.info('Working with latest remote cars_tas.csv')
        else:
            logger.info('Working with local cars_tas.csv')
        update_tas_lookups(sess, tas_file, update_missing=update_missing)
def delete(self):
    sess = GlobalDB.db().session
    sess.delete(self.sqs)
    sess.commit()
def setUpClass(cls):
    """ Set up class-wide resources (test data) """
    super(DetachedUploadTests, cls).setUpClass()
    # TODO: refactor into a pytest fixture

    with create_app().app_context():
        # get the submission test users
        sess = GlobalDB.db().session
        cls.session = sess
        admin_user = sess.query(User).filter(User.email == cls.test_users['admin_user']).one()
        agency_user = sess.query(User).filter(User.email == cls.test_users['agency_user']).one()
        editfabs_user = sess.query(User).filter(User.email == cls.test_users['editfabs_user']).one()
        cls.admin_user_id = admin_user.user_id
        cls.agency_user_id = agency_user.user_id
        cls.agency_user_email = agency_user.email
        cls.editfabs_email = editfabs_user.email

        # setup submission/jobs data for test_check_status
        cls.d2_submission = cls.insert_submission(sess, cls.admin_user_id, cgac_code="SYS", start_date="10/2015",
                                                  end_date="12/2015", is_quarter=True)
        cls.d2_submission_2 = cls.insert_submission(sess, cls.agency_user_id, cgac_code="SYS",
                                                    start_date="10/2015", end_date="12/2015", is_quarter=True)
        cls.published_submission = cls.insert_submission(sess, cls.admin_user_id, cgac_code="SYS",
                                                         start_date="10/2015", end_date="12/2015", is_quarter=True,
                                                         publish_status_id=PUBLISH_STATUS_DICT["published"])
        cls.other_submission = cls.insert_submission(sess, cls.admin_user_id, cgac_code="SYS",
                                                     start_date="07/2015", end_date="09/2015", is_quarter=True,
                                                     d2_submission=False)
        cls.test_agency_user_submission_id = cls.insert_submission(sess, cls.agency_user_id, cgac_code="NOT",
                                                                   start_date="10/2015", end_date="12/2015",
                                                                   is_quarter=True, d2_submission=True)
        cls.insert_agency_user_submission_data(sess, cls.test_agency_user_submission_id)
def add_jobs_for_uploaded_file(upload_file, submission_id, existing_submission):
    """ Add upload and validation jobs for a single filetype

        Arguments:
            upload_file: UploadFile named tuple
            submission_id: submission ID to attach to jobs
            existing_submission: true if we should update existing jobs rather than creating new ones

        Returns:
            the validation job id for this file type (if any)
            the upload job id for this file type
    """
    sess = GlobalDB.db().session

    file_type_id = FILE_TYPE_DICT[upload_file.file_type]
    validation_job_id = None

    # Create a file upload job or, for an existing submission, modify the existing upload job.
    if existing_submission:
        # mark existing upload job as running and set new file name and path
        upload_job = sess.query(Job).filter_by(
            submission_id=submission_id,
            file_type_id=file_type_id,
            job_type_id=JOB_TYPE_DICT['file_upload']).one()
        upload_job.job_status_id = JOB_STATUS_DICT['running']
        upload_job.original_filename = upload_file.file_name
        upload_job.filename = upload_file.upload_name
    else:
        if upload_file.file_type in ["award", "award_procurement"]:
            # file generation handled on backend, mark as ready
            upload_status = JOB_STATUS_DICT['ready']
        elif upload_file.file_type in ["executive_compensation", "sub_award"]:
            # these are dependent on file D2 validation
            upload_status = JOB_STATUS_DICT['waiting']
        else:
            # mark as running since frontend should be doing this upload
            upload_status = JOB_STATUS_DICT['running']

        upload_job = Job(
            original_filename=upload_file.file_name,
            filename=upload_file.upload_name,
            file_type_id=file_type_id,
            job_status_id=upload_status,
            job_type_id=JOB_TYPE_DICT['file_upload'],
            submission_id=submission_id)
        sess.add(upload_job)
        sess.flush()

    if existing_submission:
        # if the file's validation job is attached to an existing submission, reset its status and delete any
        # validation artifacts (e.g., error metadata) that might exist from a previous run.
        val_job = sess.query(Job).filter_by(
            submission_id=submission_id,
            file_type_id=file_type_id,
            job_type_id=JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.job_status_id = JOB_STATUS_DICT['waiting']
        val_job.original_filename = upload_file.file_name
        val_job.filename = upload_file.upload_name

        # reset file size and number of rows to be set during validation of new file
        val_job.file_size = None
        val_job.number_of_rows = None

        # delete error metadata that might exist from a previous run of this validation job
        sess.query(ErrorMetadata).\
            filter(ErrorMetadata.job_id == val_job.job_id).\
            delete(synchronize_session='fetch')

        # delete file error information that might exist from a previous run of this validation job
        sess.query(File).filter(File.job_id == val_job.job_id).delete(synchronize_session='fetch')
    else:
        # create a new record validation job and add dependencies if necessary
        if upload_file.file_type == "executive_compensation":
            # E files depend on the D1 validation job
            d1_val_job = sess.query(Job).\
                filter(Job.submission_id == submission_id,
                       Job.file_type_id == FILE_TYPE_DICT['award_procurement'],
                       Job.job_type_id == JOB_TYPE_DICT['csv_record_validation']).\
                one_or_none()
            if d1_val_job is None:
                logger.error({
                    'message': "Cannot create E job without a D1 job",
                    'message_type': 'CoreError',
                    'submission_id': submission_id,
                    'file_type': 'E'
                })
                raise Exception("Cannot create E job without a D1 job")

            # Add dependency on D1 validation job
            d1_dependency = JobDependency(job_id=upload_job.job_id, prerequisite_id=d1_val_job.job_id)
            sess.add(d1_dependency)
        elif upload_file.file_type == "sub_award":
            # F files depend on the C validation job
            c_val_job = sess.query(Job).\
                filter(Job.submission_id == submission_id,
                       Job.file_type_id == FILE_TYPE_DICT['award_financial'],
                       Job.job_type_id == JOB_TYPE_DICT['csv_record_validation']).\
                one_or_none()
            if c_val_job is None:
                logger.error({
                    'message': "Cannot create F job without a C job",
                    'message_type': 'CoreError',
                    'submission_id': submission_id,
                    'file_type': 'F'
                })
                raise Exception("Cannot create F job without a C job")

            # add dependency on C validation job
            c_dependency = JobDependency(job_id=upload_job.job_id, prerequisite_id=c_val_job.job_id)
            sess.add(c_dependency)
        else:
            # E and F don't get their own validation jobs; every other file type does
            val_job = Job(
                original_filename=upload_file.file_name,
                filename=upload_file.upload_name,
                file_type_id=file_type_id,
                job_status_id=JOB_STATUS_DICT['waiting'],
                job_type_id=JOB_TYPE_DICT['csv_record_validation'],
                submission_id=submission_id)
            sess.add(val_job)
            sess.flush()

            # add dependency between file upload job and file validation job
            upload_dependency = JobDependency(job_id=val_job.job_id, prerequisite_id=upload_job.job_id)
            sess.add(upload_dependency)
            validation_job_id = val_job.job_id

    sess.commit()

    return validation_job_id, upload_job.job_id
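# Resulting job graph (sketch derived from the branches above) for A/B/C/D files:
#
#   file_upload job  --prerequisite-->  csv_record_validation job
#
# For E, the upload job instead depends on D1's validation job, and for F on C's;
# E and F therefore return a validation_job_id of None.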
def setUpClass(cls):
    """Set up resources to be shared within a test class"""
    cls.session_id = ""

    with create_validator_app().app_context():

        # update application's db config options so unittests run against test databases
        suite = cls.__name__.lower()
        config = dataactcore.config.CONFIG_DB
        cls.num = randint(1, 9999)
        config['db_name'] = 'unittest{}_{}_data_broker'.format(cls.num, suite)
        dataactcore.config.CONFIG_DB = config
        create_database(CONFIG_DB['db_name'])
        run_migrations()

        # drop and re-create test user db/tables
        setup_user_db()
        # drop and re-create test job db/tables
        setup_job_tracker_db()
        # drop and re-create test error db/tables
        setup_error_db()
        # drop and re-create test validation db/tables
        setup_validation_db()
        # load e-mail templates
        setup_emails()

        # set up default e-mails for tests
        test_users = {
            'admin_user': '******',
            'agency_user': '******',
            'agency_user_2': '*****@*****.**',
            'no_permissions_user': '******',
            'editfabs_user': '******'
        }
        user_password = '******'
        admin_password = '******'

        # get user info and save as class variables for use by tests
        sess = GlobalDB.db().session
        admin_cgac = CGAC(cgac_code='SYS', agency_name='Admin Agency')
        cls.admin_cgac_code = admin_cgac.cgac_code
        sess.add(admin_cgac)
        sess.commit()

        cgac = CGAC(cgac_code='000', agency_name='Example Agency')
        sess.add(cgac)
        sess.commit()

        frec = FREC(frec_code='0001', cgac_id=cgac.cgac_id, agency_name='Example FREC')
        sess.add(frec)
        sess.commit()

        sub_tier = SubTierAgency(cgac_id=cgac.cgac_id, frec_id=frec.frec_id, sub_tier_agency_code='0000',
                                 sub_tier_agency_name='Example Sub Tier')
        sess.add(sub_tier)

        # set up users for status tests
        def add_user(email, name, username, permission_type=ALL_PERMISSION_TYPES_DICT['writer'],
                     website_admin=False):
            user = UserFactory(email=email, website_admin=website_admin, name=name, username=username,
                               affiliations=[UserAffiliation(cgac=cgac, permission_type_id=permission_type)])
            user.salt, user.password_hash = get_password_hash(user_password, Bcrypt())
            sess.add(user)

        add_user(test_users['agency_user'], "Test User", "testUser")
        add_user(test_users['agency_user_2'], "Test User 2", "testUser2")
        add_user(test_users['editfabs_user'], "Fabs Writer", "fabsWriter",
                 permission_type=ALL_PERMISSION_TYPES_DICT['editfabs'])

        # add new users
        create_user_with_password(test_users["admin_user"], admin_password, Bcrypt(), website_admin=True)
        create_user_with_password(test_users["no_permissions_user"], user_password, Bcrypt())

        agency_user = sess.query(User).filter(User.email == test_users['agency_user']).one()
        cls.agency_user_id = agency_user.user_id

        sess.commit()

    # set up info needed by the individual test classes
    cls.test_users = test_users
    cls.user_password = user_password
    cls.admin_password = admin_password
    cls.local = CONFIG_BROKER['local']
def generate_file(submission, file_type, start, end, agency_type, file_format):
    """ Start a file generation job for the specified file type within a submission

        Args:
            submission: submission for which we're generating the file
            file_type: type of file to generate the job for
            start: the start date for the file to generate
            end: the end date for the file to generate
            agency_type: The type of agency (awarding or funding) to generate the file for (only used for D file
                generation)
            file_format: determines if the file generated is a txt or a csv (only used for D file generation)

        Returns:
            Results of check_generation or JsonResponse object containing an error if the prerequisite job isn't
            complete.
    """
    error_message = None
    # submission is a FABS submission
    if submission.d2_submission:
        error_message = 'Cannot generate files for FABS submissions.'
    elif file_type in ['D1', 'D2']:
        # D file generation requires start and end date
        if not start or not end:
            error_message = 'Must have a start and end date for D file generation.'
        # D files can only be generated by awarding or funding agency
        elif agency_type not in ['awarding', 'funding']:
            error_message = 'agency_type must be either awarding or funding for D file generation.'
        elif file_format not in ['csv', 'txt']:
            error_message = 'file_format must be either csv or txt for D file generation.'
    # Only D1, D2, E, and F files can be generated
    elif file_type not in ['E', 'F']:
        error_message = 'File type must be either D1, D2, E, or F'

    # Return any client errors
    if error_message:
        return JsonResponse.error(ValueError(error_message), StatusCode.CLIENT_ERROR)

    sess = GlobalDB.db().session
    job = sess.query(Job).filter(Job.submission_id == submission.submission_id,
                                 Job.file_type_id == lookups.FILE_TYPE_DICT_LETTER_ID[file_type],
                                 Job.job_type_id == lookups.JOB_TYPE_DICT['file_upload']).one()
    logger.info({
        'message': 'Starting {} file generation within submission {}'.format(file_type, submission.submission_id),
        'message_type': 'BrokerInfo',
        'submission_id': submission.submission_id,
        'job_id': job.job_id,
        'file_type': file_type
    })

    # Check prerequisites on upload job
    if not generation_helper.check_generation_prereqs(submission.submission_id, file_type):
        return JsonResponse.error(ResponseException('Must wait for completion of prerequisite validation job',
                                                    StatusCode.CLIENT_ERROR), StatusCode.CLIENT_ERROR)
    try:
        if file_type in ['D1', 'D2']:
            generation_helper.start_d_generation(job, start, end, agency_type, file_format=file_format)
        else:
            generation_helper.start_e_f_generation(job)
    except Exception as e:
        mark_job_status(job.job_id, 'failed')
        job.error_message = str(e)
        sess.commit()
        return JsonResponse.error(e, StatusCode.INTERNAL_ERROR)

    # Return same response as check generation route
    return check_generation(submission, file_type)
def loadSql(cls, filename):
    """Load SQL-based validation rules to db."""
    with createApp().app_context():
        sess = GlobalDB.db().session

        # Delete all records currently in table
        sess.query(RuleSql).delete()

        filename = os.path.join(cls.sql_rules_path, filename)

        # open csv ('rU' mode is deprecated; universal newlines is the default in py3)
        with open(filename, 'r') as csvfile:
            # read header
            header = csvfile.readline()
            # split header into field names
            rawFieldNames = header.split(',')
            fieldNames = []
            # clean field names
            for field in rawFieldNames:
                fieldNames.append(FieldCleaner.cleanString(field))

            unknownFields = set(fieldNames) - set(cls.headers)
            if len(unknownFields) != 0:
                raise KeyError("".join(["Found unexpected fields: ", str(list(unknownFields))]))

            missingFields = set(cls.headers) - set(fieldNames)
            if len(missingFields) != 0:
                raise ValueError("".join(["Missing required fields: ", str(list(missingFields))]))

            reader = csv.DictReader(csvfile, fieldnames=fieldNames)
            for row in reader:
                sql = cls.readSqlStr(row['query_name'])

                rule_sql = RuleSql(rule_sql=sql, rule_label=row['rule_label'],
                                   rule_description=row['rule_description'],
                                   rule_error_message=row['rule_error_message'], query_name=row['query_name'])

                # look up file type id
                try:
                    fileId = FILE_TYPE_DICT[row["file_type"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["file_type"], row["rule_label"]))
                try:
                    if row["target_file"].strip() == "":
                        # No target file provided
                        targetFileId = None
                    else:
                        targetFileId = FILE_TYPE_DICT[row["target_file"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["target_file"], row["rule_label"]))

                # set cross file flag
                if FieldCleaner.cleanString(row["rule_cross_file_flag"]) in ['true', 't', 'y', 'yes']:
                    cross_file_flag = True
                else:
                    cross_file_flag = False

                rule_sql.rule_severity_id = RULE_SEVERITY_DICT[row['severity_name']]
                rule_sql.file_id = fileId
                rule_sql.target_file_id = targetFileId
                rule_sql.rule_cross_file_flag = cross_file_flag

                sess.merge(rule_sql)
        sess.commit()
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    local_now = datetime.now()
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        open(filename, 'wb').write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    model = CFDAProgram

    metrics_json = {
        'script_name': 'load_cfda_data.py',
        'start_time': str(local_now),
        'new_records': 0
    }

    def fix_program_number(row, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(row['program_number'] * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(import_data, model, DATA_CLEANING_MAP, {})
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        table_name = model.__table__.name
        # Check if there is new data to load
        new_data = check_dataframe_diff(import_data, model, ['cfda_program_id'], ['program_number'],
                                        lambda_funcs=[('program_number', fix_program_number)])
        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['new_records'] = num
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)

    metrics_json['duration'] = str(datetime.now() - local_now)

    with open('load_cfda_data_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
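# Worked example of fix_program_number's rounding and padding (derived from the
# logic above): 10.1 -> floor(10.1 * 1000 + 0.5) / 1000 = 10.1 -> '10.1'.ljust(6, '0')
# -> '10.100'; 12.3456 -> 12.346 -> '12.346'.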
method.add_argument('-m', '--min_id', type=int, nargs=1,
                    help="Load all data from a minimum id (0 for complete backfill)")
method.add_argument('-i', '--ids', type=int, nargs='+',
                    help="Single or list of FSRS ids to populate the subaward table")

with create_app().app_context():
    logger.info("Begin backfilling Subaward table")
    sess = GlobalDB.db().session
    args = parser.parse_args()

    metrics_json = {
        'script_name': 'populate_subaward_table.py',
        'records_inserted': 0,
        'start_time': str(now)
    }

    service_types = []
    if not (args.procurements or args.grants):
        logger.error('FSRS types not provided. Please specify procurements, grants, or both.')
        sys.exit(1)
    if args.procurements:
def setUpClass(cls):
    """Set up resources to be shared within a test class"""
    # TODO: refactor into a pytest class fixtures and inject as necessary

    # update application's db config options so unittests run against test databases
    configure_logging()
    suite = cls.__name__.lower()
    config = dataactcore.config.CONFIG_DB
    cls.num = randint(1, 9999)
    config['db_name'] = 'unittest{}_{}_data_broker'.format(cls.num, suite)
    dataactcore.config.CONFIG_DB = config
    create_database(CONFIG_DB['db_name'])
    run_migrations()

    app = create_app()
    app.config['TESTING'] = True
    app.config['DEBUG'] = False
    cls.app = TestApp(app)
    sess = GlobalDB.db().session

    # set up default e-mails for tests
    test_users = {
        'admin_user': '******',
        'agency_user': '******',
        'agency_user_2': '*****@*****.**',
        'no_permissions_user': '******',
        'editfabs_user': '******'
    }
    admin_password = '******'

    cgac = CGAC(cgac_code='000', agency_name='Example Agency')
    sess.add(cgac)
    sess.commit()

    # Allow us to augment default test failure msg w/ more detail
    cls.longMessage = True
    # Upload files to S3 (False = skip re-uploading on subsequent runs)
    cls.uploadFiles = True
    # Run tests for local broker or not
    cls.local = CONFIG_BROKER['local']
    # This needs to be set to the local directory for error reports if local is True
    cls.local_file_directory = CONFIG_SERVICES['error_report_path']

    # drop and re-create test job db/tables
    setup_job_tracker_db()
    # drop and re-create test error db/tables
    setup_error_db()
    # drop and re-create test validation db
    setup_validation_db()

    # setup Schema
    SchemaLoader.load_all_from_path(validator_config_path)
    load_sql_rules()

    create_user_with_password(test_users["admin_user"], admin_password, Bcrypt(), website_admin=True)
    cls.userId = None
    cls.test_users = test_users
    # constants to use for default submission start and end dates
    cls.SUBMISSION_START_DEFAULT = datetime(2015, 10, 1)
    cls.SUBMISSION_END_DEFAULT = datetime(2015, 10, 31)
def load_sub_tier_agencies(file_name, force_reload=False):
    """ Load Sub Tier Agency (sub_tier-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
            force_reload: whether to reload regardless

        Returns:
            True if new data was loaded, False if the load was skipped
    """
    sess = GlobalDB.db().session
    models = {sub_tier_agency.sub_tier_agency_code: sub_tier_agency for
              sub_tier_agency in sess.query(SubTierAgency)}

    # read Sub Tier Agency values from csv
    data = pd.read_csv(file_name, dtype=str)

    condition = data['TOPTIER_FLAG'] == 'TRUE'
    data.loc[condition, 'PRIORITY'] = 1
    data.loc[~condition, 'PRIORITY'] = 2
    data.replace({'TRUE': True, 'FALSE': False}, inplace=True)

    # clean data
    data = clean_data(
        data,
        SubTierAgency,
        {'cgac_agency_code': 'cgac_code', 'subtier_code': 'sub_tier_agency_code', 'priority': 'priority',
         'frec': 'frec_code', 'subtier_name': 'sub_tier_agency_name', 'is_frec': 'is_frec'},
        {'cgac_code': {'pad_to_length': 3}, 'frec_code': {'pad_to_length': 4},
         'sub_tier_agency_code': {'pad_to_length': 4}}
    )
    # de-dupe
    data.drop_duplicates(subset=['sub_tier_agency_code'], inplace=True)

    # create foreign key dicts
    cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for
                 cgac in sess.query(CGAC).filter(CGAC.cgac_code.in_(data['cgac_code'])).all()}
    cgac_dict_flipped = {cgac_id: cgac_code for cgac_code, cgac_id in cgac_dict.items()}
    frec_dict = {str(frec.frec_code): frec.frec_id for
                 frec in sess.query(FREC).filter(FREC.frec_code.in_(data['frec_code'])).all()}
    frec_dict_flipped = {frec_id: frec_code for frec_code, frec_id in frec_dict.items()}

    # compare to existing content in table
    def int_to_float(row):
        return float(row['priority'])

    def extract_cgac(row):
        return cgac_dict_flipped[row['cgac_id']] if row['cgac_id'] in cgac_dict_flipped else np.nan

    def extract_frec(row):
        return frec_dict_flipped[row['frec_id']] if row['frec_id'] in frec_dict_flipped else np.nan

    diff_found = check_dataframe_diff(data, SubTierAgency, ['sub_tier_agency_id', 'cgac_id', 'frec_id'],
                                      ['sub_tier_agency_code'],
                                      lambda_funcs=[('priority', int_to_float), ('cgac_code', extract_cgac),
                                                    ('frec_code', extract_frec)])

    if force_reload or diff_found:
        delete_missing_sub_tier_agencies(models, data)
        update_sub_tier_agencies(models, data, cgac_dict, frec_dict)

        sess.add_all(models.values())
        sess.commit()

        logger.info('%s Sub Tier Agency records inserted', len(models))
        return True
    else:
        logger.info('No differences found, skipping subtier table reload.')
        return False
def perms_to_affiliations(perms, user_id, service_account_flag=False):
    """ Convert a list of perms from MAX to a list of UserAffiliations. Filter out and log any malformed perms

        Args:
            perms: list of permissions (as strings) for the user
            user_id: the ID of the user
            service_account_flag: flag to indicate a service account

        Yields:
            UserAffiliations based on the permissions provided
    """
    available_cgacs = {cgac.cgac_code: cgac for cgac in GlobalDB.db().session.query(CGAC)}
    available_frecs = {frec.frec_code: frec for frec in GlobalDB.db().session.query(FREC)}
    log_data = {
        'message_type': 'BrokerWarning',
        'user_id': user_id
    }

    for perm in perms:
        log_data['message'] = 'User with ID {} has malformed permission: {}'.format(user_id, perm)
        components = perm.split('-PERM_')
        if len(components) != 2:
            logger.warning(log_data)
            continue

        codes, perm_level = components
        split_codes = codes.split('-FREC_')
        frec_code, cgac_code = None, None
        if len(split_codes) == 2:
            # permissions for FR entity code and readonly CGAC
            frec_code, cgac_code = split_codes[1], split_codes[0]
            if frec_code not in available_frecs or cgac_code not in available_cgacs:
                logger.warning(log_data)
                continue
        else:
            # permissions for CGAC
            cgac_code = codes
            if cgac_code not in available_cgacs:
                logger.warning(log_data)
                continue

        perm_level = perm_level.lower()

        if service_account_flag:
            # Replace MAX Service Account permissions with Broker "write" and "editfabs" permissions
            perm_level = 'we'
        elif perm_level not in 'rwsef':
            logger.warning(log_data)
            continue

        for permission in perm_level:
            if frec_code:
                yield UserAffiliation(
                    cgac=available_cgacs[cgac_code],
                    frec=None,
                    permission_type_id=PERMISSION_SHORT_DICT['r']
                )
                yield UserAffiliation(
                    cgac=None,
                    frec=available_frecs[frec_code],
                    permission_type_id=PERMISSION_SHORT_DICT[permission]
                )
            else:
                yield UserAffiliation(
                    cgac=available_cgacs[cgac_code] if cgac_code else None,
                    frec=None,
                    permission_type_id=PERMISSION_SHORT_DICT[permission]
                )
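# Perm strings decompose as parsed above; the formats are derived from the split()
# calls, and the concrete codes below are illustrative only:
#
#   '000-PERM_W'           -> writer affiliation on CGAC 000
#   '000-FREC_0001-PERM_S' -> read-only CGAC 000 plus submitter ('s') FREC 0001
#
# e.g. list(perms_to_affiliations(['000-PERM_W'], user_id=1)) would yield a single
# UserAffiliation with cgac '000' and PERMISSION_SHORT_DICT['w'].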
def max_login(self, session):
    """ Logs a user in if their password matches using MAX

        Args:
            session: Session object from flask

        Returns:
            A JsonResponse containing the user information or details on which error occurred, such as whether a
            type was wrong, something wasn't implemented, invalid keys were provided, login was denied, or a
            different, unexpected error occurred.
    """
    try:
        safe_dictionary = RequestDictionary(self.request)

        ticket = safe_dictionary.get_value("ticket")
        service = safe_dictionary.get_value('service')

        # Call MAX's serviceValidate endpoint and retrieve the response
        max_dict = get_max_dict(ticket, service)

        if 'cas:authenticationSuccess' not in max_dict['cas:serviceResponse']:
            raise ValueError("The Max CAS endpoint was unable to locate your session "
                             "using the ticket/service combination you provided.")
        cas_attrs = max_dict['cas:serviceResponse']['cas:authenticationSuccess']['cas:attributes']

        # Grab MAX ID to see if a service account is being logged in
        max_id_components = cas_attrs['maxAttribute:MAX-ID'].split('_')
        service_account_flag = (len(max_id_components) > 1 and max_id_components[0].lower() == 's')

        # Grab the email and list of groups from MAX's response
        email = cas_attrs['maxAttribute:Email-Address']

        try:
            sess = GlobalDB.db().session
            user = sess.query(User).filter(func.lower(User.email) == func.lower(email)).one_or_none()

            # If the user does not exist, create them since they are allowed to access the site because they got
            # past the above group membership checks
            if user is None:
                user = User()
                user.email = email

            set_user_name(user, cas_attrs)

            set_max_perms(user, cas_attrs['maxAttribute:GroupList'], service_account_flag)

            sess.add(user)
            sess.commit()
        except MultipleResultsFound:
            raise ValueError("An error occurred during login.")

        return self.create_session_and_response(session, user)

    # Catch any specifically raised errors or any other errors that may have happened and return them cleanly.
    # We add the error parameter here because this endpoint needs to provide better feedback, and to avoid
    # changing the default behavior of the JsonResponse class globally.
    except (TypeError, KeyError, NotImplementedError) as e:
        # Return a 400 with appropriate message
        return JsonResponse.error(e, StatusCode.CLIENT_ERROR, error=str(e))
    except ValueError as e:
        # Return a 401 for login denied
        return JsonResponse.error(e, StatusCode.LOGIN_REQUIRED, error=str(e))
    except Exception as e:
        # Return 500
        return JsonResponse.error(e, StatusCode.INTERNAL_ERROR, error=str(e))
def get_cgacs_without_sub_tier_agencies(sess=None):
    if sess is None:
        sess = GlobalDB.db().session
    cgac_sub_tiers = sess.query(SubTierAgency).distinct(SubTierAgency.cgac_id)
    return sess.query(CGAC).filter(
        CGAC.cgac_id.notin_([st.cgac.cgac_id for st in cgac_sub_tiers])).all()
def load_program_activity_data(base_path):
    """ Load program activity lookup table.

        Args:
            base_path: directory of domain config files
    """
    last_upload = get_date_of_current_pa_upload(base_path)
    if not (last_upload > get_stored_pa_last_upload()):
        return

    program_activity_file = get_program_activity_file(base_path)

    logger.info('Loading program activity: ' + PA_FILE_NAME)

    with create_app().app_context():
        sess = GlobalDB.db().session
        try:
            data = pd.read_csv(program_activity_file, dtype=str)
        except pd.io.common.EmptyDataError:
            log_blank_file()
            sys.exit(4)  # exit code chosen arbitrarily, to indicate distinct failure states

        headers = set([header.upper() for header in list(data)])

        if not VALID_HEADERS.issubset(headers):
            logger.error("Missing required headers. Required headers include: %s" % str(VALID_HEADERS))
            sys.exit(4)

        try:
            dropped_count, data = clean_data(
                data,
                ProgramActivity,
                {"fyq": "fiscal_year_quarter", "agency_code": "agency_id",
                 "allocation_id": "allocation_transfer_id", "account_code": "account_number",
                 "pa_code": "program_activity_code", "pa_title": "program_activity_name"},
                {"program_activity_code": {"pad_to_length": 4}, "agency_id": {"pad_to_length": 3},
                 "allocation_transfer_id": {"pad_to_length": 3, "keep_null": True},
                 "account_number": {"pad_to_length": 4}},
                ["agency_id", "program_activity_code", "account_number", "program_activity_name"],
                True
            )
        except FailureThresholdExceededException as e:
            if e.count == 0:
                log_blank_file()
                sys.exit(4)
            else:
                count_str = "Application tried to drop {} rows".format(e.count)
                logger.error("Loading of program activity file failed due to exceeded failure threshold. " +
                             count_str)
                sys.exit(5)

        sess.query(ProgramActivity).delete()

        # Lowercase Program Activity Name
        data['program_activity_name'] = data['program_activity_name'].apply(lambda x: lowercase_or_notify(x))

        # because we're only loading a subset of program activity info, there will be duplicate records in the
        # dataframe. this is ok, but they need to be de-duped before the db load. We also need to log them.
        base_count = data.shape[0]
        data.drop_duplicates(inplace=True)
        logger.info("Dropped {} duplicate rows.".format(base_count - data.shape[0]))

        # insert to db
        table_name = ProgramActivity.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    set_stored_pa_last_upload(last_upload)
    logger.info('{} records inserted to {}'.format(num, table_name))

    if dropped_count > 0:
        sys.exit(3)
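# Exit codes used above (derived from the branches in this loader): 4 = blank file
# or missing required headers, 5 = failure threshold exceeded while cleaning,
# 3 = load succeeded but some rows were dropped.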
def purge():
    """Remove all mock SQS messages from the database."""
    sess = GlobalDB.db().session
    sess.query(SQS).delete()
    sess.commit()
def teardown_appcontext(exception):
    """Close the database connection when the app context ends."""
    GlobalDB.close()
def receive_messages(WaitTimeSeconds, MessageAttributeNames=None):  # noqa
    """Return all mock SQS messages currently stored in the database."""
    sess = GlobalDB.db().session
    return [SQSMockMessage(sqs) for sqs in sess.query(SQS)]
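# --- Hedged example: a sketch of how these SQS mocks might be exercised in a test
# (assumes an app context with the mock SQS table available):
#
#   purge()                                          # empty the mock queue
#   assert receive_messages(WaitTimeSeconds=10) == []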
def setUpClass(cls):
    """Set up class-wide resources."""
    super(MixedFileTests, cls).setUpClass()
    user = cls.userId
    force_tas_load = False

    with createApp().app_context():
        # get the submission test user
        sess = GlobalDB.db().session

        # Create test submissions and jobs, also uploading the files needed for each job.
        statusReadyId = JOB_STATUS_DICT['ready']
        jobTypeCsvId = JOB_TYPE_DICT['csv_record_validation']
        jobDict = {}

        # next three jobs belong to the same submission and are tests for single-file validations
        # that contain failing rows
        submissionId = cls.insertSubmission(sess, user)
        job_info = Job(filename=cls.uploadFile("appropMixed.csv", user), job_status_id=statusReadyId,
                       job_type_id=jobTypeCsvId, file_type_id=FILE_TYPE_DICT['appropriations'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['mixed'] = job_info.job_id

        job_info = Job(filename=cls.uploadFile("programActivityMixed.csv", user), job_status_id=statusReadyId,
                       job_type_id=jobTypeCsvId, file_type_id=FILE_TYPE_DICT['program_activity'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['programMixed'] = job_info.job_id

        job_info = Job(filename=cls.uploadFile("awardMixed.csv", user), job_status_id=statusReadyId,
                       job_type_id=jobTypeCsvId, file_type_id=FILE_TYPE_DICT['award'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['awardMixed'] = job_info.job_id

        # next job tests single-file validations for award_financial (submission has a non-Q1 end date)
        submissionId = cls.insertSubmission(sess, user, datetime(2015, 3, 15))
        job_info = Job(filename=cls.uploadFile("awardFinancialMixed.csv", user), job_status_id=statusReadyId,
                       job_type_id=jobTypeCsvId, file_type_id=FILE_TYPE_DICT['award_financial'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['awardFinMixed'] = job_info.job_id

        # job below tests a file that has a mixed-delimiter heading
        submissionId = cls.insertSubmission(sess, user)
        job_info = Job(filename=cls.uploadFile("awardMixedDelimiter.csv", user), job_status_id=statusReadyId,
                       job_type_id=jobTypeCsvId, file_type_id=FILE_TYPE_DICT['award'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['awardMixedDelimiter'] = job_info.job_id

        # next five jobs are cross-file and belong to the same submission
        submissionId = cls.insertSubmission(sess, user)
        job_info = Job(filename=cls.uploadFile("cross_file_A.csv", user), job_status_id=statusReadyId,
                       job_type_id=jobTypeCsvId, file_type_id=FILE_TYPE_DICT['appropriations'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['crossApprop'] = job_info.job_id

        job_info = Job(filename=cls.uploadFile("cross_file_B.csv", user), job_status_id=statusReadyId,
                       job_type_id=jobTypeCsvId, file_type_id=FILE_TYPE_DICT['program_activity'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['crossPgmAct'] = job_info.job_id

        job_info = Job(filename=cls.uploadFile("cross_file_C.csv", user), job_status_id=statusReadyId,
                       job_type_id=jobTypeCsvId, file_type_id=FILE_TYPE_DICT['award_financial'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['crossAwardFin'] = job_info.job_id

        job_info = Job(filename=cls.uploadFile("cross_file_D2.csv", user), job_status_id=statusReadyId,
                       job_type_id=jobTypeCsvId, file_type_id=FILE_TYPE_DICT['award'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['crossAward'] = job_info.job_id

        job_info = Job(job_status_id=statusReadyId, job_type_id=JOB_TYPE_DICT['validation'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['crossFile'] = job_info.job_id

        # next four jobs test short column names and belong to the same submission
        submissionId = cls.insertSubmission(sess, user)
        job_info = Job(filename=cls.uploadFile("appropValidShortcols.csv", user), job_status_id=statusReadyId,
                       job_type_id=jobTypeCsvId, file_type_id=FILE_TYPE_DICT['appropriations'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['appropValidShortcols'] = job_info.job_id

        job_info = Job(filename=cls.uploadFile("programActivityMixedShortcols.csv", user),
                       job_status_id=statusReadyId, job_type_id=jobTypeCsvId,
                       file_type_id=FILE_TYPE_DICT['program_activity'], submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['programMixedShortcols'] = job_info.job_id

        job_info = Job(filename=cls.uploadFile("awardFinancialMixedShortcols.csv", user),
                       job_status_id=statusReadyId, job_type_id=jobTypeCsvId,
                       file_type_id=FILE_TYPE_DICT['award_financial'], submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['awardFinMixedShortcols'] = job_info.job_id

        job_info = Job(filename=cls.uploadFile("awardValidShortcols.csv", user), job_status_id=statusReadyId,
                       job_type_id=jobTypeCsvId, file_type_id=FILE_TYPE_DICT['award'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['awardValidShortcols'] = job_info.job_id

        # commit submissions/jobs and output IDs
        sess.commit()
        for job_type, job_id in jobDict.items():
            print('{}: {}'.format(job_type, job_id))

        # Load fields and rules
        FileTypeTests.load_definitions(sess, force_tas_load, cls.RULES_TO_APPLY)

        cls.jobDict = jobDict
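# --- Hedged example: the add/flush/record pattern above repeats for every job; a hypothetical
# helper like this sketch could factor it out (make_job is not part of the test suite).
def make_job(sess, job_dict, key, **job_kwargs):
    """Create a Job, flush so its ID is assigned, and record the ID under the given key."""
    job = Job(**job_kwargs)
    sess.add(job)
    sess.flush()  # flush (without commit) so job_id is populated by the database
    job_dict[key] = job.job_id
    return job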
def test_password_reset_email(self):
    """Test password reset email."""
    self.logout()
    email = self.test_users["password_reset_email"]
    postJson = {"email": email}
    response = self.app.post_json("/v1/reset_password/", postJson, headers={"x-session-id": self.session_id})
    self.check_response(response, StatusCode.OK)

    # Test password reset for unapproved user and locked user
    with createApp().app_context():
        sess = GlobalDB.db().session
        user = sess.query(User).filter(User.email == email).one()
        user.user_status_id = self.userStatusDict['awaiting_approval']
        sess.commit()
        response = self.app.post_json("/v1/reset_password/", postJson,
                                      headers={"x-session-id": self.session_id}, expect_errors=True)
        self.check_response(response, StatusCode.CLIENT_ERROR)

        user.user_status_id = self.userStatusDict['approved']
        user.is_active = False
        sess.commit()
        response = self.app.post_json("/v1/reset_password/", postJson,
                                      headers={"x-session-id": self.session_id}, expect_errors=True)
        self.check_response(response, StatusCode.CLIENT_ERROR)

        # Test route to confirm tokens
        token = sesEmail.createToken(self.test_users["password_reset_email"], "password_reset")
        postJson = {"token": token}
        response = self.app.post_json("/v1/confirm_password_token/", postJson,
                                      headers={"x-session-id": self.session_id})
        self.check_response(response, StatusCode.OK, "success")
        self.assertEqual(response.json["errorCode"], sesEmail.LINK_VALID)

        postJson = {"user_email": email, "password": self.user_password}
        response = self.app.post_json("/v1/set_password/", postJson,
                                      headers={"x-session-id": self.session_id})
        self.check_response(response, StatusCode.OK, "Password successfully changed")
        user = sess.query(User).filter(User.email == email).one()
        self.assertTrue(user.password_hash)

        # Call again, should error
        postJson = {"user_email": email, "password": self.user_password}
        response = self.app.post_json("/v1/set_password/", postJson,
                                      headers={"x-session-id": self.session_id}, expect_errors=True)
        self.check_response(response, StatusCode.LOGIN_REQUIRED)
def run_test(self, jobId, statusId, statusName, fileSize, stagingRows, errorStatus, numErrors, numWarnings=0,
             warningFileSize=None):
    """ Runs a validation test

        Args:
            jobId: ID of job for this validation
            statusId: Expected HTTP status code for this test
            statusName: Expected status in job tracker, False if job should not exist
            fileSize: Expected file size of error report, False if error report should not exist
            stagingRows: Expected number of rows in validation db staging tables, False if no rows are expected
            errorStatus: Expected status in file table of error DB, False if file object should not exist
            numErrors: Expected number of errors
            numWarnings: Expected number of warnings
            warningFileSize: Expected file size of warning report, None to skip the check

        Returns:
            The validation response
    """
    with createApp().app_context():
        sess = GlobalDB.db().session

        response = self.validateJob(jobId)
        self.assertEqual(response.status_code, statusId, str(self.getResponseInfo(response)))

        # get the job from db
        job = sess.query(Job).filter(Job.job_id == jobId).one()
        if statusName is not False:
            self.assertEqual(job.job_status_id, JOB_STATUS_DICT[statusName])

        self.assertEqual(response.headers.get("Content-Type"), "application/json")

        # Check valid row count for this job
        if stagingRows is not False:
            self.assertEqual(job.number_of_rows_valid, stagingRows)

        if errorStatus is not False:
            self.assertEqual(sess.query(File).filter(File.job_id == jobId).one().file_status_id,
                             FILE_STATUS_DICT[errorStatus])
            self.assertEqual(checkNumberOfErrorsByJobId(jobId, 'fatal'), numErrors)
            self.assertEqual(checkNumberOfErrorsByJobId(jobId, 'warning'), numWarnings)

        if fileSize is not False:
            reportPath = getReportPath(job, 'error')
            if self.local:
                self.assertFileSizeAppxy(fileSize, reportPath)
            else:
                self.assertGreater(s3UrlHandler.getFileSize('errors/{}'.format(reportPath)), fileSize - 5)
                self.assertLess(s3UrlHandler.getFileSize('errors/{}'.format(reportPath)), fileSize + 5)

        if warningFileSize is not None and warningFileSize is not False:
            reportPath = getReportPath(job, 'warning')
            if self.local:
                self.assertFileSizeAppxy(warningFileSize, reportPath)
            else:
                self.assertGreater(s3UrlHandler.getFileSize('errors/{}'.format(reportPath)),
                                   warningFileSize - 5)
                self.assertLess(s3UrlHandler.getFileSize('errors/{}'.format(reportPath)),
                                warningFileSize + 5)

    return response
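# --- Hedged example: assertFileSizeAppxy is defined elsewhere in the test suite; a minimal sketch
# of an equivalent plus-or-minus 5 byte tolerance check (mirroring the S3 branch above) might be:
import os

def assert_file_size_approx(expected_size, path, tolerance=5):
    """Assert a local file is within a few bytes of the expected report size."""
    actual = os.path.getsize(path)
    assert expected_size - tolerance < actual < expected_size + tolerance, \
        "{} is {} bytes, expected ~{}".format(path, actual, expected_size)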
def tearDownClass(cls):
    """Tear down class-level resources."""
    GlobalDB.close()
    drop_database(CONFIG_DB['db_name'])
def main():
    now = datetime.now()
    parser = argparse.ArgumentParser(description='Pull data from the Federal Hierarchy API.')
    parser.add_argument('-a', '--all', help='Clear out the database and get historical data',
                        action='store_true')
    parser.add_argument('-f', '--filename', help='Generate a local CSV file from the data.', nargs=1, type=str)
    parser.add_argument('-o', '--export_office', help='Export the current office table. '
                                                      'Please provide the file name/path.', nargs=1, type=str)
    parser.add_argument('-d', '--pull_date', help='Date from which to start the pull', nargs=1, type=str)
    parser.add_argument('-i', '--ignore_db', help='Do not update the DB tables', action='store_true')
    args = parser.parse_args()

    if args.all and args.pull_date:
        logger.error("The -a and -d flags conflict, cannot use both at once.")
        sys.exit(1)

    metrics_json = {
        'script_name': 'load_federal_hierarchy.py',
        'start_time': str(now),
        'level_1_records': 0,
        'level_2_records': 0,
        'level_3_records': 0,
        'level_4_records': 0,
        'level_5_records': 0,
        'level_6_records': 0,
        'level_7_records': 0
    }

    # Handle the pull_date parameter
    updated_date_from = None
    if args.pull_date:
        try:
            updated_date_from = args.pull_date[0]
            datetime.strptime(updated_date_from, "%Y-%m-%d")
        except ValueError:
            logger.error("The date given to the -d flag was not parseable.")
            sys.exit(1)

    # Get or create the start date
    sess = GlobalDB.db().session
    if not args.all and not updated_date_from:
        # func.max always returns a row, so check the aggregate value itself for the empty-table case
        last_pull_date = sess.query(func.max(Office.updated_at)).one_or_none()
        if not last_pull_date or not last_pull_date[0]:
            logger.error("The -a or -d flag must be set when there are no Offices present in the database.")
            sys.exit(1)
        updated_date_from = last_pull_date[0].date()

    # Handle the filename parameter
    filename = args.filename[0] if args.filename else None

    # Handle the export office parameter
    export_office = args.export_office[0] if args.export_office else None

    # Handle a complete data reload
    if args.all and not args.ignore_db:
        logger.info("Emptying out the Office table for a complete reload.")
        sess.execute('''TRUNCATE TABLE office RESTART IDENTITY''')

    try:
        pull_offices(sess, filename, not args.ignore_db, args.all, updated_date_from, export_office,
                     metrics_json)
    except Exception as e:
        logger.error(str(e))
        sys.exit(1)

    metrics_json['duration'] = str(datetime.now() - now)

    with open('load_federal_hierarchy_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
    logger.info("Script complete")
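# --- Hedged example: typical invocations of this script (file paths are hypothetical):
#
#   python load_federal_hierarchy.py -a                   # full historical reload
#   python load_federal_hierarchy.py -d 2021-01-01        # incremental pull from a given date
#   python load_federal_hierarchy.py -f offices.csv -i    # write a local CSV without touching the DB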
def test_cross_file(self):
    crossId = self.jobDict["crossFile"]
    # Run jobs for A, B, C, and D2, then cross file validation job
    # Note: test files used for cross validation use the short column names
    # as a way to ensure those are handled correctly by the validator
    awardFinResponse = self.validateJob(self.jobDict["crossAwardFin"])
    self.assertEqual(awardFinResponse.status_code, 200, msg=str(awardFinResponse.json))
    awardResponse = self.validateJob(self.jobDict["crossAward"])
    self.assertEqual(awardResponse.status_code, 200, msg=str(awardResponse.json))
    appropResponse = self.validateJob(self.jobDict["crossApprop"])
    self.assertEqual(appropResponse.status_code, 200, msg=str(appropResponse.json))
    pgmActResponse = self.validateJob(self.jobDict["crossPgmAct"])
    self.assertEqual(pgmActResponse.status_code, 200, msg=str(pgmActResponse.json))

    crossFileResponse = self.validateJob(crossId)
    self.assertEqual(crossFileResponse.status_code, 200, msg=str(crossFileResponse.json))

    with createApp().app_context():
        sess = GlobalDB.db().session
        job = sess.query(Job).filter(Job.job_id == crossId).one()

        # Check number of cross file validation errors in DB for this job
        self.assertEqual(checkNumberOfErrorsByJobId(crossId, "fatal"), 0)
        self.assertEqual(checkNumberOfErrorsByJobId(crossId, "warning"), 3)
        self.assertEqual(job.job_status_id, JOB_STATUS_DICT['finished'])

        # Check that cross file validation report exists and is the right size
        submissionId = job.submission_id
        sizePathPairs = [
            (89, get_cross_report_name(submissionId, "appropriations", "program_activity")),
            (89, get_cross_report_name(submissionId, "award_financial", "award")),
            (2363, get_cross_warning_report_name(submissionId, "appropriations", "program_activity")),
            (89, get_cross_warning_report_name(submissionId, "award_financial", "award")),
        ]

        for size, path in sizePathPairs:
            if self.local:
                self.assertFileSizeAppxy(size, path)
            else:
                self.assertGreater(s3UrlHandler.getFileSize("errors/" + path), size - 5)
                self.assertLess(s3UrlHandler.getFileSize("errors/" + path), size + 5)
def clear_sessions():
    """ Removes old sessions that are expired """
    new_time = to_unix_time(datetime.utcnow())
    sess = GlobalDB.db().session
    sess.query(SessionMap).filter(SessionMap.expiration < new_time).delete()
    sess.commit()
def setup_user_db():
    """Create user tables from model metadata."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        insert_codes(sess)
        sess.commit()
def setup_validation_db():
    """Create validation tables from model metadata and do initial inserts."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        insert_codes(sess)
        sess.commit()
def validate_file_by_sql(job, file_type, short_to_long_dict):
    """ Check all SQL rules

        Args:
            job: the Job which is running
            file_type: file type being checked
            short_to_long_dict: mapping of short to long schema column names

        Returns:
            List of ValidationFailures
    """
    sql_val_start = datetime.now()
    log_string = 'on submission_id: {}, job_id: {}, file_type: {}'.format(
        str(job.submission_id), str(job.job_id), job.file_type.name)
    logger.info({
        'message': 'Beginning SQL validations {}'.format(log_string),
        'message_type': 'ValidatorInfo',
        'submission_id': job.submission_id,
        'job_id': job.job_id,
        'file_type': job.file_type.name,
        'action': 'run_sql_validations',
        'status': 'start',
        'start_time': sql_val_start
    })
    sess = GlobalDB.db().session

    # Pull all SQL rules for this file type
    file_id = FILE_TYPE_DICT[file_type]
    rules = sess.query(RuleSql).filter_by(file_id=file_id, rule_cross_file_flag=False)
    errors = []

    # For each rule, execute sql for rule
    for rule in rules:
        rule_start = datetime.now()
        logger.info({
            'message': 'Beginning SQL validation rule {} {}'.format(rule.query_name, log_string),
            'message_type': 'ValidatorInfo',
            'submission_id': job.submission_id,
            'job_id': job.job_id,
            'rule': rule.query_name,
            'file_type': job.file_type.name,
            'action': 'run_sql_validation_rule',
            'status': 'start',
            'start_time': rule_start
        })

        failures = sess.execute(rule.rule_sql.format(job.submission_id))
        if failures.rowcount:
            # Create column list (exclude row_number)
            cols = failures.keys()
            cols.remove("row_number")
            col_headers = [short_to_long_dict.get(field, field) for field in cols]

            # materialize as we'll iterate over the failures twice
            failures = list(failures)
            flex_data = relevant_flex_data(failures, job.job_id)

            errors.extend(failure_row_to_tuple(rule, flex_data, cols, col_headers, file_id, failure)
                          for failure in failures)

        rule_duration = (datetime.now() - rule_start).total_seconds()
        logger.info({
            'message': 'Completed SQL validation rule {} {}'.format(rule.query_name, log_string),
            'message_type': 'ValidatorInfo',
            'submission_id': job.submission_id,
            'job_id': job.job_id,
            'rule': rule.query_name,
            'file_type': job.file_type.name,
            'action': 'run_sql_validation_rule',
            'status': 'finish',
            'start_time': rule_start,
            'end_time': datetime.now(),
            'duration': rule_duration
        })

    sql_val_duration = (datetime.now() - sql_val_start).total_seconds()
    logger.info({
        'message': 'Completed SQL validations {}'.format(log_string),
        'message_type': 'ValidatorInfo',
        'submission_id': job.submission_id,
        'job_id': job.job_id,
        'file_type': job.file_type.name,
        'action': 'run_sql_validations',
        'status': 'finish',
        'start_time': sql_val_start,
        'end_time': datetime.now(),
        'duration': sql_val_duration
    })
    return errors
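# --- Hedged example: a small, self-contained sketch of the header remapping above. Short column
# names are replaced with long ones when a mapping exists; all names here are made up.
short_to_long = {'tas': 'treasury_account_symbol'}
cols = ['tas', 'row_number', 'availability_type_code']
cols.remove('row_number')  # row_number is metadata, not a validated field
col_headers = [short_to_long.get(field, field) for field in cols]
assert col_headers == ['treasury_account_symbol', 'availability_type_code']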
def cross_validate_sql(rules, submission_id, short_to_long_dict, first_file, second_file, job, error_csv,
                       warning_csv, error_list, job_id):
    """ Evaluate all sql-based rules for cross file validation

        Args:
            rules: List of Rule objects
            submission_id: ID of submission to run cross-file validation on
            short_to_long_dict: mapping of short to long schema column names
            first_file: first file type of the cross-file pair
            second_file: second file type of the cross-file pair
            job: the cross-file validation Job
            error_csv: csv writer for fatal error rows
            warning_csv: csv writer for warning rows
            error_list: interface used to record row errors in the database
            job_id: ID of the cross-file validation job
    """
    conn = GlobalDB.db().connection

    # Put each rule through evaluate, appending all failures into list
    for rule in rules:
        rule_start = datetime.now()
        logger.info({
            'message': 'Beginning cross-file rule {} on submission_id: {}'.format(rule.query_name,
                                                                                  str(submission_id)),
            'message_type': 'ValidatorInfo',
            'rule': rule.query_name,
            'job_id': job.job_id,
            'submission_id': submission_id,
            'action': 'run_cross_validation_rule',
            'status': 'start',
            'start': rule_start
        })
        failed_rows = conn.execute(rule.rule_sql.format(submission_id))
        logger.info({
            'message': 'Finished running cross-file rule {} on submission_id: {}. '.format(
                rule.query_name, str(submission_id)) + 'Starting flex field gathering and file writing',
            'message_type': 'ValidatorInfo',
            'rule': rule.query_name,
            'job_id': job.job_id,
            'submission_id': submission_id
        })
        if failed_rows.rowcount:
            # get list of fields involved in this validation
            # note: row_number is metadata, not a field being validated, so exclude it
            cols = failed_rows.keys()
            cols.remove('row_number')
            column_string = ", ".join(short_to_long_dict[c] if c in short_to_long_dict else c for c in cols)

            # materialize as we'll iterate over the failed_rows twice
            failed_rows = list(failed_rows)
            num_failed_rows = len(failed_rows)

            slice_start = 0
            slice_size = 10000
            while slice_start <= num_failed_rows:
                # finding out row numbers for logger
                last_error_curr_slice = slice_start + slice_size
                failed_row_subset = failed_rows[slice_start:last_error_curr_slice]
                if last_error_curr_slice > num_failed_rows:
                    last_error_curr_slice = num_failed_rows

                logger.info({
                    'message': 'Starting flex field gathering for cross-file rule {} on '.format(
                        rule.query_name) + 'submission_id: {} for failure rows: {}-{}'.format(
                        str(submission_id), str(slice_start), str(last_error_curr_slice)),
                    'message_type': 'ValidatorInfo',
                    'rule': rule.query_name,
                    'job_id': job.job_id,
                    'submission_id': submission_id
                })

                flex_data = relevant_cross_flex_data(failed_row_subset, submission_id,
                                                     [first_file, second_file])

                logger.info({
                    'message': 'Finished flex field gathering for cross-file rule {} on '.format(
                        rule.query_name) + 'submission_id: {} for failure rows: {}-{}'.format(
                        str(submission_id), str(slice_start), str(last_error_curr_slice)),
                    'message_type': 'ValidatorInfo',
                    'rule': rule.query_name,
                    'job_id': job.job_id,
                    'submission_id': submission_id
                })

                for row in failed_row_subset:
                    # get list of values for each column
                    values = ["{}: {}".format(short_to_long_dict[c], str(row[c])) if c in short_to_long_dict
                              else "{}: {}".format(c, str(row[c])) for c in cols]
                    values = ", ".join(values)
                    full_column_string = column_string

                    # go through all flex fields in this row and add to the columns and values
                    for field in flex_data[row['row_number']]:
                        flex_header = field.header + "_file" + FILE_TYPE_DICT_LETTER[field.file_type_id].lower()
                        full_column_string += ", " + flex_header
                        values += ", {}: {}".format(flex_header, field.cell)

                    target_file_type = FILE_TYPE_DICT_ID[rule.target_file_id]
                    failure = [rule.file.name, target_file_type, full_column_string,
                               str(rule.rule_error_message), values, row['row_number'], str(rule.rule_label),
                               rule.file_id, rule.target_file_id, rule.rule_severity_id]
                    if failure[9] == RULE_SEVERITY_DICT['fatal']:
                        error_csv.writerow(failure[0:7])
                    if failure[9] == RULE_SEVERITY_DICT['warning']:
                        warning_csv.writerow(failure[0:7])
                    error_list.record_row_error(job_id, "cross_file", failure[0], failure[3], failure[5],
                                                failure[6], failure[7], failure[8], severity_id=failure[9])
                slice_start = slice_start + slice_size

        rule_duration = (datetime.now() - rule_start).total_seconds()
        logger.info({
            'message': 'Completed cross-file rule {} on submission_id: {}'.format(rule.query_name,
                                                                                  str(submission_id)),
            'message_type': 'ValidatorInfo',
            'rule': rule.query_name,
            'job_id': job.job_id,
            'submission_id': submission_id,
            'action': 'run_cross_validation_rule',
            'status': 'finish',
            'start': rule_start,
            'duration': rule_duration
        })
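# --- Hedged example: a self-contained sketch of the 10,000-row slicing pattern used above,
# factored as a generator (chunked is hypothetical, not part of the validator).
def chunked(rows, size=10000):
    """Yield successive fixed-size slices of a materialized result list."""
    for start in range(0, len(rows), size):
        yield rows[start:start + size]

assert [len(c) for c in chunked(list(range(25000)))] == [10000, 10000, 5000]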
def before_request():
    """Ensure a database connection is available for the duration of the request."""
    GlobalDB.db()
def generate_detached_file(file_type, cgac_code, frec_code, start_date, end_date, year, period, agency_type,
                           file_format):
    """ Start a file generation job for the specified file type not connected to a submission

        Args:
            file_type: type of file to be generated
            cgac_code: the code of a CGAC agency if generating for a CGAC agency
            frec_code: the code of a FREC agency if generating for a FREC agency
            start_date: start date in a string, formatted MM/DD/YYYY
            end_date: end date in a string, formatted MM/DD/YYYY
            year: year to generate for, integer 4 digits
            period: period to generate for, integer (2-12)
            agency_type: The type of agency (awarding or funding) to generate the file for
            file_format: determines if the file generated is a txt or a csv (only used for D file generation)

        Returns:
            JSONResponse object with keys job_id, status, file_type, url, message, start_date, and end_date.

        Raises:
            ResponseException: if the start_date and end_date Strings cannot be parsed into dates
    """
    # Make sure it's a valid request
    if not cgac_code and not frec_code:
        return JsonResponse.error(ValueError("Detached file generation requires CGAC or FR Entity Code"),
                                  StatusCode.CLIENT_ERROR)

    if file_type in ['D1', 'D2']:
        # Make sure we have a start and end date for D1/D2 generation
        if not start_date or not end_date:
            return JsonResponse.error(ValueError('Must have a start and end date for D file generation.'),
                                      StatusCode.CLIENT_ERROR)

        # Check if date format is MM/DD/YYYY
        if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
            raise ResponseException('Start or end date cannot be parsed into a date',
                                    StatusCode.CLIENT_ERROR)

        if agency_type not in ['awarding', 'funding']:
            return JsonResponse.error(ValueError('agency_type must be either awarding or funding.'),
                                      StatusCode.CLIENT_ERROR)

        if file_format not in ['csv', 'txt']:
            return JsonResponse.error(ValueError('file_format must be either csv or txt.'),
                                      StatusCode.CLIENT_ERROR)
    else:
        # Make sure both year and period are provided
        if not (year and period):
            return JsonResponse.error(ValueError("Must have a year and period for A file generation."),
                                      StatusCode.CLIENT_ERROR)

        try:
            # Convert to real start and end dates
            start_date, end_date = generic_helper.year_period_to_dates(year, period)
        except ResponseException as e:
            return JsonResponse.error(e, StatusCode.CLIENT_ERROR)

    # Add job info
    file_type_name = lookups.FILE_TYPE_DICT_LETTER_NAME[file_type]
    new_job = generation_helper.create_generation_job(file_type_name, start_date, end_date)

    agency_code = frec_code if frec_code else cgac_code
    logger.info({
        'message': 'Starting detached {} file generation'.format(file_type),
        'message_type': 'BrokerInfo',
        'job_id': new_job.job_id,
        'file_type': file_type,
        'agency_code': agency_code,
        'start_date': start_date,
        'end_date': end_date
    })

    try:
        if file_type in ['D1', 'D2']:
            generation_helper.start_d_generation(new_job, start_date, end_date, agency_type,
                                                 agency_code=agency_code, file_format=file_format)
        else:
            generation_helper.start_a_generation(new_job, start_date, end_date, agency_code)
    except Exception as e:
        mark_job_status(new_job.job_id, 'failed')
        new_job.error_message = str(e)
        GlobalDB.db().session.commit()
        return JsonResponse.error(e, StatusCode.INTERNAL_ERROR)

    # Return same response as check generation route
    return check_detached_generation(new_job.job_id)
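# --- Hedged example: illustrative calls (inside a request context with an authenticated user;
# the agency codes and dates are hypothetical):
#
#   generate_detached_file('D1', '020', None, '01/01/2017', '01/31/2017', None, None, 'awarding', 'csv')
#   generate_detached_file('A', '020', None, None, None, 2017, 6, None, None)
#
# A missing CGAC/FREC code, an unparseable date, or an invalid agency_type/file_format short-circuits
# with a CLIENT_ERROR response before any generation job is created.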