def stream_file_to_s3(upload_name, reader, is_certified=False):
    """ Stream file to S3

        Args:
            upload_name - file name to be used as S3 key
            reader - reader object to read data from
            is_certified - True if writing to the certified bucket, False otherwise (default False)
    """
    path, file_name = upload_name.rsplit('/', 1)
    logger.debug({
        'message': 'Streaming file to S3',
        'message_type': 'ValidatorDebug',
        'file_name': file_name if file_name else path
    })

    if is_certified:
        handler = S3Handler.create_file_path(upload_name, CONFIG_BROKER["certified_bucket"])
    else:
        handler = S3Handler.create_file_path(upload_name)

    with smart_open.smart_open(handler, 'w') as writer:
        while True:
            chunk = reader.read(CHUNK_SIZE)
            if chunk:
                writer.write(chunk)
            else:
                break
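
# Illustrative usage sketch (not part of the original module): pushing a locally generated
# report to S3 with the helper above. The local path and S3 key below are hypothetical.
def _example_stream_report(local_path='/tmp/1234/file_A_report.csv', s3_key='1234/file_A_report.csv'):
    # Open in binary mode so chunks pass through to the S3 writer unchanged
    with open(local_path, 'rb') as reader:
        stream_file_to_s3(s3_key, reader)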
def copy_file_from_parent_to_child(child_job, parent_job, is_local):
    """ Copy the file from the parent job's bucket to the child job's bucket.

        Args:
            child_job: Job object for the child FileRequest
            parent_job: Job object for the parent FileRequest
            is_local: A boolean flag indicating whether the application is being run locally or not
    """
    file_type = parent_job.file_type.letter_name
    log_data = {'message': 'Copying data from parent job with job_id:{}'.format(parent_job.job_id),
                'message_type': 'ValidatorInfo', 'job_id': child_job.job_id, 'file_type': parent_job.file_type.name}

    if not is_local:
        is_local = g.is_local
    if not is_local and parent_job.filename != child_job.filename:
        # Check to see if the same file exists in the child bucket
        s3 = boto3.client('s3', region_name=CONFIG_BROKER["aws_region"])
        response = s3.list_objects_v2(Bucket=CONFIG_BROKER['aws_bucket'], Prefix=child_job.filename)
        for obj in response.get('Contents', []):
            if obj['Key'] == child_job.filename:
                # The file already exists in this location
                log_data['message'] = 'Cached {} file CSV already exists in this location'.format(file_type)
                logger.info(log_data)
                return

        # Copy the parent file into the child's S3 location
        log_data['message'] = 'Copying the cached {} file from job {}'.format(file_type, parent_job.job_id)
        logger.info(log_data)

        S3Handler.copy_file(CONFIG_BROKER['aws_bucket'], CONFIG_BROKER['aws_bucket'], parent_job.filename,
                            child_job.filename)
def check_file_generation(job_id):
    """ Check the status of a file generation

        Args:
            job_id: upload Job ID
        Return:
            Dict with keys: job_id, status, file_type, message, url, start, end
    """
    sess = GlobalDB.db().session

    # We want to use one_or_none() here so we can see if the job is None and mark the status as invalid to
    # indicate that a status request was invoked for a job that isn't created yet
    upload_job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
    response_dict = {'job_id': job_id, 'status': '', 'file_type': '', 'message': '', 'url': '#', 'size': None}

    if upload_job is None:
        response_dict['start'] = ''
        response_dict['end'] = ''
        response_dict['status'] = 'invalid'
        response_dict['message'] = 'No generation job found with the specified ID'
        return response_dict

    response_dict['file_type'] = lookups.FILE_TYPE_DICT_LETTER[upload_job.file_type_id]
    response_dict['size'] = upload_job.file_size
    response_dict['status'] = map_generate_status(sess, upload_job)
    response_dict['message'] = upload_job.error_message or ''

    # Generate the URL (or path) to the file
    if CONFIG_BROKER['use_aws'] and response_dict['status'] == 'finished' and upload_job.filename:
        path, file_name = upload_job.filename.split('/')
        response_dict['url'] = S3Handler().get_signed_url(path=path, file_name=file_name, bucket_route=None,
                                                          url_mapping=CONFIG_BROKER["submission_bucket_mapping"],
                                                          method='get_object')
    elif response_dict['status'] == 'finished' and upload_job.filename:
        response_dict['url'] = upload_job.filename

    # Only D file generations have start and end dates
    if response_dict['file_type'] in ['D1', 'D2']:
        response_dict['start'] = upload_job.start_date.strftime("%m/%d/%Y") if upload_job.start_date is not None \
            else ""
        response_dict['end'] = upload_job.end_date.strftime("%m/%d/%Y") if upload_job.end_date is not None else ""

    return response_dict
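
# For reference, a hypothetical response from check_file_generation for a finished D1 job
# (all values below are illustrative only, not taken from real data):
#
#     {
#         'job_id': 1234, 'status': 'finished', 'file_type': 'D1', 'message': '',
#         'url': 'https://.../d1_report.csv?signature=...', 'size': 1048576,
#         'start': '01/01/2017', 'end': '03/31/2017'
#     }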
def generate_file(self, agency_code=None):
    """ Generates a file based on the FileGeneration object and updates any Jobs referencing it """
    raw_filename = (GEN_FILENAMES[self.file_type] if not self.file_generation else
                    GEN_FILENAMES[self.file_type].format(self.file_generation.agency_type))
    file_name = S3Handler.get_timestamped_filename(raw_filename)
    if self.is_local:
        file_path = "".join([CONFIG_BROKER['broker_files'], file_name])
    else:
        file_path = "".join(["None/", file_name])

    # Generate the file and upload to S3
    log_data = {'message': 'Finished file {} generation'.format(self.file_type), 'message_type': 'ValidatorInfo',
                'file_type': self.file_type, 'file_path': file_path}
    if self.file_generation:
        self.generate_d_file(file_path)

        log_data.update({
            'agency_code': self.file_generation.agency_code, 'agency_type': self.file_generation.agency_type,
            'start_date': self.file_generation.start_date, 'end_date': self.file_generation.end_date,
            'file_generation_id': self.file_generation.file_generation_id
        })
    elif self.job.file_type.letter_name in ['A', 'E', 'F']:
        log_data['job_id'] = self.job.job_id
        mark_job_status(self.job.job_id, 'running')

        if self.job.file_type.letter_name == 'A':
            if not agency_code:
                raise ResponseException('Agency code not provided for an A file generation')

            self.generate_a_file(agency_code, file_path)
        else:
            # Call self.generate_%s_file() where %s is e or f based on the Job's file_type
            file_type_lower = self.job.file_type.letter_name.lower()
            getattr(self, 'generate_%s_file' % file_type_lower)()

        mark_job_status(self.job.job_id, 'finished')
    else:
        e = 'No FileGeneration object for D file generation.' if self.file_type in ['D1', 'D2'] else \
            'Cannot generate file for {} file type.'.format(self.file_type if self.file_type else 'empty')
        raise ResponseException(e)

    logger.info(log_data)
def generate_from_job(self, job_id, agency_code):
    """ Generates a file for a specified job

        Args:
            job_id: ID of the upload Job
            agency_code: FREC or CGAC code to generate data from
    """
    mark_job_status(job_id, 'running')

    with job_context(job_id, self.is_local) as context:
        sess, job = context

        # Ensure this is a file generation job
        if job.job_type.name != 'file_upload':
            raise ResponseException(
                'Job ID {} is not a file generation job (job type is {})'.format(job.job_id, job.job_type.name),
                StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

        # Ensure there is an available agency_code
        if not agency_code:
            if job.submission_id:
                agency_code = job.submission.frec_code if job.submission.frec_code else job.submission.cgac_code
            else:
                raise ResponseException('An agency_code must be provided to generate a file',
                                        StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

        # Generate timestamped file names
        old_filename = job.original_filename
        job.original_filename = S3Handler.get_timestamped_filename(
            CONFIG_BROKER["".join([str(job.file_type.name), "_file_name"])])
        if self.is_local:
            job.filename = "".join([CONFIG_BROKER['broker_files'], job.original_filename])
        else:
            job.filename = "".join([str(job.submission_id), "/", job.original_filename])

        # Generate the file and upload to S3
        if job.file_type.letter_name in ['D1', 'D2']:
            # Update the validation Job if necessary
            if job.submission_id:
                self.update_validation_job_info(job)

            generate_d_file(sess, job, agency_code, self.is_local, old_filename)
        elif job.file_type.letter_name == 'E':
            generate_e_file(sess, job, self.is_local)
        else:
            generate_f_file(sess, job, self.is_local)
def generate_from_job(self):
    """ Generates a file for a specified job """
    # Mark Job as running
    mark_job_status(self.job.job_id, 'running')

    # Ensure this is a file generation job
    job_type = self.job.job_type.name
    if job_type != 'file_upload':
        raise ResponseException(
            'Job ID {} is not a file generation job (job type is {})'.format(self.job.job_id, job_type),
            StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

    # Ensure there is an available agency_code
    if not self.agency_code:
        raise ResponseException('An agency_code must be provided to generate a file',
                                StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

    # Retrieve any FileRequest that may have started since the Broker sent the request to SQS
    skip_generation = None
    if self.job.file_type.letter_name in ['D1', 'D2']:
        skip_generation = retrieve_cached_file_request(self.job, self.agency_type, self.agency_code, self.is_local)

    if not skip_generation:
        # Generate timestamped file names
        raw_filename = CONFIG_BROKER["".join([str(self.job.file_type.name), "_file_name"])]
        self.job.original_filename = S3Handler.get_timestamped_filename(raw_filename)
        if self.is_local:
            self.job.filename = "".join([CONFIG_BROKER['broker_files'], self.job.original_filename])
        else:
            self.job.filename = "".join([str(self.job.submission_id), "/", self.job.original_filename])
        self.sess.commit()

        # Generate the file, and upload to S3
        if self.job.file_type.letter_name in ['D1', 'D2']:
            # Update the validation Job if necessary
            update_validation_job_info(self.sess, self.job)

            self.generate_d_file()
        elif self.job.file_type.letter_name == 'A':
            self.generate_a_file()
        elif self.job.file_type.letter_name == 'E':
            self.generate_e_file()
        else:
            self.generate_f_file()

        mark_job_status(self.job.job_id, 'finished')

    logger.info({
        'message': 'Finished file {} generation'.format(self.job.file_type.letter_name),
        'message_type': 'ValidatorInfo', 'job_id': self.job.job_id, 'agency_code': self.agency_code,
        'file_type': self.job.file_type.letter_name, 'start_date': self.job.start_date,
        'end_date': self.job.end_date, 'filename': self.job.original_filename
    })
def get_fabs_meta(submission_id):
    """ Return the total rows, valid rows, publish date, and publish file for FABS submissions """
    sess = GlobalDB.db().session

    # get row counts from the DetachedAwardFinancialAssistance table
    dafa = DetachedAwardFinancialAssistance
    total_rows = sess.query(dafa).filter(dafa.submission_id == submission_id)
    valid_rows = total_rows.filter(dafa.is_valid)

    # retrieve the published data and file
    submission = sess.query(Submission).filter(Submission.submission_id == submission_id).one()
    publish_date, published_file = None, None
    certify_data = get_lastest_certified_date(submission, is_fabs=True)

    try:
        iter(certify_data)
    except TypeError:
        publish_date = certify_data
    else:
        publish_date, file_path = certify_data
        if CONFIG_BROKER["use_aws"] and file_path:
            path, file_name = file_path.rsplit('/', 1)  # split by last instance of /
            published_file = S3Handler().get_signed_url(
                path=path, file_name=file_name, bucket_route=CONFIG_BROKER['certified_bucket'],
                url_mapping=CONFIG_BROKER["certified_bucket_mapping"], method="get_object")
        elif file_path:
            published_file = file_path

    return {
        'valid_rows': valid_rows.count(),
        'total_rows': total_rows.count(),
        'publish_date': publish_date.strftime('%-I:%M%p %m/%d/%Y') if publish_date else None,
        'published_file': published_file
    }
def get_fabs_meta(submission_id):
    """ Return the total rows, valid rows, publish date, and publish file for FABS submissions """
    sess = GlobalDB.db().session

    # get row counts from the FABS table
    total_rows = sess.query(FABS).filter(FABS.submission_id == submission_id)
    valid_rows = total_rows.filter(FABS.is_valid)

    # retrieve the published data and file
    submission = sess.query(Submission).filter(Submission.submission_id == submission_id).one()
    publish_date, published_file = None, None
    publish_data = get_latest_published_date(submission, is_fabs=True)

    try:
        iter(publish_data)
    except TypeError:
        publish_date = publish_data
    else:
        publish_date, file_path = publish_data
        if CONFIG_BROKER['use_aws'] and file_path:
            path, file_name = file_path.rsplit('/', 1)  # split by last instance of /
            published_file = S3Handler().get_signed_url(
                path=path, file_name=file_name, bucket_route=CONFIG_BROKER['certified_bucket'],
                url_mapping=CONFIG_BROKER['certified_bucket_mapping'])
        elif file_path:
            published_file = file_path

    return {
        'valid_rows': valid_rows.count(),
        'total_rows': total_rows.count(),
        'publish_date': publish_date.strftime('%Y-%m-%dT%H:%M:%S') if publish_date else None,
        'published_file': published_file
    }
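
# For reference, a hypothetical return value from get_fabs_meta (illustrative values only):
#
#     {
#         'valid_rows': 95,
#         'total_rows': 100,
#         'publish_date': '2021-07-01T14:30:00',
#         'published_file': 'https://.../published_fabs.csv?signature=...'
#     }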
def revert_to_certified(submission, file_manager): """ Revert an updated DABS submission to its last certified state Args: submission: the submission to be reverted file_manager: a FileHandler object to be used to call revert_certified_error_files and determine is_local Returns: A JsonResponse containing a success message Raises: ResponseException: if submission provided is a FABS submission or is not in an "updated" status """ if submission.d2_submission: raise ResponseException('Submission must be a DABS submission.', status=StatusCode.CLIENT_ERROR) if submission.publish_status_id != PUBLISH_STATUS_DICT['updated']: raise ResponseException('Submission has not been certified or has not been updated since certification.', status=StatusCode.CLIENT_ERROR) sess = GlobalDB.db().session move_certified_data(sess, submission.submission_id, direction='revert') # Copy file paths from certified_files_history max_cert_history = sess.query(func.max(CertifyHistory.certify_history_id), func.max(CertifyHistory.updated_at)).\ filter(CertifyHistory.submission_id == submission.submission_id).one() remove_timestamp = [str(FILE_TYPE_DICT['appropriations']), str(FILE_TYPE_DICT['program_activity']), str(FILE_TYPE_DICT['award_financial'])] if file_manager.is_local: filepath = CONFIG_BROKER['broker_files'] ef_path = '' else: filepath = '{}/'.format(submission.submission_id) ef_path = filepath remove_timestamp.extend([str(FILE_TYPE_DICT['executive_compensation']), str(FILE_TYPE_DICT['sub_award'])]) # Certified filename -> Job filename, original filename # Local: # A/B/C: # filename -> '[broker_files dir][certified file base name]' # original_filename -> '[certified file base name without the timestamp]' # D1/D2: # filename -> '[broker_files dir][certified file base name]' # original_filename -> '[certified file base name]' # E/F: # filename -> '[certified file base name]' # original_filename -> '[certified file base name]' # Remote: # A/B/C/E/F: # filename -> '[submission_id]/[certified file base name]' # original_filename -> '[certified file base name without the timestamp]' # D1/D2: # filename -> '[submission_id dir][certified file base name]' # original_filename -> '[certified file base name]' update_string = """ WITH filenames AS ( SELECT REVERSE(SPLIT_PART(REVERSE(filename), '/', 1)) AS simple_name, file_type_id FROM certified_files_history WHERE certify_history_id = {history_id} ) UPDATE job SET filename = CASE WHEN job.file_type_id NOT IN (6, 7) THEN '{filepath}' ELSE '{ef_path}' END || simple_name, original_filename = CASE WHEN job.file_type_id NOT IN ({remove_timestamp}) THEN simple_name ELSE substring(simple_name, position('_' in simple_name) + 1) END FROM filenames WHERE job.file_type_id = filenames.file_type_id AND job.submission_id = {submission_id}; """.format(history_id=max_cert_history[0], filepath=filepath, ef_path=ef_path, remove_timestamp=', '.join(remove_timestamp), submission_id=submission.submission_id) sess.execute(update_string) # Set errors/warnings for the submission submission.number_of_errors = 0 submission.number_of_warnings =\ sess.query(func.coalesce(func.sum(CertifiedErrorMetadata.occurrences), 0).label('total_warnings')).\ join(Job, CertifiedErrorMetadata.job_id == Job.job_id).\ filter(Job.submission_id == submission.submission_id).one().total_warnings submission.publishable = True # Set default numbers/status/last validation date for jobs then update warnings sess.query(Job).filter_by(submission_id=submission.submission_id).\ update({'number_of_errors': 0, 'number_of_warnings': 0, 
'job_status_id': JOB_STATUS_DICT['finished'], 'last_validated': max_cert_history[1], 'error_message': None, 'file_generation_id': None}) # Get list of jobs so we can update them job_list = sess.query(Job).\ filter(Job.submission_id == submission.submission_id, Job.job_type_id.in_([JOB_TYPE_DICT['csv_record_validation'], JOB_TYPE_DICT['validation']]), Job.file_type_id.notin_([FILE_TYPE_DICT['sub_award'], FILE_TYPE_DICT['executive_compensation']])).all() # Fixing File table job_ids = [str(job.job_id) for job in job_list] update_string = """ UPDATE file SET filename = job.filename, file_status_id = 1, headers_missing = NULL, headers_duplicated = NULL FROM job WHERE job.job_id = file.job_id AND job.job_id IN ({job_ids}); """.format(job_ids=', '.join(job_ids)) sess.execute(update_string) file_type_mapping = { FILE_TYPE_DICT['appropriations']: CertifiedAppropriation, FILE_TYPE_DICT['program_activity']: CertifiedObjectClassProgramActivity, FILE_TYPE_DICT['award_financial']: CertifiedAwardFinancial, FILE_TYPE_DICT['award']: CertifiedAwardFinancialAssistance, FILE_TYPE_DICT['award_procurement']: CertifiedAwardProcurement } # Update the number of warnings for each job in the list for job in job_list: job.number_of_warnings = sess.query(func.coalesce(func.sum(CertifiedErrorMetadata.occurrences), 0). label('total_warnings')). \ filter_by(job_id=job.job_id).one().total_warnings # For non-cross-file jobs, also update the row count and file size if job.job_type_id != JOB_TYPE_DICT['validation']: file_type_model = file_type_mapping[job.file_type_id] total_rows = sess.query(file_type_model).filter_by(submission_id=submission.submission_id).count() job.number_of_rows = total_rows + 1 job.number_of_rows_valid = total_rows if file_manager.is_local: # local file size try: job.file_size = os.path.getsize(job.filename) except: logger.warning("File doesn't exist locally: %s", job.filename) job.file_size = 0 else: # boto file size job.file_size = S3Handler.get_file_size(job.filename) # Set submission to certified status submission.publish_status_id = PUBLISH_STATUS_DICT['published'] sess.commit() # Move warning files back non-locally and clear out error files for all environments file_manager.revert_certified_error_files(sess, max_cert_history[0]) return JsonResponse.create(StatusCode.OK, {'message': 'Submission {} successfully reverted to certified status.'. format(submission.submission_id)})
def run_cross_validation(self, job):
    """ Cross file validation job. Test all rules with matching rule_timing. Run each cross-file rule and
        create error report.

        Args:
            job: Current job
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    # Create File Status object
    create_file_if_needed(job_id)
    # Create list of errors
    error_list = ErrorInterface()

    submission_id = job.submission_id
    job_start = datetime.now()
    logger.info({
        'message': 'Beginning cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job.job_id,
        'action': 'run_cross_validations', 'start': job_start, 'status': 'start'
    })

    # Delete existing cross file errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()

    # get all cross file rules from db
    cross_file_rules = sess.query(RuleSql).filter_by(rule_cross_file_flag=True)

    # for each cross-file combo, run associated rules and create error report
    for c in get_cross_file_pairs():
        first_file = c[0]
        second_file = c[1]
        combo_rules = cross_file_rules.filter(or_(and_(RuleSql.file_id == first_file.id,
                                                       RuleSql.target_file_id == second_file.id),
                                                  and_(RuleSql.file_id == second_file.id,
                                                       RuleSql.target_file_id == first_file.id)))

        # get error file name/path
        error_file_name = report_file_name(submission_id, False, first_file.name, second_file.name)
        error_file_path = "".join([CONFIG_SERVICES['error_report_path'], error_file_name])
        warning_file_name = report_file_name(submission_id, True, first_file.name, second_file.name)
        warning_file_path = "".join([CONFIG_SERVICES['error_report_path'], warning_file_name])

        # open error report and gather failed rules within it
        with open(error_file_path, 'w', newline='') as error_file,\
                open(warning_file_path, 'w', newline='') as warning_file:
            error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

            # write headers to file
            error_csv.writerow(self.crossFileReportHeaders)
            warning_csv.writerow(self.crossFileReportHeaders)

            # send comboRules to validator.crossValidate sql
            current_cols_short_to_long = self.short_to_long_dict[first_file.id].copy()
            current_cols_short_to_long.update(self.short_to_long_dict[second_file.id].copy())
            cross_validate_sql(combo_rules.all(), submission_id, current_cols_short_to_long, first_file.id,
                               second_file.id, job, error_csv, warning_csv, error_list, job_id)
        # close files
        error_file.close()
        warning_file.close()

        # stream file to S3 when not local
        if not self.is_local:
            # stream error file
            with open(error_file_path, 'rb') as csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)),
                                           'w') as writer:
                    while True:
                        chunk = csv_file.read(CHUNK_SIZE)
                        if chunk:
                            writer.write(chunk)
                        else:
                            break
            csv_file.close()
            os.remove(error_file_path)

            # stream warning file
            with open(warning_file_path, 'rb') as warning_csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)),
                                           'w') as warning_writer:
                    while True:
                        chunk = warning_csv_file.read(CHUNK_SIZE)
                        if chunk:
                            warning_writer.write(chunk)
                        else:
                            break
            warning_csv_file.close()
            os.remove(warning_file_path)

    # write all recorded errors to database
    error_list.write_all_row_errors(job_id)
    # Update error info for submission
    populate_job_error_info(job)

    # mark job status as "finished"
    mark_job_status(job_id, "finished")
    job_duration = (datetime.now() - job_start).total_seconds()
    logger.info({
        'message': 'Completed cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job.job_id,
        'action': 'run_cross_validations', 'status': 'finish', 'start': job_start, 'duration': job_duration
    })
    # set number of errors and warnings for submission.
    submission = populate_submission_error_info(submission_id)
    # TODO: Remove temporary step below
    # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their
    # submissions as publishable
    # Publish only if no errors are present
    if submission.number_of_errors == 0:
        submission.publishable = True
    sess.commit()

    # Mark validation complete
    mark_file_complete(job_id)
def run_validation(self, job): """ Run validations for specified job Args: job: Job to be validated Returns: True if successful """ sess = GlobalDB.db().session error_list = ErrorInterface() job_id = job.job_id submission_id = job.submission_id row_number = 1 file_type = job.file_type.name validation_start = datetime.now() log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format( str(submission_id), str(job_id), file_type) logger.info({ 'message': 'Beginning run_validation {}'.format(log_str), 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'run_validations', 'status': 'start', 'start_time': validation_start }) # Get orm model for this file model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0] # Delete existing file level errors for this submission sess.query(ErrorMetadata).filter( ErrorMetadata.job_id == job_id).delete() sess.commit() # Clear existing records for this submission sess.query(model).filter_by(submission_id=submission_id).delete() sess.commit() # Clear existing flex fields for this job sess.query(FlexField).filter_by(job_id=job_id).delete() sess.commit() # If local, make the error report directory if self.is_local and not os.path.exists(self.directory): os.makedirs(self.directory) # Get bucket name and file name file_name = job.filename bucket_name = CONFIG_BROKER['aws_bucket'] region_name = CONFIG_BROKER['aws_region'] error_file_name = report_file_name(job.submission_id, False, job.file_type.name) error_file_path = "".join( [CONFIG_SERVICES['error_report_path'], error_file_name]) warning_file_name = report_file_name(job.submission_id, True, job.file_type.name) warning_file_path = "".join( [CONFIG_SERVICES['error_report_path'], warning_file_name]) # Create File Status object create_file_if_needed(job_id, file_name) reader = CsvReader() # Get file size and write to jobs table if CONFIG_BROKER["use_aws"]: file_size = S3Handler.get_file_size(file_name) else: file_size = os.path.getsize(file_name) job.file_size = file_size sess.commit() # Get fields for this file fields = sess.query(FileColumn).filter( FileColumn.file_id == FILE_TYPE_DICT[file_type]).all() for field in fields: sess.expunge(field) csv_schema = {row.name_short: row for row in fields} try: extension = os.path.splitext(file_name)[1] if not extension or extension.lower() not in ['.csv', '.txt']: raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError) # Count file rows: throws a File Level Error for non-UTF8 characters temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8') file_row_count = len(list(csv.reader(temp_file))) try: temp_file.close() except AttributeError: # File does not exist, and so does not need to be closed pass # Pull file and return info on whether it's using short or long col headers reader.open_file(region_name, bucket_name, file_name, fields, bucket_name, self.get_file_name(error_file_name), self.long_to_short_dict[job.file_type_id], is_local=self.is_local) # list to keep track of rows that fail validations error_rows = [] # While not done, pull one row and put it into staging table if it passes # the Validator loading_start = datetime.now() logger.info({ 'message': 'Beginning data loading {}'.format(log_str), 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'start', 'start_time': loading_start }) with open(error_file_path, 'w', newline='') as 
error_file,\ open(warning_file_path, 'w', newline='') as warning_file: error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n') warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n') required_list = None type_list = None if file_type == "fabs": # create a list of all required/type labels for FABS labels = sess.query(ValidationLabel).all() required_list = {} type_list = {} for label in labels: if label.label_type == "requirement": required_list[label.column_name] = label.label else: type_list[label.column_name] = label.label # write headers to file error_csv.writerow(self.reportHeaders) warning_csv.writerow(self.reportHeaders) while not reader.is_finished: row_number += 1 if row_number % 100 == 0: elapsed_time = (datetime.now() - loading_start).total_seconds() logger.info({ 'message': 'Loading row: {} {}'.format( str(row_number), log_str), 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'loading', 'rows_loaded': row_number, 'start_time': loading_start, 'elapsed_time': elapsed_time }) # # first phase of validations: read record and record a # formatting error if there's a problem # (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \ self.read_record(reader, error_csv, row_number, job, fields, error_list) if reduceRow: row_number -= 1 if rowErrorHere: error_rows.append(row_number) if doneReading: # Stop reading from input file break elif skip_row: # Do not write this row to staging, but continue processing future rows continue # # second phase of validations: do basic schema checks # (e.g., require fields, field length, data type) # # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic # validations, so these validations are not repeated here if file_type in ["award", "award_procurement"]: # Skip basic validations for D files, set as valid to trigger write to staging passed_validations = True valid = True else: if file_type == "fabs": record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + "_" +\ (record['awarding_sub_tier_agency_c'] or '-none-') + \ "_" + (record['fain'] or '-none-') + "_" + \ (record['uri'] or '-none-') passed_validations, failures, valid = Validator.validate( record, csv_schema, file_type == "fabs", required_list, type_list) if valid: # todo: update this logic later when we have actual validations if file_type == "fabs": record["is_valid"] = True model_instance = model(job_id=job_id, submission_id=submission_id, valid_record=passed_validations, **record) skip_row = not insert_staging_model( model_instance, job, error_csv, error_list) if flex_cols: sess.add_all(flex_cols) sess.commit() if skip_row: error_rows.append(row_number) continue if not passed_validations: fatal = write_errors( failures, job, self.short_to_long_dict[job.file_type_id], error_csv, warning_csv, row_number, error_list, flex_cols) if fatal: error_rows.append(row_number) loading_duration = (datetime.now() - loading_start).total_seconds() logger.info({ 'message': 'Completed data loading {}'.format(log_str), 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'finish', 'start_time': loading_start, 'end_time': datetime.now(), 'duration': loading_duration, 'total_rows': row_number }) if file_type in ('appropriations', 'program_activity', 'award_financial'): 
update_tas_ids(model, submission_id) # # third phase of validations: run validation rules as specified # in the schema guidance. these validations are sql-based. # sql_error_rows = self.run_sql_validations( job, file_type, self.short_to_long_dict[job.file_type_id], error_csv, warning_csv, row_number, error_list) error_rows.extend(sql_error_rows) error_file.close() warning_file.close() # stream file to S3 when not local if not self.is_local: # stream error file with open(error_file_path, 'rb') as csv_file: with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)), 'w')\ as writer: while True: chunk = csv_file.read(CHUNK_SIZE) if chunk: writer.write(chunk) else: break csv_file.close() os.remove(error_file_path) # stream warning file with open(warning_file_path, 'rb') as warning_csv_file: with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)), 'w')\ as warning_writer: while True: chunk = warning_csv_file.read(CHUNK_SIZE) if chunk: warning_writer.write(chunk) else: break warning_csv_file.close() os.remove(warning_file_path) # Calculate total number of rows in file # that passed validations error_rows_unique = set(error_rows) total_rows_excluding_header = row_number - 1 valid_rows = total_rows_excluding_header - len(error_rows_unique) # Update fabs is_valid rows where applicable # Update submission to include action dates where applicable if file_type == "fabs": sess.query(DetachedAwardFinancialAssistance).\ filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique), DetachedAwardFinancialAssistance.submission_id == submission_id).\ update({"is_valid": False}, synchronize_session=False) sess.commit() min_action_date, max_action_date = get_action_dates( submission_id) sess.query(Submission).filter(Submission.submission_id == submission_id).\ update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date}, synchronize_session=False) # Ensure validated rows match initial row count if file_row_count != row_number: raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError) # Update job metadata job.number_of_rows = row_number job.number_of_rows_valid = valid_rows sess.commit() error_list.write_all_row_errors(job_id) # Update error info for submission populate_job_error_info(job) if file_type == "fabs": # set number of errors and warnings for detached submission populate_submission_error_info(submission_id) # Mark validation as finished in job tracker mark_job_status(job_id, "finished") mark_file_complete(job_id, file_name) finally: # Ensure the files always close reader.close() validation_duration = (datetime.now() - validation_start).total_seconds() logger.info({ 'message': 'Completed run_validation {}'.format(log_str), 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'run_validation', 'status': 'finish', 'start_time': validation_start, 'end_time': datetime.now(), 'duration': validation_duration }) return True
def copy_file_generation_to_job(job, file_generation, is_local):
    """ Copy cached FileGeneration data to a Job requesting a file.

        Args:
            job: Job object to copy the data to
            file_generation: Cached FileGeneration object to copy the data from
            is_local: A boolean flag indicating whether the application is being run locally or not
    """
    sess = GlobalDB.db().session
    log_data = {
        'message': 'Copying FileGeneration {} data to Job {}'.format(file_generation.file_generation_id, job.job_id),
        'message_type': 'BrokerInfo', 'job_id': job.job_id, 'file_type': job.file_type.name,
        'file_generation_id': file_generation.file_generation_id}
    logger.info(log_data)

    # Do not edit submissions that have already successfully completed
    sess.refresh(job)
    if job.job_status_id == lookups.JOB_STATUS_DICT['finished']:
        return

    job.file_generation_id = file_generation.file_generation_id

    # File is still being generated, just mark the FileGeneration ID in the Job and wait
    # FileGeneration will update all child Jobs when it finishes
    if not file_generation.file_path:
        sess.commit()
        return

    # Generate file path for child Job's filename
    filepath = CONFIG_BROKER['broker_files'] if g.is_local else "{}/".format(str(job.submission_id))
    original_filename = file_generation.file_path.split('/')[-1]
    filename = '{}{}'.format(filepath, original_filename)

    # Copy parent job's data
    job.filename = filename
    job.original_filename = original_filename
    job.number_of_errors = 0
    job.number_of_warnings = 0

    # Change the validation job's file data when within a submission
    if job.submission_id is not None:
        val_job = sess.query(Job).filter(Job.submission_id == job.submission_id,
                                         Job.file_type_id == job.file_type_id,
                                         Job.job_type_id == lookups.JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.filename = filename
        val_job.original_filename = original_filename

    # Copy the data to the Submission's bucket
    if not g.is_local and file_generation.file_path != job.filename:
        # Check to see if the same file exists in the child bucket
        s3 = boto3.client('s3', region_name=CONFIG_BROKER["aws_region"])
        bucket = CONFIG_BROKER['aws_bucket']
        response = s3.list_objects_v2(Bucket=bucket, Prefix=job.filename)
        for obj in response.get('Contents', []):
            if obj['Key'] == job.filename:
                # The file already exists in this location
                log_data['message'] = '{} file already exists in this location: {}; not overwriting.'.format(
                    job.file_type.name, job.filename)
                logger.info(log_data)
                mark_job_status(job.job_id, 'finished')
                return

        S3Handler.copy_file(bucket, bucket, file_generation.file_path, job.filename)
    sess.commit()

    # Mark Job status last so the validation job doesn't start until everything is done
    mark_job_status(job.job_id, 'finished')
def run_validation(self, job): """ Run validations for specified job Args: job: Job to be validated Returns: True if successful """ sess = GlobalDB.db().session self.job = job self.submission_id = job.submission_id self.file_type = job.file_type self.file_name = job.filename self.is_fabs = (self.file_type.name == 'fabs') # initializing processing metadata vars for a new validation self.reader = CsvReader() self.error_list = ErrorInterface() self.error_rows = [] self.max_row_number = 1 self.total_rows = 0 self.short_rows = [] self.long_rows = [] validation_start = datetime.now() bucket_name = CONFIG_BROKER['aws_bucket'] region_name = CONFIG_BROKER['aws_region'] self.log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format( str(self.submission_id), str(self.job.job_id), self.file_type.name) logger.info({ 'message': 'Beginning run_validation {}'.format(self.log_str), 'message_type': 'ValidatorInfo', 'submission_id': self.submission_id, 'job_id': self.job.job_id, 'file_type': self.file_type.name, 'action': 'run_validations', 'status': 'start', 'start_time': validation_start }) # Get orm model for this file self.model = [ft.model for ft in FILE_TYPE if ft.name == self.file_type.name][0] # Delete existing file level errors for this submission sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == self.job.job_id).delete() sess.commit() # Clear existing records for this submission sess.query(self.model).filter_by(submission_id=self.submission_id).delete() sess.commit() # Clear existing flex fields for this job sess.query(FlexField).filter_by(job_id=self.job.job_id).delete() sess.commit() # If local, make the error report directory if self.is_local and not os.path.exists(self.directory): os.makedirs(self.directory) create_file_if_needed(self.job.job_id, self.file_name) # Get file size and write to jobs table if CONFIG_BROKER['use_aws']: file_size = S3Handler.get_file_size(self.file_name) else: file_size = os.path.getsize(self.file_name) self.job.file_size = file_size sess.commit() # Get fields for this file self.fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[self.file_type.name])\ .order_by(FileColumn.daims_name.asc()).all() self.expected_headers, self.parsed_fields = parse_fields(sess, self.fields) self.csv_schema = {row.name_short: row for row in self.fields} try: # Loading data and initial validations self.load_file_data(sess, bucket_name, region_name) if self.file_type.name in ('appropriations', 'program_activity', 'award_financial'): update_tas_ids(self.model, self.submission_id) # SQL Validations with open(self.error_file_path, 'a', newline='') as error_file, \ open(self.warning_file_path, 'a', newline='') as warning_file: error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n') warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n') # third phase of validations: run validation rules as specified in the schema guidance. These # validations are sql-based. 
sql_error_rows = self.run_sql_validations(self.short_to_long_dict[self.file_type.file_type_id], error_csv, warning_csv) self.error_rows.extend(sql_error_rows) error_file.close() warning_file.close() # stream file to S3 when not local if not self.is_local: s3_resource = boto3.resource('s3', region_name=region_name) # stream error file with open(self.error_file_path, 'rb') as csv_file: s3_resource.Object(bucket_name, self.get_file_name(self.error_file_name)).put(Body=csv_file) csv_file.close() os.remove(self.error_file_path) # stream warning file with open(self.warning_file_path, 'rb') as warning_csv_file: s3_resource.Object(bucket_name, self.get_file_name(self.warning_file_name)).put(Body=warning_csv_file) warning_csv_file.close() os.remove(self.warning_file_path) # Calculate total number of rows in file that passed validations error_rows_unique = set(self.error_rows) total_rows_excluding_header = self.total_rows - 1 valid_rows = total_rows_excluding_header - len(error_rows_unique) # Update fabs is_valid rows where applicable # Update submission to include action dates where applicable if self.is_fabs: sess.query(DetachedAwardFinancialAssistance). \ filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique), DetachedAwardFinancialAssistance.submission_id == self.submission_id). \ update({'is_valid': False}, synchronize_session=False) sess.commit() min_action_date, max_action_date = get_action_dates(self.submission_id) sess.query(Submission).filter(Submission.submission_id == self.submission_id). \ update({'reporting_start_date': min_action_date, 'reporting_end_date': max_action_date}, synchronize_session=False) # Update job metadata self.job.number_of_rows = self.total_rows self.job.number_of_rows_valid = valid_rows sess.commit() self.error_list.write_all_row_errors(self.job.job_id) # Update error info for submission populate_job_error_info(self.job) if self.is_fabs: # set number of errors and warnings for detached submission populate_submission_error_info(self.submission_id) # Mark validation as finished in job tracker mark_job_status(self.job.job_id, 'finished') mark_file_complete(self.job.job_id, self.file_name) except Exception: logger.error({ 'message': 'An exception occurred during validation', 'message_type': 'ValidatorInfo', 'submission_id': self.submission_id, 'job_id': self.job.job_id, 'file_type': self.file_type.name, 'traceback': traceback.format_exc() }) raise finally: # Ensure the files always close self.reader.close() validation_duration = (datetime.now()-validation_start).total_seconds() logger.info({ 'message': 'Completed run_validation {}'.format(self.log_str), 'message_type': 'ValidatorInfo', 'submission_id': self.submission_id, 'job_id': self.job.job_id, 'file_type': self.file_type.name, 'action': 'run_validation', 'status': 'finish', 'start_time': validation_start, 'end_time': datetime.now(), 'duration': validation_duration }) return True
def run_validation(self, job): """ Run validations for specified job Args: job: Job to be validated Returns: True if successful """ sess = GlobalDB.db().session job_id = job.job_id error_list = ErrorInterface() submission_id = job.submission_id row_number = 1 file_type = job.file_type.name validation_start = datetime.now() logger.info( { 'message': 'Beginning run_validation on submission_id: ' + str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type, 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'run_validations', 'status': 'start', 'start_time': validation_start}) # Get orm model for this file model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0] # Delete existing file level errors for this submission sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete() sess.commit() # Clear existing records for this submission sess.query(model).filter_by(submission_id=submission_id).delete() sess.commit() # Clear existing flex fields for this job sess.query(FlexField).filter_by(job_id=job_id).delete() sess.commit() # If local, make the error report directory if self.isLocal and not os.path.exists(self.directory): os.makedirs(self.directory) # Get bucket name and file name file_name = job.filename bucket_name = CONFIG_BROKER['aws_bucket'] region_name = CONFIG_BROKER['aws_region'] error_file_name = self.get_file_name(report_file_name(job.submission_id, False, job.file_type.name)) warning_file_name = self.get_file_name(report_file_name(job.submission_id, True, job.file_type.name)) # Create File Status object create_file_if_needed(job_id, file_name) reader = self.get_reader() # Get file size and write to jobs table if CONFIG_BROKER["use_aws"]: file_size = S3Handler.get_file_size(file_name) else: file_size = os.path.getsize(file_name) job.file_size = file_size sess.commit() # Get fields for this file fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[file_type]).all() for field in fields: sess.expunge(field) csv_schema = {row.name_short: row for row in fields} try: extension = os.path.splitext(file_name)[1] if not extension or extension not in ['.csv', '.txt']: raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError) # Count file rows: throws a File Level Error for non-UTF8 characters temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8') file_row_count = len(list(csv.reader(temp_file))) try: temp_file.close() except AttributeError: # File does not exist, and so does not need to be closed pass # Pull file and return info on whether it's using short or long col headers reader.open_file(region_name, bucket_name, file_name, fields, bucket_name, error_file_name, self.long_to_short_dict, is_local=self.isLocal) # list to keep track of rows that fail validations error_rows = [] # While not done, pull one row and put it into staging table if it passes # the Validator loading_start = datetime.now() logger.info( { 'message': 'Beginning data loading on submission_id: ' + str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type, 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'start', 'start_time': loading_start}) with self.get_writer(region_name, bucket_name, error_file_name, self.reportHeaders) as writer, \ self.get_writer(region_name, bucket_name, warning_file_name, self.reportHeaders) 
as warning_writer: while not reader.is_finished: row_number += 1 if row_number % 100 == 0: elapsed_time = (datetime.now()-loading_start).total_seconds() logger.info( { 'message': 'Loading row: ' + str(row_number) + ' on submission_id: ' + str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type, 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'loading', 'rows_loaded': row_number, 'start_time': loading_start, 'elapsed_time': elapsed_time}) # # first phase of validations: read record and record a # formatting error if there's a problem # (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \ self.read_record(reader, writer, row_number, job, fields, error_list) if reduceRow: row_number -= 1 if rowErrorHere: error_rows.append(row_number) if doneReading: # Stop reading from input file break elif skip_row: # Do not write this row to staging, but continue processing future rows continue # # second phase of validations: do basic schema checks # (e.g., require fields, field length, data type) # # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic # validations, so these validations are not repeated here if file_type in ["award", "award_procurement"]: # Skip basic validations for D files, set as valid to trigger write to staging passed_validations = True valid = True else: if file_type in ["detached_award"]: record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + \ (record['awarding_sub_tier_agency_c'] or '-none-') + \ (record['fain'] or '-none-') + (record['uri'] or '-none-') passed_validations, failures, valid = Validator.validate(record, csv_schema, file_type in ["detached_award"]) if valid: # todo: update this logic later when we have actual validations if file_type in ["detached_award"]: record["is_valid"] = True model_instance = model(job_id=job_id, submission_id=submission_id, valid_record=passed_validations, **record) skip_row = not insert_staging_model(model_instance, job, writer, error_list) if flex_cols: sess.add_all(flex_cols) sess.commit() if skip_row: error_rows.append(row_number) continue if not passed_validations: fatal = write_errors(failures, job, self.short_to_long_dict, writer, warning_writer, row_number, error_list) if fatal: error_rows.append(row_number) loading_duration = (datetime.now()-loading_start).total_seconds() logger.info( { 'message': 'Completed data loading on submission_id: ' + str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type, 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'finish', 'start_time': loading_start, 'end_time': datetime.now(), 'duration': loading_duration, 'total_rows': row_number }) if file_type in ('appropriations', 'program_activity', 'award_financial'): update_tas_ids(model, submission_id) # # third phase of validations: run validation rules as specified # in the schema guidance. these validations are sql-based. 
# sql_error_rows = self.run_sql_validations(job, file_type, self.short_to_long_dict, writer, warning_writer, row_number, error_list) error_rows.extend(sql_error_rows) # Write unfinished batch writer.finish_batch() warning_writer.finish_batch() # Calculate total number of rows in file # that passed validations error_rows_unique = set(error_rows) total_rows_excluding_header = row_number - 1 valid_rows = total_rows_excluding_header - len(error_rows_unique) # Update detached_award is_valid rows where applicable # Update submission to include action dates where applicable if file_type in ["detached_award"]: sess.query(DetachedAwardFinancialAssistance).\ filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique), DetachedAwardFinancialAssistance.submission_id == submission_id).\ update({"is_valid": False}, synchronize_session=False) sess.commit() min_action_date, max_action_date = get_action_dates(submission_id) sess.query(Submission).filter(Submission.submission_id == submission_id).\ update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date}, synchronize_session=False) # Ensure validated rows match initial row count if file_row_count != row_number: raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError) # Update job metadata job.number_of_rows = row_number job.number_of_rows_valid = valid_rows sess.commit() error_list.write_all_row_errors(job_id) # Update error info for submission populate_job_error_info(job) if file_type in ["detached_award"]: # set number of errors and warnings for detached submission populate_submission_error_info(submission_id) # Mark validation as finished in job tracker mark_job_status(job_id, "finished") mark_file_complete(job_id, file_name) finally: # Ensure the file always closes reader.close() validation_duration = (datetime.now()-validation_start).total_seconds() logger.info( { 'message': 'Completed run_validation on submission_id: ' + str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type, 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'run_validation', 'status': 'finish', 'start_time': validation_start, 'end_time': datetime.now(), 'duration': validation_duration }) return True
def copy_parent_file_request_data(sess, child_job, parent_job, is_local):
    """ Copy parent FileRequest job data to the child FileRequest job data.

        Args:
            sess: current DB session
            child_job: Job object for the child FileRequest
            parent_job: Job object for the parent FileRequest
            is_local: True if in local development, False otherwise
    """
    file_type = parent_job.file_type.letter_name
    log_data = {'message': 'Copying data from parent job with job_id:{}'.format(parent_job.job_id),
                'message_type': 'ValidatorInfo', 'job_id': child_job.job_id, 'file_type': parent_job.file_type.name}

    # Keep path but update file name
    filename = '{}/{}'.format(child_job.filename.rsplit('/', 1)[0], parent_job.original_filename)

    # Copy parent job's data
    child_job.from_cached = True
    child_job.filename = filename
    child_job.original_filename = parent_job.original_filename
    child_job.number_of_errors = parent_job.number_of_errors
    child_job.number_of_warnings = parent_job.number_of_warnings
    child_job.error_message = parent_job.error_message

    # Change the validation job's file data when within a submission
    if child_job.submission_id is not None:
        val_job = sess.query(Job).filter(Job.submission_id == child_job.submission_id,
                                         Job.file_type_id == parent_job.file_type_id,
                                         Job.job_type_id == JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.filename = filename
        val_job.original_filename = parent_job.original_filename
    sess.commit()

    if not is_local and parent_job.filename != child_job.filename:
        # Check to see if the same file exists in the child bucket
        s3 = boto3.client('s3', region_name=CONFIG_BROKER["aws_region"])
        response = s3.list_objects_v2(Bucket=CONFIG_BROKER['aws_bucket'], Prefix=child_job.filename)
        for obj in response.get('Contents', []):
            if obj['Key'] == child_job.filename:
                # The file already exists in this location
                log_data['message'] = 'Cached {} file CSV already exists in this location'.format(file_type)
                logger.info(log_data)
                return

        # Copy the parent file into the child's S3 location
        log_data['message'] = 'Copying the cached {} file from job {}'.format(file_type, parent_job.job_id)
        logger.info(log_data)

        with smart_open.smart_open(S3Handler.create_file_path(parent_job.filename), 'r') as reader:
            stream_file_to_s3(child_job.filename, reader)

    # Mark job status last so the validation job doesn't start until everything is done
    mark_job_status(child_job.job_id, JOB_STATUS_DICT_ID[parent_job.job_status_id])
max_cert_id = sess.query(func.max(CertifyHistory.certify_history_id).label('cert_id')). \
    filter_by(submission_id=submission_id).one()
route_vars = [agency_code, submission.reporting_fiscal_year, submission.reporting_fiscal_period // 3,
              max_pub_id.pub_id]
new_route = '/'.join([str(var) for var in route_vars]) + '/'
if not is_local:
    old_path = '{}/{}'.format(str(submission_id), filename)
    new_path = new_route + filename
    # Copy the file if it's a non-local submission
    S3Handler().copy_file(original_bucket=CONFIG_BROKER['aws_bucket'],
                          new_bucket=CONFIG_BROKER['certified_bucket'],
                          original_path=old_path, new_path=new_path)
else:
    new_path = "".join([CONFIG_BROKER['broker_files'], filename])

# add published history
file_history = PublishedFilesHistory(publish_history_id=max_pub_id.pub_id, certify_history_id=max_cert_id.cert_id,
                                     submission_id=submission_id, filename=new_path, file_type_id=None,
                                     comment=None, warning_filename=None)
sess.add(file_history)
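
# For reference, a hypothetical key layout produced by the route built above (illustrative values only):
# with agency_code='020', reporting_fiscal_year=2021, reporting_fiscal_period=6 (6 // 3 = quarter 2),
# and publish_history_id=45, the route would be
#     new_route = '020/2021/2/45/'
# so a file named 'file_A.csv' would be copied to '020/2021/2/45/file_A.csv' in the certified bucket.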