def run_cross_validation(self, job):
    """ Cross file validation job. Test all rules with matching rule_timing.
        Run each cross-file rule and create error report.

        For every cross-file pair, the matching cross-file SQL rules are run and the
        resulting errors/warnings are written to local CSV report files. When not
        running locally, the finished reports are streamed to S3 in CHUNK_SIZE pieces
        and the local copies removed.

        Args:
            job: Current job (provides job_id and submission_id)
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    # Create File Status object
    create_file_if_needed(job_id)
    # Create list of errors
    error_list = ErrorInterface()
    submission_id = job.submission_id
    job_start = datetime.now()
    logger.info({
        'message': 'Beginning cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job.job_id,
        'action': 'run_cross_validations',
        'start': job_start,
        'status': 'start'
    })

    # Delete existing cross file errors for this submission so reruns start clean
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()

    # get all cross file rules from db (base query; filtered per file pair below)
    cross_file_rules = sess.query(RuleSql).filter_by(rule_cross_file_flag=True)

    # for each cross-file combo, run associated rules and create error report
    for c in get_cross_file_pairs():
        first_file = c[0]
        second_file = c[1]
        # rules apply in either direction between the two files of the pair
        combo_rules = cross_file_rules.filter(or_(
            and_(RuleSql.file_id == first_file.id, RuleSql.target_file_id == second_file.id),
            and_(RuleSql.file_id == second_file.id, RuleSql.target_file_id == first_file.id)))

        # get error file name/path
        error_file_name = report_file_name(submission_id, False, first_file.name, second_file.name)
        error_file_path = "".join([CONFIG_SERVICES['error_report_path'], error_file_name])
        warning_file_name = report_file_name(submission_id, True, first_file.name, second_file.name)
        warning_file_path = "".join([CONFIG_SERVICES['error_report_path'], warning_file_name])

        # open error report and gather failed rules within it; the with-statement
        # closes both files, so no explicit close() calls are needed afterwards
        with open(error_file_path, 'w', newline='') as error_file, \
                open(warning_file_path, 'w', newline='') as warning_file:
            error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

            # write headers to file
            error_csv.writerow(self.crossFileReportHeaders)
            warning_csv.writerow(self.crossFileReportHeaders)

            # send comboRules to validator.crossValidate sql; merge the short->long
            # column mappings of both files so either side's columns resolve
            current_cols_short_to_long = self.short_to_long_dict[first_file.id].copy()
            current_cols_short_to_long.update(self.short_to_long_dict[second_file.id].copy())
            cross_validate_sql(combo_rules.all(), submission_id, current_cols_short_to_long, first_file.id,
                               second_file.id, job, error_csv, warning_csv, error_list, job_id)

        # stream file to S3 when not local
        if not self.is_local:
            # stream error file
            # NOTE(review): the source is opened 'rb' but smart_open is opened 'w'
            # (text mode) — presumably smart_open accepts bytes here; confirm
            with open(error_file_path, 'rb') as csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)),
                                           'w') as writer:
                    while True:
                        chunk = csv_file.read(CHUNK_SIZE)
                        if not chunk:
                            break
                        writer.write(chunk)
            os.remove(error_file_path)

            # stream warning file
            with open(warning_file_path, 'rb') as warning_csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)),
                                           'w') as warning_writer:
                    while True:
                        chunk = warning_csv_file.read(CHUNK_SIZE)
                        if not chunk:
                            break
                        warning_writer.write(chunk)
            os.remove(warning_file_path)

    # write all recorded errors to database
    error_list.write_all_row_errors(job_id)
    # Update error info for submission
    populate_job_error_info(job)
    # mark job status as "finished"
    mark_job_status(job_id, "finished")
    job_duration = (datetime.now() - job_start).total_seconds()
    logger.info({
        'message': 'Completed cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job.job_id,
        'action': 'run_cross_validations',
        'status': 'finish',
        'start': job_start,
        'duration': job_duration
    })
    # set number of errors and warnings for submission.
    submission = populate_submission_error_info(submission_id)

    # TODO: Remove temporary step below
    # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their
    # submissions as publishable
    # Publish only if no errors are present
    if submission.number_of_errors == 0:
        submission.publishable = True
    sess.commit()

    # Mark validation complete
    mark_file_complete(job_id)
def run_validation(self, job):
    """ Run validations for specified job

        Performs three phases: (1) per-row formatting checks while loading the file
        into the staging table, (2) basic schema checks (required fields, lengths,
        types), and (3) SQL-based rules from the schema guidance. Errors/warnings are
        written to local CSV reports, which are streamed to S3 when not running
        locally.

        Args:
            job: Job to be validated

        Returns:
            True if successful

        Raises:
            ResponseException: on unsupported file extension or when the validated
                row count does not match the initial file row count
    """
    sess = GlobalDB.db().session
    error_list = ErrorInterface()
    job_id = job.job_id
    submission_id = job.submission_id

    # row_number starts at 1 to account for the header row
    row_number = 1
    file_type = job.file_type.name
    validation_start = datetime.now()

    log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format(str(submission_id), str(job_id), file_type)
    logger.info({
        'message': 'Beginning run_validation {}'.format(log_str),
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job_id,
        'file_type': file_type,
        'action': 'run_validations',
        'status': 'start',
        'start_time': validation_start
    })

    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0]

    # Delete existing file level errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()
    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()
    # Clear existing flex fields for this job
    sess.query(FlexField).filter_by(job_id=job_id).delete()
    sess.commit()

    # If local, make the error report directory
    if self.is_local and not os.path.exists(self.directory):
        os.makedirs(self.directory)
    # Get bucket name and file name
    file_name = job.filename
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']

    error_file_name = report_file_name(job.submission_id, False, job.file_type.name)
    error_file_path = "".join([CONFIG_SERVICES['error_report_path'], error_file_name])
    warning_file_name = report_file_name(job.submission_id, True, job.file_type.name)
    warning_file_path = "".join([CONFIG_SERVICES['error_report_path'], warning_file_name])

    # Create File Status object
    create_file_if_needed(job_id, file_name)

    reader = CsvReader()

    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        file_size = S3Handler.get_file_size(file_name)
    else:
        file_size = os.path.getsize(file_name)
    job.file_size = file_size
    sess.commit()

    # Get fields for this file
    fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[file_type]).all()

    # detach field rows so later session operations don't touch them
    for field in fields:
        sess.expunge(field)

    csv_schema = {row.name_short: row for row in fields}

    try:
        extension = os.path.splitext(file_name)[1]
        if not extension or extension.lower() not in ['.csv', '.txt']:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError)

        # Count file rows: throws a File Level Error for non-UTF8 characters
        temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8')
        file_row_count = len(list(csv.reader(temp_file)))
        try:
            temp_file.close()
        except AttributeError:
            # File does not exist, and so does not need to be closed
            pass

        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(region_name, bucket_name, file_name, fields, bucket_name,
                         self.get_file_name(error_file_name), self.long_to_short_dict[job.file_type_id],
                         is_local=self.is_local)

        # list to keep track of rows that fail validations
        error_rows = []

        # While not done, pull one row and put it into staging table if it passes
        # the Validator
        loading_start = datetime.now()
        logger.info({
            'message': 'Beginning data loading {}'.format(log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'data_loading',
            'status': 'start',
            'start_time': loading_start
        })

        # the with-statement closes both report files; no explicit close() needed
        with open(error_file_path, 'w', newline='') as error_file, \
                open(warning_file_path, 'w', newline='') as warning_file:
            error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

            required_list = None
            type_list = None
            if file_type == "fabs":
                # create a list of all required/type labels for FABS
                labels = sess.query(ValidationLabel).all()
                required_list = {}
                type_list = {}
                for label in labels:
                    if label.label_type == "requirement":
                        required_list[label.column_name] = label.label
                    else:
                        type_list[label.column_name] = label.label

            # write headers to file
            error_csv.writerow(self.reportHeaders)
            warning_csv.writerow(self.reportHeaders)

            while not reader.is_finished:
                row_number += 1

                # periodic progress logging
                if row_number % 100 == 0:
                    elapsed_time = (datetime.now() - loading_start).total_seconds()
                    logger.info({
                        'message': 'Loading row: {} {}'.format(str(row_number), log_str),
                        'message_type': 'ValidatorInfo',
                        'submission_id': submission_id,
                        'job_id': job_id,
                        'file_type': file_type,
                        'action': 'data_loading',
                        'status': 'loading',
                        'rows_loaded': row_number,
                        'start_time': loading_start,
                        'elapsed_time': elapsed_time
                    })
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \
                    self.read_record(reader, error_csv, row_number, job, fields, error_list)
                if reduceRow:
                    row_number -= 1
                if rowErrorHere:
                    error_rows.append(row_number)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skip_row:
                    # Do not write this row to staging, but continue processing future rows
                    continue

                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                # validations, so these validations are not repeated here
                if file_type in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passed_validations = True
                    valid = True
                else:
                    if file_type == "fabs":
                        # derive the unique FABS key from its component fields,
                        # substituting '-none-' for any empty component
                        record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + \
                            "_" + (record['awarding_sub_tier_agency_c'] or '-none-') + \
                            "_" + (record['fain'] or '-none-') + "_" + \
                            (record['uri'] or '-none-')
                    passed_validations, failures, valid = Validator.validate(record, csv_schema,
                                                                             file_type == "fabs",
                                                                             required_list, type_list)
                if valid:
                    # todo: update this logic later when we have actual validations
                    if file_type == "fabs":
                        record["is_valid"] = True

                    model_instance = model(job_id=job_id, submission_id=submission_id,
                                           valid_record=passed_validations, **record)
                    skip_row = not insert_staging_model(model_instance, job, error_csv, error_list)
                    if flex_cols:
                        sess.add_all(flex_cols)
                        sess.commit()

                    if skip_row:
                        error_rows.append(row_number)
                        continue

                if not passed_validations:
                    fatal = write_errors(failures, job, self.short_to_long_dict[job.file_type_id], error_csv,
                                         warning_csv, row_number, error_list, flex_cols)
                    if fatal:
                        error_rows.append(row_number)

            loading_duration = (datetime.now() - loading_start).total_seconds()
            logger.info({
                'message': 'Completed data loading {}'.format(log_str),
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job_id,
                'file_type': file_type,
                'action': 'data_loading',
                'status': 'finish',
                'start_time': loading_start,
                'end_time': datetime.now(),
                'duration': loading_duration,
                'total_rows': row_number
            })

            if file_type in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sql_error_rows = self.run_sql_validations(job, file_type, self.short_to_long_dict[job.file_type_id],
                                                      error_csv, warning_csv, row_number, error_list)
            error_rows.extend(sql_error_rows)

        # stream file to S3 when not local
        if not self.is_local:
            # stream error file
            # NOTE(review): source opened 'rb' but smart_open opened 'w' — presumably
            # smart_open accepts bytes here; confirm
            with open(error_file_path, 'rb') as csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)), 'w')\
                        as writer:
                    while True:
                        chunk = csv_file.read(CHUNK_SIZE)
                        if not chunk:
                            break
                        writer.write(chunk)
            os.remove(error_file_path)

            # stream warning file
            with open(warning_file_path, 'rb') as warning_csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)), 'w')\
                        as warning_writer:
                    while True:
                        chunk = warning_csv_file.read(CHUNK_SIZE)
                        if not chunk:
                            break
                        warning_writer.write(chunk)
            os.remove(warning_file_path)

        # Calculate total number of rows in file
        # that passed validations
        error_rows_unique = set(error_rows)
        total_rows_excluding_header = row_number - 1
        valid_rows = total_rows_excluding_header - len(error_rows_unique)

        # Update fabs is_valid rows where applicable
        # Update submission to include action dates where applicable
        if file_type == "fabs":
            sess.query(DetachedAwardFinancialAssistance).\
                filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                       DetachedAwardFinancialAssistance.submission_id == submission_id).\
                update({"is_valid": False}, synchronize_session=False)
            sess.commit()
            min_action_date, max_action_date = get_action_dates(submission_id)
            sess.query(Submission).filter(Submission.submission_id == submission_id).\
                update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date},
                       synchronize_session=False)

        # Ensure validated rows match initial row count
        if file_row_count != row_number:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError)

        # Update job metadata
        job.number_of_rows = row_number
        job.number_of_rows_valid = valid_rows
        sess.commit()

        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)

        if file_type == "fabs":
            # set number of errors and warnings for detached submission
            populate_submission_error_info(submission_id)

        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        mark_file_complete(job_id, file_name)
    finally:
        # Ensure the files always close
        reader.close()

        validation_duration = (datetime.now() - validation_start).total_seconds()
        logger.info({
            'message': 'Completed run_validation {}'.format(log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'run_validation',
            'status': 'finish',
            'start_time': validation_start,
            'end_time': datetime.now(),
            'duration': validation_duration
        })

    return True
def run_validation(self, job):
    """ Run validations for specified job

        Performs three phases: (1) per-row formatting checks while loading the file
        into the staging table, (2) basic schema checks (required fields, lengths,
        types), and (3) SQL-based rules from the schema guidance. Errors/warnings
        are written through batch writers obtained from self.get_writer.

        Args:
            job: Job to be validated

        Returns:
            True if successful

        Raises:
            ResponseException: on unsupported file extension or when the validated
                row count does not match the initial file row count
    """
    sess = GlobalDB.db().session
    job_id = job.job_id

    error_list = ErrorInterface()

    submission_id = job.submission_id

    # row_number starts at 1 to account for the header row
    row_number = 1
    file_type = job.file_type.name
    validation_start = datetime.now()

    logger.info({
        'message': 'Beginning run_validation on submission_id: ' + str(submission_id) + ', job_id: ' +
                   str(job_id) + ', file_type: ' + file_type,
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job_id,
        'file_type': file_type,
        'action': 'run_validations',
        'status': 'start',
        'start_time': validation_start})

    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0]

    # Delete existing file level errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()

    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()

    # Clear existing flex fields for this job
    sess.query(FlexField).filter_by(job_id=job_id).delete()
    sess.commit()

    # If local, make the error report directory
    if self.isLocal and not os.path.exists(self.directory):
        os.makedirs(self.directory)

    # Get bucket name and file name
    file_name = job.filename
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']

    error_file_name = self.get_file_name(report_file_name(job.submission_id, False, job.file_type.name))
    warning_file_name = self.get_file_name(report_file_name(job.submission_id, True, job.file_type.name))

    # Create File Status object
    create_file_if_needed(job_id, file_name)

    reader = self.get_reader()

    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        file_size = S3Handler.get_file_size(file_name)
    else:
        file_size = os.path.getsize(file_name)
    job.file_size = file_size
    sess.commit()

    # Get fields for this file
    fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[file_type]).all()

    # detach field rows so later session operations don't touch them
    for field in fields:
        sess.expunge(field)

    csv_schema = {row.name_short: row for row in fields}

    try:
        extension = os.path.splitext(file_name)[1]
        # lower-case the extension so .CSV/.TXT uploads are accepted, matching the
        # behavior of the other run_validation variant in this file
        if not extension or extension.lower() not in ['.csv', '.txt']:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError)

        # Count file rows: throws a File Level Error for non-UTF8 characters
        temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8')
        file_row_count = len(list(csv.reader(temp_file)))
        try:
            temp_file.close()
        except AttributeError:
            # File does not exist, and so does not need to be closed
            pass

        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(region_name, bucket_name, file_name, fields, bucket_name, error_file_name,
                         self.long_to_short_dict, is_local=self.isLocal)

        # list to keep track of rows that fail validations
        error_rows = []

        # While not done, pull one row and put it into staging table if it passes
        # the Validator
        loading_start = datetime.now()
        logger.info({
            'message': 'Beginning data loading on submission_id: ' + str(submission_id) + ', job_id: ' +
                       str(job_id) + ', file_type: ' + file_type,
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'data_loading',
            'status': 'start',
            'start_time': loading_start})

        with self.get_writer(region_name, bucket_name, error_file_name, self.reportHeaders) as writer, \
                self.get_writer(region_name, bucket_name, warning_file_name, self.reportHeaders) as warning_writer:
            while not reader.is_finished:
                row_number += 1

                # periodic progress logging
                if row_number % 100 == 0:
                    elapsed_time = (datetime.now() - loading_start).total_seconds()
                    logger.info({
                        'message': 'Loading row: ' + str(row_number) + ' on submission_id: ' +
                                   str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
                        'message_type': 'ValidatorInfo',
                        'submission_id': submission_id,
                        'job_id': job_id,
                        'file_type': file_type,
                        'action': 'data_loading',
                        'status': 'loading',
                        'rows_loaded': row_number,
                        'start_time': loading_start,
                        'elapsed_time': elapsed_time})
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \
                    self.read_record(reader, writer, row_number, job, fields, error_list)
                if reduceRow:
                    row_number -= 1
                if rowErrorHere:
                    error_rows.append(row_number)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skip_row:
                    # Do not write this row to staging, but continue processing future rows
                    continue

                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                # validations, so these validations are not repeated here
                if file_type in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passed_validations = True
                    valid = True
                else:
                    if file_type in ["detached_award"]:
                        # derive the unique key from its component fields, substituting
                        # '-none-' for any empty component
                        record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + \
                            (record['awarding_sub_tier_agency_c'] or '-none-') + \
                            (record['fain'] or '-none-') + (record['uri'] or '-none-')
                    passed_validations, failures, valid = Validator.validate(record, csv_schema,
                                                                             file_type in ["detached_award"])
                if valid:
                    # todo: update this logic later when we have actual validations
                    if file_type in ["detached_award"]:
                        record["is_valid"] = True
                    model_instance = model(job_id=job_id, submission_id=submission_id,
                                           valid_record=passed_validations, **record)
                    skip_row = not insert_staging_model(model_instance, job, writer, error_list)
                    if flex_cols:
                        sess.add_all(flex_cols)
                        sess.commit()
                    if skip_row:
                        error_rows.append(row_number)
                        continue

                if not passed_validations:
                    fatal = write_errors(failures, job, self.short_to_long_dict, writer, warning_writer,
                                         row_number, error_list)
                    if fatal:
                        error_rows.append(row_number)

            loading_duration = (datetime.now() - loading_start).total_seconds()
            logger.info({
                'message': 'Completed data loading on submission_id: ' + str(submission_id) + ', job_id: ' +
                           str(job_id) + ', file_type: ' + file_type,
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job_id,
                'file_type': file_type,
                'action': 'data_loading',
                'status': 'finish',
                'start_time': loading_start,
                'end_time': datetime.now(),
                'duration': loading_duration,
                'total_rows': row_number
            })

            if file_type in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sql_error_rows = self.run_sql_validations(job, file_type, self.short_to_long_dict, writer,
                                                      warning_writer, row_number, error_list)
            error_rows.extend(sql_error_rows)

            # Write unfinished batch
            writer.finish_batch()
            warning_writer.finish_batch()

        # Calculate total number of rows in file
        # that passed validations
        error_rows_unique = set(error_rows)
        total_rows_excluding_header = row_number - 1
        valid_rows = total_rows_excluding_header - len(error_rows_unique)

        # Update detached_award is_valid rows where applicable
        # Update submission to include action dates where applicable
        if file_type in ["detached_award"]:
            sess.query(DetachedAwardFinancialAssistance).\
                filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                       DetachedAwardFinancialAssistance.submission_id == submission_id).\
                update({"is_valid": False}, synchronize_session=False)
            sess.commit()
            min_action_date, max_action_date = get_action_dates(submission_id)
            sess.query(Submission).filter(Submission.submission_id == submission_id).\
                update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date},
                       synchronize_session=False)

        # Ensure validated rows match initial row count
        if file_row_count != row_number:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError)

        # Update job metadata
        job.number_of_rows = row_number
        job.number_of_rows_valid = valid_rows
        sess.commit()

        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)

        if file_type in ["detached_award"]:
            # set number of errors and warnings for detached submission
            populate_submission_error_info(submission_id)

        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        mark_file_complete(job_id, file_name)
    finally:
        # Ensure the file always closes
        reader.close()
        validation_duration = (datetime.now() - validation_start).total_seconds()
        logger.info({
            'message': 'Completed run_validation on submission_id: ' + str(submission_id) + ', job_id: ' +
                       str(job_id) + ', file_type: ' + file_type,
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'run_validation',
            'status': 'finish',
            'start_time': validation_start,
            'end_time': datetime.now(),
            'duration': validation_duration
        })

    return True
def run_cross_validation(self, job):
    """ Cross file validation job. Test all rules with matching rule_timing.
        Run each cross-file rule and create error report.

        For every cross-file pair, the matching cross-file SQL rules are executed and
        each resulting failure is routed to the fatal or warning report (via batch
        writers from self.get_writer) and recorded in the error list, which is then
        flushed to the database.

        Args:
            job: Current job (provides job_id and submission_id)
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    # Create File Status object
    create_file_if_needed(job_id)
    # Create list of errors
    error_list = ErrorInterface()

    submission_id = job.submission_id
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']
    job_start = datetime.now()
    logger.info({
        'message': 'Beginning cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job.job_id,
        'action': 'run_cross_validations',
        'start': job_start,
        'status': 'start'})

    # Delete existing cross file errors for this submission so reruns start clean
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()

    # get all cross file rules from db (base query; filtered per file pair below)
    cross_file_rules = sess.query(RuleSql).filter_by(rule_cross_file_flag=True)

    # for each cross-file combo, run associated rules and create error report
    for c in get_cross_file_pairs():
        first_file = c[0]
        second_file = c[1]
        # rules apply in either direction between the two files of the pair
        combo_rules = cross_file_rules.filter(or_(and_(RuleSql.file_id == first_file.id,
                                                       RuleSql.target_file_id == second_file.id),
                                              and_(RuleSql.file_id == second_file.id,
                                                   RuleSql.target_file_id == first_file.id)))
        # send comboRules to validator.crossValidate sql
        failures = cross_validate_sql(combo_rules.all(), submission_id, self.short_to_long_dict, first_file.id,
                                      second_file.id, job)
        # get error file name
        report_filename = self.get_file_name(report_file_name(submission_id, False, first_file.name,
                                                              second_file.name))
        warning_report_filename = self.get_file_name(report_file_name(submission_id, True, first_file.name,
                                                                      second_file.name))

        # loop through failures to create the error report
        with self.get_writer(region_name, bucket_name, report_filename, self.crossFileReportHeaders) as writer, \
                self.get_writer(region_name, bucket_name, warning_report_filename,
                                self.crossFileReportHeaders) as warning_writer:
            for failure in failures:
                # failure is a positional tuple from cross_validate_sql: index 9 holds
                # the severity id and [0:7] the report columns — presumably matching
                # crossFileReportHeaders order; verify against cross_validate_sql
                if failure[9] == RULE_SEVERITY_DICT['fatal']:
                    writer.write(failure[0:7])
                if failure[9] == RULE_SEVERITY_DICT['warning']:
                    warning_writer.write(failure[0:7])
                error_list.record_row_error(job_id, "cross_file", failure[0], failure[3], failure[5],
                                            failure[6], failure[7], failure[8], severity_id=failure[9])
            # write the last unfinished batch
            writer.finish_batch()
            warning_writer.finish_batch()

    # write all recorded errors to database
    error_list.write_all_row_errors(job_id)
    # Update error info for submission
    populate_job_error_info(job)
    # mark job status as "finished"
    mark_job_status(job_id, "finished")
    job_duration = (datetime.now() - job_start).total_seconds()
    logger.info({
        'message': 'Completed cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job.job_id,
        'action': 'run_cross_validations',
        'status': 'finish',
        'start': job_start,
        'duration': job_duration})

    # set number of errors and warnings for submission.
    submission = populate_submission_error_info(submission_id)

    # TODO: Remove temporary step below
    # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their
    # submissions as publishable
    # Publish only if no errors are present
    if submission.number_of_errors == 0:
        submission.publishable = True
    sess.commit()

    # Mark validation complete
    mark_file_complete(job_id)
def load_file_data(self, sess, bucket_name, region_name):
    """ Loads in the file data and performs initial validations

        Scans the raw file for short/long rows, writes report headers plus any
        formatting errors to the error/warning report files, then loads the data
        into the database in pandas chunks via process_data_chunk.

        Args:
            sess: the database connection
            bucket_name: the bucket to pull the file
            region_name: the region to pull the file

        Returns:
            the row count of the scanned file (header included)

        Raises:
            ResponseException: on unsupported file extension or when the total
                processed rows don't match the initial scan count
    """
    loading_start = datetime.now()
    logger.info({
        'message': 'Beginning data loading {}'.format(self.log_str),
        'message_type': 'ValidatorInfo',
        'submission_id': self.submission_id,
        'job_id': self.job.job_id,
        'file_type': self.file_type.name,
        'action': 'data_loading',
        'status': 'start',
        'start_time': loading_start
    })

    # Extension Check (case-insensitive; only .csv/.txt are accepted)
    extension = os.path.splitext(self.file_name)[1]
    if not extension or extension.lower() not in ['.csv', '.txt']:
        raise ResponseException('', StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError)

    # Base file check: records rows that have too few/too many cells
    file_row_count, self.short_rows, self.long_rows = simple_file_scan(self.reader, bucket_name, region_name,
                                                                       self.file_name)
    # total_rows = header + long_rows (and will be added on per chunk)
    # Note: we're adding long_rows here because pandas will exclude long_rows when we're loading the data
    self.total_rows = 1 + len(self.long_rows)

    # Making base error/warning files (paths also saved on self for later stages)
    self.error_file_name = report_file_name(self.submission_id, False, self.file_type.name)
    self.error_file_path = ''.join([CONFIG_SERVICES['error_report_path'], self.error_file_name])
    self.warning_file_name = report_file_name(self.submission_id, True, self.file_type.name)
    self.warning_file_path = ''.join([CONFIG_SERVICES['error_report_path'], self.warning_file_name])

    # Write only the header row to each report here; later writes append
    with open(self.error_file_path, 'w', newline='') as error_file, \
            open(self.warning_file_path, 'w', newline='') as warning_file:
        error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        error_csv.writerow(self.report_headers)
        warning_csv.writerow(self.report_headers)

    # Adding formatting errors to error file (append mode, header already written)
    format_error_df = process_formatting_errors(self.short_rows, self.long_rows, self.report_headers)
    format_error_df.to_csv(self.error_file_path, columns=self.report_headers, index=False, quoting=csv.QUOTE_ALL,
                           mode='a', header=False)

    # Finally open the file for loading into the database with baseline validations
    self.reader.open_file(region_name, bucket_name, self.file_name, self.fields, bucket_name,
                          self.get_file_name(self.error_file_name),
                          self.daims_to_short_dict[self.file_type.file_type_id],
                          self.short_to_daims_dict[self.file_type.file_type_id],
                          is_local=self.is_local)

    # Going back to reprocess the header row
    self.reader.file.seek(0)
    # error_bad_lines/warn_bad_lines=False: silently drop malformed (long) rows —
    # those were already captured by simple_file_scan above
    reader_obj = pd.read_csv(self.reader.file, dtype=str, delimiter=self.reader.delimiter,
                             error_bad_lines=False, keep_default_na=False, chunksize=CHUNK_SIZE,
                             warn_bad_lines=False)
    # process the file in CHUNK_SIZE chunks to bound memory use
    for chunk_df in reader_obj:
        self.process_data_chunk(sess, chunk_df)

    # Ensure validated rows match initial row count
    if file_row_count != self.total_rows:
        raise ResponseException('', StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError)

    loading_duration = (datetime.now() - loading_start).total_seconds()
    logger.info({
        'message': 'Completed data loading {}'.format(self.log_str),
        'message_type': 'ValidatorInfo',
        'submission_id': self.submission_id,
        'job_id': self.job.job_id,
        'file_type': self.file_type.name,
        'action': 'data_loading',
        'status': 'finish',
        'start_time': loading_start,
        'end_time': datetime.now(),
        'duration': loading_duration,
        'total_rows': self.total_rows
    })
    return file_row_count