def test_schema_optional_field(self):
    """Optional fields may be blank or loosely typed without failing validation."""
    schema = self.schema
    record = {
        "test1": "hello",
        "test2": "1.0",
        "test3": "YES",
        "test4": "1",
        "test5": "1",
    }
    # Each step mutates the record in place; validation must succeed after
    # every one of them (this schema version treats these fields leniently).
    mutation_steps = [
        {},                            # baseline, fully-populated record
        {"test5": ""},                 # optional field left blank
        {"test5": "s"},                # optional field holding a non-numeric value
        {"test5": "", "test3": ""},    # two fields blanked at once
    ]
    for changes in mutation_steps:
        record.update(changes)
        self.assertTrue(Validator.validate(record, schema))
def test_schema_optional_field(self):
    """Optional fields: blank values pass, malformed values fail validation."""
    schema = self.schema
    interfaces = self.interfaces
    record = {
        "test1": "hello",
        "test2": "1.0",
        "test3": "YES",
        "test4": "1",
        "test5": "1",
    }
    # (record mutation, expected validity) pairs, applied cumulatively.
    cases = [
        ({}, True),                            # fully-populated record validates
        ({"test5": ""}, True),                 # blank optional field is fine
        ({"test5": "s"}, False),               # malformed optional value fails
        ({"test5": "", "test3": ""}, False),   # blanking test3 fails validation
    ]
    for changes, expected in cases:
        record.update(changes)
        assertion = self.assertTrue if expected else self.assertFalse
        # Validator.validate returns a tuple; element 0 is the pass/fail flag.
        assertion(Validator.validate(record, [], schema, "award", interfaces)[0])
def test_schema_optional_field(self):
    """Optional-field handling: blank is allowed, malformed content is rejected."""
    schema = self.schema
    interfaces = self.interfaces

    def is_valid(rec):
        # Validator.validate returns a tuple; element 0 is the pass/fail flag.
        return Validator.validate(rec, [], schema, "award", interfaces)[0]

    record = {
        "test1": "hello",
        "test2": "1.0",
        "test3": "YES",
        "test4": "1",
        "test5": "1",
    }
    # Fully populated record validates.
    self.assertTrue(is_valid(record))
    # A blank optional field is still acceptable.
    record["test5"] = ""
    self.assertTrue(is_valid(record))
    # A type-violating value in an optional field fails.
    record["test5"] = "s"
    self.assertFalse(is_valid(record))
    # Blanking test3 (with test5 blank again) also fails validation.
    record["test5"] = ""
    record["test3"] = ""
    self.assertFalse(is_valid(record))
def run_validation(self, job):
    """ Run all validations for the specified job.

    Streams the submitted file row by row in three phases: (1) read each
    record and log formatting errors, (2) apply basic schema checks
    (required fields, lengths, types) and insert passing rows into the
    staging model, (3) run the SQL-based rule validations. Error and
    warning reports are written as local CSV files and streamed to S3
    when not running locally.

    Args:
        job: Job to be validated

    Returns:
        True if successful

    Raises:
        ResponseException: for an unsupported file extension, or when the
            validated row count does not match the file's raw row count
    """
    sess = GlobalDB.db().session
    error_list = ErrorInterface()
    job_id = job.job_id
    submission_id = job.submission_id
    # Starts at 1 so the header row is counted; decremented whenever a
    # read must be "un-counted" (e.g. trailing blank line).
    row_number = 1
    file_type = job.file_type.name
    validation_start = datetime.now()
    log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format(
        str(submission_id), str(job_id), file_type)
    logger.info({
        'message': 'Beginning run_validation {}'.format(log_str),
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job_id,
        'file_type': file_type,
        'action': 'run_validations',
        'status': 'start',
        'start_time': validation_start
    })
    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0]
    # Delete existing file level errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()
    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()
    # Clear existing flex fields for this job
    sess.query(FlexField).filter_by(job_id=job_id).delete()
    sess.commit()
    # If local, make the error report directory
    if self.is_local and not os.path.exists(self.directory):
        os.makedirs(self.directory)
    # Get bucket name and file name
    file_name = job.filename
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']
    error_file_name = report_file_name(job.submission_id, False, job.file_type.name)
    error_file_path = "".join([CONFIG_SERVICES['error_report_path'], error_file_name])
    warning_file_name = report_file_name(job.submission_id, True, job.file_type.name)
    warning_file_path = "".join([CONFIG_SERVICES['error_report_path'], warning_file_name])
    # Create File Status object
    create_file_if_needed(job_id, file_name)
    reader = CsvReader()
    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        file_size = S3Handler.get_file_size(file_name)
    else:
        file_size = os.path.getsize(file_name)
    job.file_size = file_size
    sess.commit()
    # Get fields for this file
    fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[file_type]).all()
    # Detach the field rows so later session operations cannot touch them.
    for field in fields:
        sess.expunge(field)
    csv_schema = {row.name_short: row for row in fields}
    try:
        extension = os.path.splitext(file_name)[1]
        if not extension or extension.lower() not in ['.csv', '.txt']:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError)
        # Count file rows: throws a File Level Error for non-UTF8 characters
        # NOTE(review): this materializes the whole file in memory to count
        # rows — acceptable for expected file sizes, presumably.
        temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8')
        file_row_count = len(list(csv.reader(temp_file)))
        try:
            temp_file.close()
        except AttributeError:
            # File does not exist, and so does not need to be closed
            pass
        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(region_name, bucket_name, file_name, fields, bucket_name,
                         self.get_file_name(error_file_name),
                         self.long_to_short_dict[job.file_type_id], is_local=self.is_local)
        # list to keep track of rows that fail validations
        error_rows = []
        # While not done, pull one row and put it into staging table if it passes
        # the Validator
        loading_start = datetime.now()
        logger.info({
            'message': 'Beginning data loading {}'.format(log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'data_loading',
            'status': 'start',
            'start_time': loading_start
        })
        with open(error_file_path, 'w', newline='') as error_file,\
                open(warning_file_path, 'w', newline='') as warning_file:
            error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            required_list = None
            type_list = None
            if file_type == "fabs":
                # create a list of all required/type labels for FABS
                labels = sess.query(ValidationLabel).all()
                required_list = {}
                type_list = {}
                for label in labels:
                    if label.label_type == "requirement":
                        required_list[label.column_name] = label.label
                    else:
                        type_list[label.column_name] = label.label
            # write headers to file
            error_csv.writerow(self.reportHeaders)
            warning_csv.writerow(self.reportHeaders)
            while not reader.is_finished:
                row_number += 1
                if row_number % 100 == 0:
                    elapsed_time = (datetime.now() - loading_start).total_seconds()
                    logger.info({
                        'message': 'Loading row: {} {}'.format(str(row_number), log_str),
                        'message_type': 'ValidatorInfo',
                        'submission_id': submission_id,
                        'job_id': job_id,
                        'file_type': file_type,
                        'action': 'data_loading',
                        'status': 'loading',
                        'rows_loaded': row_number,
                        'start_time': loading_start,
                        'elapsed_time': elapsed_time
                    })
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \
                    self.read_record(reader, error_csv, row_number, job, fields, error_list)
                if reduceRow:
                    row_number -= 1
                if rowErrorHere:
                    error_rows.append(row_number)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skip_row:
                    # Do not write this row to staging, but continue processing future rows
                    continue
                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                # validations, so these validations are not repeated here
                if file_type in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passed_validations = True
                    valid = True
                else:
                    if file_type == "fabs":
                        # Derived unique key for FABS rows; '-none-' stands in
                        # for any missing component so the key is always built.
                        record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + "_" +\
                            (record['awarding_sub_tier_agency_c'] or '-none-') + \
                            "_" + (record['fain'] or '-none-') + "_" + \
                            (record['uri'] or '-none-')
                    passed_validations, failures, valid = Validator.validate(
                        record, csv_schema, file_type == "fabs",
                        required_list, type_list)
                if valid:
                    # todo: update this logic later when we have actual validations
                    if file_type == "fabs":
                        record["is_valid"] = True
                    model_instance = model(job_id=job_id, submission_id=submission_id,
                                           valid_record=passed_validations, **record)
                    skip_row = not insert_staging_model(model_instance, job, error_csv, error_list)
                    if flex_cols:
                        sess.add_all(flex_cols)
                        sess.commit()
                    if skip_row:
                        error_rows.append(row_number)
                        continue
                if not passed_validations:
                    fatal = write_errors(failures, job, self.short_to_long_dict[job.file_type_id],
                                         error_csv, warning_csv, row_number, error_list, flex_cols)
                    if fatal:
                        error_rows.append(row_number)
            loading_duration = (datetime.now() - loading_start).total_seconds()
            logger.info({
                'message': 'Completed data loading {}'.format(log_str),
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job_id,
                'file_type': file_type,
                'action': 'data_loading',
                'status': 'finish',
                'start_time': loading_start,
                'end_time': datetime.now(),
                'duration': loading_duration,
                'total_rows': row_number
            })
            if file_type in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sql_error_rows = self.run_sql_validations(job, file_type, self.short_to_long_dict[job.file_type_id],
                                                     error_csv, warning_csv, row_number, error_list)
            error_rows.extend(sql_error_rows)
            # Explicit close so the reports are flushed before streaming;
            # the with-block exit re-close is a harmless no-op.
            error_file.close()
            warning_file.close()
        # stream file to S3 when not local
        if not self.is_local:
            # stream error file
            with open(error_file_path, 'rb') as csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)), 'w')\
                        as writer:
                    while True:
                        chunk = csv_file.read(CHUNK_SIZE)
                        if chunk:
                            writer.write(chunk)
                        else:
                            break
            csv_file.close()
            os.remove(error_file_path)
            # stream warning file
            with open(warning_file_path, 'rb') as warning_csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)), 'w')\
                        as warning_writer:
                    while True:
                        chunk = warning_csv_file.read(CHUNK_SIZE)
                        if chunk:
                            warning_writer.write(chunk)
                        else:
                            break
            warning_csv_file.close()
            os.remove(warning_file_path)
        # Calculate total number of rows in file
        # that passed validations
        error_rows_unique = set(error_rows)
        total_rows_excluding_header = row_number - 1
        valid_rows = total_rows_excluding_header - len(error_rows_unique)
        # Update fabs is_valid rows where applicable
        # Update submission to include action dates where applicable
        if file_type == "fabs":
            sess.query(DetachedAwardFinancialAssistance).\
                filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                       DetachedAwardFinancialAssistance.submission_id == submission_id).\
                update({"is_valid": False}, synchronize_session=False)
            sess.commit()
            min_action_date, max_action_date = get_action_dates(submission_id)
            sess.query(Submission).filter(Submission.submission_id == submission_id).\
                update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date},
                       synchronize_session=False)
        # Ensure validated rows match initial row count
        if file_row_count != row_number:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError)
        # Update job metadata
        job.number_of_rows = row_number
        job.number_of_rows_valid = valid_rows
        sess.commit()
        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)
        if file_type == "fabs":
            # set number of errors and warnings for detached submission
            populate_submission_error_info(submission_id)
        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        mark_file_complete(job_id, file_name)
    finally:
        # Ensure the files always close
        reader.close()
    validation_duration = (datetime.now() - validation_start).total_seconds()
    logger.info({
        'message': 'Completed run_validation {}'.format(log_str),
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job_id,
        'file_type': file_type,
        'action': 'run_validation',
        'status': 'finish',
        'start_time': validation_start,
        'end_time': datetime.now(),
        'duration': validation_duration
    })
    return True
def run_validation(self, job):
    """ Run all validations for the specified job.

    Streams the submitted file row by row in three phases: (1) read each
    record and log formatting errors, (2) apply basic schema checks and
    insert passing rows into the staging model, (3) run SQL-based rule
    validations. Error and warning reports are emitted through writers
    obtained from self.get_writer.

    Args:
        job: Job to be validated

    Returns:
        True if successful

    Raises:
        ResponseException: for an unsupported file extension, or when the
            validated row count does not match the file's raw row count
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    error_list = ErrorInterface()
    submission_id = job.submission_id
    # Starts at 1 so the header row is counted; decremented whenever a
    # read must be "un-counted" (e.g. trailing blank line).
    row_number = 1
    file_type = job.file_type.name
    validation_start = datetime.now()
    logger.info({
        'message': 'Beginning run_validation on submission_id: ' + str(submission_id) +
                   ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job_id,
        'file_type': file_type,
        'action': 'run_validations',
        'status': 'start',
        'start_time': validation_start})
    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0]
    # Delete existing file level errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()
    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()
    # Clear existing flex fields for this job
    sess.query(FlexField).filter_by(job_id=job_id).delete()
    sess.commit()
    # If local, make the error report directory
    if self.isLocal and not os.path.exists(self.directory):
        os.makedirs(self.directory)
    # Get bucket name and file name
    file_name = job.filename
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']
    error_file_name = self.get_file_name(report_file_name(job.submission_id, False, job.file_type.name))
    warning_file_name = self.get_file_name(report_file_name(job.submission_id, True, job.file_type.name))
    # Create File Status object
    create_file_if_needed(job_id, file_name)
    reader = self.get_reader()
    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        file_size = S3Handler.get_file_size(file_name)
    else:
        file_size = os.path.getsize(file_name)
    job.file_size = file_size
    sess.commit()
    # Get fields for this file
    fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[file_type]).all()
    # Detach the field rows so later session operations cannot touch them.
    for field in fields:
        sess.expunge(field)
    csv_schema = {row.name_short: row for row in fields}
    try:
        extension = os.path.splitext(file_name)[1]
        # FIX: compare the extension case-insensitively so '.CSV'/'.TXT'
        # uploads are accepted (consistent with the other run_validation
        # variant in this file, which already lowercases).
        if not extension or extension.lower() not in ['.csv', '.txt']:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError)
        # Count file rows: throws a File Level Error for non-UTF8 characters
        temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8')
        file_row_count = len(list(csv.reader(temp_file)))
        try:
            temp_file.close()
        except AttributeError:
            # File does not exist, and so does not need to be closed
            pass
        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(region_name, bucket_name, file_name, fields, bucket_name,
                         error_file_name, self.long_to_short_dict, is_local=self.isLocal)
        # list to keep track of rows that fail validations
        error_rows = []
        # While not done, pull one row and put it into staging table if it passes
        # the Validator
        loading_start = datetime.now()
        logger.info({
            'message': 'Beginning data loading on submission_id: ' + str(submission_id) +
                       ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'data_loading',
            'status': 'start',
            'start_time': loading_start})
        with self.get_writer(region_name, bucket_name, error_file_name, self.reportHeaders) as writer, \
                self.get_writer(region_name, bucket_name, warning_file_name, self.reportHeaders) as warning_writer:
            while not reader.is_finished:
                row_number += 1
                if row_number % 100 == 0:
                    elapsed_time = (datetime.now() - loading_start).total_seconds()
                    logger.info({
                        'message': 'Loading row: ' + str(row_number) + ' on submission_id: ' +
                                   str(submission_id) + ', job_id: ' + str(job_id) +
                                   ', file_type: ' + file_type,
                        'message_type': 'ValidatorInfo',
                        'submission_id': submission_id,
                        'job_id': job_id,
                        'file_type': file_type,
                        'action': 'data_loading',
                        'status': 'loading',
                        'rows_loaded': row_number,
                        'start_time': loading_start,
                        'elapsed_time': elapsed_time})
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \
                    self.read_record(reader, writer, row_number, job, fields, error_list)
                if reduceRow:
                    row_number -= 1
                if rowErrorHere:
                    error_rows.append(row_number)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skip_row:
                    # Do not write this row to staging, but continue processing future rows
                    continue
                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                # validations, so these validations are not repeated here
                if file_type in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passed_validations = True
                    valid = True
                else:
                    if file_type in ["detached_award"]:
                        # Derived unique key for detached awards; '-none-'
                        # stands in for any missing component.
                        record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + \
                            (record['awarding_sub_tier_agency_c'] or '-none-') + \
                            (record['fain'] or '-none-') + (record['uri'] or '-none-')
                    passed_validations, failures, valid = Validator.validate(record, csv_schema,
                                                                             file_type in ["detached_award"])
                if valid:
                    # todo: update this logic later when we have actual validations
                    if file_type in ["detached_award"]:
                        record["is_valid"] = True
                    model_instance = model(job_id=job_id, submission_id=submission_id,
                                           valid_record=passed_validations, **record)
                    skip_row = not insert_staging_model(model_instance, job, writer, error_list)
                    if flex_cols:
                        sess.add_all(flex_cols)
                        sess.commit()
                    if skip_row:
                        error_rows.append(row_number)
                        continue
                if not passed_validations:
                    fatal = write_errors(failures, job, self.short_to_long_dict, writer,
                                         warning_writer, row_number, error_list)
                    if fatal:
                        error_rows.append(row_number)
            loading_duration = (datetime.now() - loading_start).total_seconds()
            logger.info({
                'message': 'Completed data loading on submission_id: ' + str(submission_id) +
                           ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job_id,
                'file_type': file_type,
                'action': 'data_loading',
                'status': 'finish',
                'start_time': loading_start,
                'end_time': datetime.now(),
                'duration': loading_duration,
                'total_rows': row_number
            })
            if file_type in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sql_error_rows = self.run_sql_validations(job, file_type, self.short_to_long_dict, writer,
                                                     warning_writer, row_number, error_list)
            error_rows.extend(sql_error_rows)
            # Write unfinished batch
            writer.finish_batch()
            warning_writer.finish_batch()
        # Calculate total number of rows in file
        # that passed validations
        error_rows_unique = set(error_rows)
        total_rows_excluding_header = row_number - 1
        valid_rows = total_rows_excluding_header - len(error_rows_unique)
        # Update detached_award is_valid rows where applicable
        # Update submission to include action dates where applicable
        if file_type in ["detached_award"]:
            sess.query(DetachedAwardFinancialAssistance).\
                filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                       DetachedAwardFinancialAssistance.submission_id == submission_id).\
                update({"is_valid": False}, synchronize_session=False)
            sess.commit()
            min_action_date, max_action_date = get_action_dates(submission_id)
            sess.query(Submission).filter(Submission.submission_id == submission_id).\
                update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date},
                       synchronize_session=False)
        # Ensure validated rows match initial row count
        if file_row_count != row_number:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError)
        # Update job metadata
        job.number_of_rows = row_number
        job.number_of_rows_valid = valid_rows
        sess.commit()
        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)
        if file_type in ["detached_award"]:
            # set number of errors and warnings for detached submission
            populate_submission_error_info(submission_id)
        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        mark_file_complete(job_id, file_name)
    finally:
        # Ensure the file always closes
        reader.close()
    validation_duration = (datetime.now() - validation_start).total_seconds()
    logger.info({
        'message': 'Completed run_validation on submission_id: ' + str(submission_id) +
                   ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job_id,
        'file_type': file_type,
        'action': 'run_validation',
        'status': 'finish',
        'start_time': validation_start,
        'end_time': datetime.now(),
        'duration': validation_duration
    })
    return True
def runValidation(self, jobId, interfaces):
    """ Run validations for specified job.

    Reads the submitted file record by record, cleans each row, runs
    schema/rule validations, inserts passing rows into a staging table,
    and writes failures to the error report.

    Args:
        jobId: Job to be validated
        interfaces: bundle of DB interfaces (jobDb, errorDb, validationDb,
            stagingDb) used throughout the run

    Returns:
        True if successful
    """
    jobTracker = interfaces.jobDb
    # Starts at 1 so the header row is counted; decremented again when a
    # trailing blank line must be "un-counted".
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # If local, make the error report directory
    if(self.isLocal and not os.path.exists(self.directory)):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(jobTracker.getReportPath(jobId))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId, fileName)
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType)
    rules = validationDB.getRulesByFile(fileType)
    reader = self.getReader()
    # Get file size and write to jobs table
    if(CONFIG_BROKER["use_aws"]):
        fileSize = s3UrlHandler.getFileSize("errors/" + jobTracker.getReportPath(jobId))
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    try:
        # Pull file
        reader.openFile(regionName, bucketName, fileName, fieldList, bucketName, errorFileName)
        # Create staging table
        tableName = interfaces.stagingDb.getTableName(jobId)
        # Create staging table
        tableObject = StagingTable(interfaces)
        tableObject.createTable(fileType, fileName, jobId, tableName)
        errorInterface = interfaces.errorDb
        # While not done, pull one row and put it into staging if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer:
            while(not reader.isFinished):
                rowNumber += 1
                #if (rowNumber % 1000) == 0:
                #    print("Validating row " + str(rowNumber))
                try:
                    record = FieldCleaner.cleanRow(reader.getNextRecord(), fileType, validationDB)
                    record["row"] = rowNumber
                    if(reader.isFinished and len(record) < 2):
                        # This is the last line and is empty, don't record an error
                        rowNumber -= 1  # Don't count this row
                        break
                except ResponseException as e:
                    if reader.isFinished and reader.extraLine:
                        # Last line may be blank, don't record an error; reader.extraLine indicates
                        # a case where the last valid line has extra line breaks
                        # Don't count last row if empty
                        rowNumber -= 1
                    else:
                        writer.write(["Formatting Error", ValidationError.readErrorMsg, str(rowNumber), ""])
                        errorInterface.recordRowError(jobId, self.filename, "Formatting Error",
                                                      ValidationError.readError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                    continue
                valid, failures = Validator.validate(record, rules, csvSchema, fileType, interfaces)
                if(valid):
                    try:
                        tableObject.insert(record, fileType)
                    except ResponseException as e:
                        # Write failed, move to next record
                        writer.write(["Formatting Error", ValidationError.writeErrorMsg, str(rowNumber), ""])
                        errorInterface.recordRowError(jobId, self.filename, "Formatting Error",
                                                      ValidationError.writeError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                        continue
                else:
                    # For each failure, record it in error report and metadata
                    if failures:
                        errorInterface.setRowErrorsPresent(jobId, True)
                    for failure in failures:
                        fieldName = failure[0]
                        error = failure[1]
                        failedValue = failure[2]
                        try:
                            # If error is an int, it's one of our prestored messages
                            errorType = int(error)
                            errorMsg = ValidationError.getErrorMessage(errorType)
                        except ValueError:
                            # If not, treat it literally
                            errorMsg = error
                        writer.write([fieldName, errorMsg, str(rowNumber), failedValue])
                        errorInterface.recordRowError(jobId, self.filename, fieldName, error, rowNumber)
            # Write unfinished batch
            writer.finishBatch()
        # Write number of rows to job table
        jobTracker.setNumberOfRowsById(jobId, rowNumber)
        # Write leftover records
        tableObject.endBatch()
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId, "finished")
        errorInterface.writeAllRowErrors(jobId)
    finally:
        # ensure the file always closes
        reader.close()
    return True
def runValidation(self, jobId, interfaces):
    """ Run validations for specified job.

    Same pipeline as the sibling implementation above: clean each record,
    validate it against the schema rules, stage passing rows, and report
    failures row by row.

    Args:
        jobId: Job to be validated
        interfaces: bundle of DB interfaces (jobDb, errorDb, validationDb,
            stagingDb) used throughout the run

    Returns:
        True if successful
    """
    jobTracker = interfaces.jobDb
    # Header row is counted in rowNumber; trailing blank lines decrement it.
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # If local, make the error report directory
    if (self.isLocal and not os.path.exists(self.directory)):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(jobTracker.getReportPath(jobId))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId, fileName)
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType)
    rules = validationDB.getRulesByFile(fileType)
    reader = self.getReader()
    # Get file size and write to jobs table
    if (CONFIG_BROKER["use_aws"]):
        fileSize = s3UrlHandler.getFileSize(
            "errors/" + jobTracker.getReportPath(jobId))
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    try:
        # Pull file
        reader.openFile(regionName, bucketName, fileName, fieldList,
                        bucketName, errorFileName)
        # Create staging table
        tableName = interfaces.stagingDb.getTableName(jobId)
        # Create staging table
        tableObject = StagingTable(interfaces)
        tableObject.createTable(fileType, fileName, jobId, tableName)
        errorInterface = interfaces.errorDb
        # While not done, pull one row and put it into staging if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName,
                            self.reportHeaders) as writer:
            while (not reader.isFinished):
                rowNumber += 1
                #if (rowNumber % 1000) == 0:
                #    print("Validating row " + str(rowNumber))
                try:
                    record = FieldCleaner.cleanRow(reader.getNextRecord(),
                                                   fileType, validationDB)
                    record["row"] = rowNumber
                    if (reader.isFinished and len(record) < 2):
                        # This is the last line and is empty, don't record an error
                        rowNumber -= 1  # Don't count this row
                        break
                except ResponseException as e:
                    if reader.isFinished and reader.extraLine:
                        # Last line may be blank, don't record an error; reader.extraLine
                        # indicates a case where the last valid line has extra line breaks
                        # Don't count last row if empty
                        rowNumber -= 1
                    else:
                        writer.write([
                            "Formatting Error", ValidationError.readErrorMsg,
                            str(rowNumber), ""
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, "Formatting Error",
                            ValidationError.readError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                    continue
                valid, failures = Validator.validate(
                    record, rules, csvSchema, fileType, interfaces)
                if (valid):
                    try:
                        tableObject.insert(record, fileType)
                    except ResponseException as e:
                        # Write failed, move to next record
                        writer.write([
                            "Formatting Error", ValidationError.writeErrorMsg,
                            str(rowNumber), ""
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, "Formatting Error",
                            ValidationError.writeError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                        continue
                else:
                    # For each failure, record it in error report and metadata
                    if failures:
                        errorInterface.setRowErrorsPresent(jobId, True)
                    for failure in failures:
                        fieldName = failure[0]
                        error = failure[1]
                        failedValue = failure[2]
                        try:
                            # If error is an int, it's one of our prestored messages
                            errorType = int(error)
                            errorMsg = ValidationError.getErrorMessage(
                                errorType)
                        except ValueError:
                            # If not, treat it literally
                            errorMsg = error
                        writer.write([
                            fieldName, errorMsg, str(rowNumber), failedValue
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, fieldName, error, rowNumber)
            # Write unfinished batch
            writer.finishBatch()
        # Write number of rows to job table
        jobTracker.setNumberOfRowsById(jobId, rowNumber)
        # Write leftover records
        tableObject.endBatch()
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId, "finished")
        errorInterface.writeAllRowErrors(jobId)
    finally:
        # ensure the file always closes
        reader.close()
    return True
def runValidation(self, jobId, interfaces):
    """ Run validations for specified job.

    Three-phase pipeline: (1) read each record and log formatting errors,
    (2) basic schema checks with passing rows written to staging,
    (3) SQL-based rule validations. Errors and warnings are written to
    separate report files.

    Args:
        jobId: Job to be validated
        interfaces: bundle of DB interfaces (jobDb, errorDb, validationDb,
            stagingDb) used throughout the run

    Returns:
        True if successful
    """
    sess = GlobalDB.db().session
    # get the job object here so we can call the refactored getReportPath
    # todo: replace other db access functions with job object attributes
    job = sess.query(Job).filter(Job.job_id == jobId).one()
    CloudLogger.logError("VALIDATOR_INFO: ", "Beginning runValidation on jobID: " + str(jobId), "")
    jobTracker = interfaces.jobDb
    submissionId = jobTracker.getSubmissionId(jobId)
    # Header row is counted in rowNumber; decremented when a read is un-counted.
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # Clear existing records for this submission
    interfaces.stagingDb.clearFileBySubmission(submissionId, fileType)
    # Get short to long colname dictionary
    shortColnames = interfaces.validationDb.getShortToLongColname()
    # If local, make the error report directory
    if self.isLocal and not os.path.exists(self.directory):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(getReportPath(job, 'error'))
    warningFileName = self.getFileName(getReportPath(job, 'warning'))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId, fileName)
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType, shortCols=True)
    reader = self.getReader()
    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        fileSize = s3UrlHandler.getFileSize(errorFileName)
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    fields = interfaces.validationDb.getFileColumnsByFile(fileType)
    try:
        # Pull file and return info on whether it's using short or long col headers
        reader.openFile(regionName, bucketName, fileName, fieldList,
                        bucketName, errorFileName)
        errorInterface = interfaces.errorDb
        self.longToShortDict = interfaces.validationDb.getLongToShortColname()
        # rowErrorPresent becomes true if any row error occurs, used for determining file status
        # NOTE(review): rowErrorPresent is set below but never read in this
        # version of the function — appears vestigial.
        rowErrorPresent = False
        # list to keep track of rows that fail validations
        errorRows = []
        # While not done, pull one row and put it into staging table if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer, \
                self.getWriter(regionName, bucketName, warningFileName, self.reportHeaders) as warningWriter:
            while not reader.isFinished:
                rowNumber += 1
                if (rowNumber % 100) == 0:
                    CloudLogger.logError("VALIDATOR_INFO: ",
                                         "JobId: " + str(jobId) + " loading row " + str(rowNumber), "")
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skipRow, doneReading, rowErrorHere) = \
                    self.readRecord(reader, writer, fileType, interfaces, rowNumber, jobId, fields)
                if reduceRow:
                    rowNumber -= 1
                if rowErrorHere:
                    rowErrorPresent = True
                    errorRows.append(rowNumber)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skipRow:
                    # Do not write this row to staging, but continue processing future rows
                    continue
                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic validations,
                # so these validations are not repeated here
                if fileType in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passedValidations = True
                    valid = True
                else:
                    passedValidations, failures, valid = Validator.validate(record, csvSchema)
                if valid:
                    skipRow = self.writeToStaging(record, jobId, submissionId, passedValidations,
                                                  interfaces, writer, rowNumber, fileType)
                    if skipRow:
                        errorRows.append(rowNumber)
                        continue
                if not passedValidations:
                    if self.writeErrors(failures, interfaces, jobId, shortColnames, writer,
                                        warningWriter, rowNumber):
                        errorRows.append(rowNumber)
            CloudLogger.logError("VALIDATOR_INFO: ",
                                 "Loading complete on jobID: " + str(jobId) +
                                 ". Total rows added to staging: " + str(rowNumber), "")
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sqlErrorRows = self.runSqlValidations(interfaces, jobId, fileType, shortColnames,
                                                  writer, warningWriter, rowNumber)
            errorRows.extend(sqlErrorRows)
            # Write unfinished batch
            writer.finishBatch()
            warningWriter.finishBatch()
        # Calculate total number of rows in file
        # that passed validations
        errorRowsUnique = set(errorRows)
        totalRowsExcludingHeader = rowNumber - 1
        validRows = totalRowsExcludingHeader - len(errorRowsUnique)
        # Update job metadata
        jobTracker.setJobRowcounts(jobId, rowNumber, validRows)
        errorInterface.writeAllRowErrors(jobId)
        # Update error info for submission
        jobTracker.populateSubmissionErrorInfo(submissionId)
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId, "finished")
        interfaces.errorDb.markFileComplete(jobId, self.filename)
    finally:
        # Ensure the file always closes
        reader.close()
    CloudLogger.logError("VALIDATOR_INFO: ",
                         "Completed L1 and SQL rule validations on jobID: " + str(jobId), "")
    return True
def test_schema_rules(self):
    """Test schema rules."""
    def _rule_type(type_name):
        # Build a RuleType with the given name.
        rt = RuleType()
        rt.name = type_name
        return rt

    def _rule(rule_type, column, text_1, text_2=None, timing_id=1):
        # Build a Rule; file_column and rule_text_2 are assigned only
        # when supplied, matching how the rules below are constructed.
        r = Rule()
        r.rule_type = rule_type
        if column is not None:
            r.file_column = column
        r.rule_text_1 = text_1
        if text_2 is not None:
            r.rule_text_2 = text_2
        r.rule_timing_id = timing_id
        return r

    less_type = _rule_type("LESS")
    greater_type = _rule_type("GREATER")
    length_type = _rule_type("LENGTH")
    equal_type = _rule_type("EQUAL")
    not_equal_type = _rule_type("NOT EQUAL")
    in_set_type = _rule_type("IN_SET")
    sum_type = _rule_type("SUM")
    sum_to_value_type = _rule_type("SUM_TO_VALUE")

    schema = self.schema
    interfaces = self.interfaces

    rule1 = _rule(equal_type, schema["test1"], "hello")
    rule2 = _rule(not_equal_type, schema["test1"], "bye")
    rule3 = _rule(length_type, schema["test1"], "6")
    rule4 = _rule(equal_type, schema["test3"], "YES")
    rule5 = _rule(equal_type, schema["test4"], "44")
    rule6 = _rule(less_type, schema["test4"], "45")
    rule7 = _rule(greater_type, schema["test2"], ".5")
    rule8 = _rule(in_set_type, schema["test6"], "X, F, A")
    rule9 = _rule(sum_type, schema["test2"], "test7",
                  text_2="test2,test4,test5")
    # Cross-field rule: not bound to a single column; registered in the
    # validation DB below rather than passed to validate() directly.
    rule10 = _rule(sum_to_value_type, None, "46",
                   text_2="test2,test4,test5", timing_id=4)

    vvi = ValidatorValidationInterface()
    fileId = vvi.getFileId("award")
    vvi.addRule(None, "SUM_TO_VALUE", rule10.rule_text_1,
                rule10.rule_text_2,
                "Evaluates the sum of fields to a number",
                rule10.rule_timing_id, fileId=fileId)

    rules = [rule1, rule2, rule3, rule4, rule5,
             rule6, rule7, rule8, rule9]

    # A record that satisfies every rule above.
    record = {
        "test1": "hello", "test2": "1.0", "test3": "YES",
        "test4": "44", "test5": "1", "test6": "X", "test7": "46"
    }
    self.assertTrue(Validator.validate(
        record, rules, schema, "award", interfaces)[0])

    # A record that violates each of the rules checked below.
    record = {
        "test1": "goodbye", "test2": ".4", "test3": "NO",
        "test4": "45", "test5": "1", "test6": "Q", "test7": "46.5"
    }
    # Each rule must reject the record on its own ...
    for failing_rule in (rule3, rule4, rule5, rule6, rule7, rule8, rule9):
        self.assertFalse(Validator.validate(
            record, [failing_rule], schema, "award", interfaces)[0])
    # ... and the combined rule list must reject it as well.
    self.assertFalse(Validator.validate(
        record, rules, schema, "award", interfaces)[0])
def runValidation(self, job):
    """Run all validations for the specified job.

    Streams the submitted CSV row by row: phase one records formatting
    errors while reading, phase two applies basic schema checks
    (required fields, length, data type), and phase three runs the
    SQL-based rules. Valid rows are written to the staging table and
    error/warning reports are written alongside.

    Args:
        job: Job to be validated

    Returns:
        True if successful
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    error_list = ErrorInterface()
    _exception_logger.info(
        'VALIDATOR_INFO: Beginning runValidation on job_id: %s', job_id)
    submission_id = job.submission_id
    # Row 1 is the header; data rows are counted from 2.
    rowNumber = 1
    fileType = job.file_type.name
    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == fileType][0]
    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()
    # If local, make the error report directory
    if self.isLocal and not os.path.exists(self.directory):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = job.filename
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(get_report_path(job, 'error'))
    warningFileName = self.getFileName(get_report_path(job, 'warning'))
    # Create File Status object
    createFileIfNeeded(job_id, fileName)
    reader = self.getReader()
    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        # Fix: measure the submitted file, not the error report.
        # Previously this queried errorFileName, which recorded the
        # wrong size (the error report is not even written yet); the
        # local branch below and the newer run_validation both size
        # the submitted file.
        fileSize = s3UrlHandler.getFileSize(fileName)
    else:
        fileSize = os.path.getsize(fileName)
    job.file_size = fileSize
    sess.commit()
    # Get fields for this file
    fields = sess.query(FileColumn).\
        filter(FileColumn.file_id == FILE_TYPE_DICT[fileType]).\
        all()
    # Detach field rows so later session commits leave them untouched.
    for field in fields:
        sess.expunge(field)
    csvSchema = {row.name_short: row for row in fields}
    try:
        # Pull file and return info on whether it's using short or long
        # col headers
        reader.open_file(regionName, bucketName, fileName, fields,
                         bucketName, errorFileName,
                         self.long_to_short_dict)
        # list to keep track of rows that fail validations
        errorRows = []
        # While not done, pull one row and put it into staging table if
        # it passes the Validator
        with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer, \
                self.getWriter(regionName, bucketName, warningFileName, self.reportHeaders) as warningWriter:
            while not reader.is_finished:
                rowNumber += 1
                if rowNumber % 10 == 0:
                    logger.info('loading row %s', rowNumber)
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skipRow, doneReading, rowErrorHere,
                 flex_cols) = self.readRecord(reader, writer, fileType,
                                              rowNumber, job, fields,
                                              error_list)
                if reduceRow:
                    rowNumber -= 1
                if rowErrorHere:
                    errorRows.append(rowNumber)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skipRow:
                    # Do not write this row to staging, but continue
                    # processing future rows
                    continue
                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and
                # FPDS) that perform their own basic validations, so
                # these validations are not repeated here
                if fileType in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid
                    # to trigger write to staging
                    passedValidations = True
                    valid = True
                else:
                    passedValidations, failures, valid = Validator.validate(
                        record, csvSchema)
                if valid:
                    skipRow = self.writeToStaging(
                        record, job, submission_id, passedValidations,
                        writer, rowNumber, model, error_list)
                    if flex_cols:
                        self.write_to_flex(flex_cols, job_id,
                                           submission_id, fileType)
                    if skipRow:
                        errorRows.append(rowNumber)
                        continue
                if not passedValidations:
                    if self.writeErrors(failures, job,
                                        self.short_to_long_dict, writer,
                                        warningWriter, rowNumber,
                                        error_list):
                        errorRows.append(rowNumber)

            _exception_logger.info(
                'VALIDATOR_INFO: Loading complete on job_id: %s. '
                'Total rows added to staging: %s', job_id, rowNumber)

            if fileType in ('appropriations', 'program_activity',
                            'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as
            # specified in the schema guidance. these validations are
            # sql-based.
            #
            sqlErrorRows = self.runSqlValidations(
                job, fileType, self.short_to_long_dict, writer,
                warningWriter, rowNumber, error_list)
            errorRows.extend(sqlErrorRows)

            # Write unfinished batch
            writer.finishBatch()
            warningWriter.finishBatch()

        # Calculate total number of rows in file
        # that passed validations
        errorRowsUnique = set(errorRows)
        totalRowsExcludingHeader = rowNumber - 1
        validRows = totalRowsExcludingHeader - len(errorRowsUnique)

        # Update job metadata
        job.number_of_rows = rowNumber
        job.number_of_rows_valid = validRows
        sess.commit()

        error_list.writeAllRowErrors(job_id)
        # Update error info for submission
        populateSubmissionErrorInfo(submission_id)
        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        markFileComplete(job_id, fileName)
    finally:
        # Ensure the file always closes
        reader.close()
        _exception_logger.info(
            'VALIDATOR_INFO: Completed L1 and SQL rule validations on '
            'job_id: %s', job_id)
    return True
def test_schema_rules(self):
    """Test schema rules."""
    def _typed(cls, type_name):
        # Instantiate the given rule-type class and set its name.
        rt = cls()
        rt.name = type_name
        return rt

    def _column_rule(rule_type, column, text_1, text_2=None):
        # Build a single-column Rule with timing id 1.
        r = Rule()
        r.rule_type = rule_type
        r.file_column = column
        r.rule_text_1 = text_1
        if text_2 is not None:
            r.rule_text_2 = text_2
        r.rule_timing_id = 1
        return r

    less_type = _typed(RuleType, "LESS")
    greater_type = _typed(RuleType, "GREATER")
    length_type = _typed(RuleType, "LENGTH")
    equal_type = _typed(RuleType, "EQUAL")
    not_equal_type = _typed(RuleType, "NOT EQUAL")
    in_set_type = _typed(RuleType, "IN_SET")
    sum_type = _typed(RuleType, "SUM")
    sum_to_value_type = _typed(MultiFieldRuleType, "SUM_TO_VALUE")

    schema = self.schema
    interfaces = self.interfaces

    rule1 = _column_rule(equal_type, schema["test1"], "hello")
    rule2 = _column_rule(not_equal_type, schema["test1"], "bye")
    rule3 = _column_rule(length_type, schema["test1"], "6")
    rule4 = _column_rule(equal_type, schema["test3"], "YES")
    rule5 = _column_rule(equal_type, schema["test4"], "44")
    rule6 = _column_rule(less_type, schema["test4"], "45")
    rule7 = _column_rule(greater_type, schema["test2"], ".5")
    rule8 = _column_rule(in_set_type, schema["test6"], "X, F, A")
    rule9 = _column_rule(sum_type, schema["test2"], "test7",
                         text_2="test2,test4,test5")

    # Multi-field rule: spans several columns, so it is registered in
    # the validation DB below instead of being passed to validate().
    rule10 = MultiFieldRule()
    rule10.rule_type = sum_to_value_type
    rule10.rule_text_1 = "46"
    rule10.rule_text_2 = "test2,test4,test5"
    rule10.rule_timing_id = 1

    vvi = ValidatorValidationInterface()
    fileId = vvi.getFileId("award")
    vvi.addMultiFieldRule(fileId, "SUM_TO_VALUE", rule10.rule_text_1,
                          rule10.rule_text_2,
                          "Evaluates the sum of fields to a number")

    rules = [rule1, rule2, rule3, rule4, rule5,
             rule6, rule7, rule8, rule9]

    # A record that satisfies every rule above.
    record = {
        "test1": "hello", "test2": "1.0", "test3": "YES",
        "test4": "44", "test5": "1", "test6": "X", "test7": "46"
    }
    self.assertTrue(Validator.validate(
        record, rules, schema, "award", interfaces)[0])

    # A record that violates each of the rules checked below.
    record = {
        "test1": "goodbye", "test2": ".4", "test3": "NO",
        "test4": "45", "test5": "1", "test6": "Q", "test7": "46.5"
    }
    # Each rule must reject the record on its own ...
    for failing_rule in (rule3, rule4, rule5, rule6, rule7, rule8, rule9):
        self.assertFalse(Validator.validate(
            record, [failing_rule], schema, "award", interfaces)[0])
    # ... and the combined rule list must reject it as well.
    self.assertFalse(Validator.validate(
        record, rules, schema, "award", interfaces)[0])