def write_all_row_errors(error_list, job_id):
    """ Persist every recorded row error belonging to the given job.

        Args:
            error_list: dict keeping track of error metadata to be updated
            job_id: ID to write errors for
    """
    sess = GlobalDB.db().session
    for error_dict in error_list.values():
        # Skip entries that were recorded for a different job
        if int(job_id) != int(error_dict["jobId"]):
            continue
        # Column values common to both branches below
        metadata = {
            'job_id': error_dict["jobId"],
            'filename': error_dict["filename"],
            'field_name': error_dict["fieldName"],
            'occurrences': error_dict["numErrors"],
            'first_row': error_dict["firstRow"],
            'original_rule_label': error_dict["originalRuleLabel"],
            'file_type_id': error_dict["fileTypeId"],
            'target_file_type_id': error_dict["targetFileId"],
            'severity_id': error_dict["severity"]
        }
        try:
            # A numeric errorType refers to one of our prestored messages
            error_type = int(error_dict["errorType"])
        except ValueError:
            # Otherwise errorType holds the literal rule-failure message
            error_msg = error_dict["errorType"]
            if "Field must be no longer than specified limit" in error_msg:
                metadata['error_type_id'] = ERROR_TYPE_DICT['length_error']
            else:
                metadata['error_type_id'] = ERROR_TYPE_DICT['rule_failed']
            metadata['rule_failed'] = error_msg
        else:
            # Cast succeeded: look up the canned message for this error type
            error_string = ValidationError.get_error_type_string(error_type)
            metadata['error_type_id'] = ERROR_TYPE_DICT[error_string]
            metadata['rule_failed'] = ValidationError.get_error_message(error_type)
        sess.add(ErrorMetadata(**metadata))
    # Commit the session to write all rows
    sess.commit()
def write_all_row_errors(self, job_id):
    """ Persist every recorded row error for the given job, then reset the tracker.

        Args:
            job_id: ID to write errors for
    """
    sess = GlobalDB.db().session
    for error_dict in self.rowErrors.values():
        # Skip entries that were recorded for a different job
        if int(job_id) != int(error_dict["jobId"]):
            continue
        # Column values common to both branches below
        metadata = {
            'job_id': error_dict["jobId"],
            'filename': error_dict["filename"],
            'field_name': error_dict["fieldName"],
            'occurrences': error_dict["numErrors"],
            'first_row': error_dict["firstRow"],
            'original_rule_label': error_dict["originalRuleLabel"],
            'file_type_id': error_dict["fileTypeId"],
            'target_file_type_id': error_dict["targetFileId"],
            'severity_id': error_dict["severity"]
        }
        try:
            # A numeric errorType refers to one of our prestored messages
            error_type = int(error_dict["errorType"])
        except ValueError:
            # Otherwise errorType holds the literal rule-failure message
            error_msg = error_dict["errorType"]
            if "Field must be no longer than specified limit" in error_msg:
                metadata['error_type_id'] = ERROR_TYPE_DICT['length_error']
            else:
                metadata['error_type_id'] = ERROR_TYPE_DICT['rule_failed']
            metadata['rule_failed'] = error_msg
        else:
            # Cast succeeded: look up the canned message for this error type
            error_string = ValidationError.get_error_type_string(error_type)
            metadata['error_type_id'] = ERROR_TYPE_DICT[error_string]
            metadata['rule_failed'] = ValidationError.get_error_message(error_type)
        sess.add(ErrorMetadata(**metadata))
    # Commit the session to write all rows
    sess.commit()
    # Clear the dictionary (note: this discards entries for ALL jobs, not just job_id)
    self.rowErrors = {}
def writeAllRowErrors(self, jobId):
    """ Writes all recorded errors to database.

        Args:
            jobId: ID to write errors for

        Note: despite the historical docstring claiming "True if successful",
        this method has no return statement and returns None. It also clears
        self.rowErrors for ALL jobs at the end, not just jobId.
    """
    for key in self.rowErrors.keys():
        errorDict = self.rowErrors[key]
        # Set info for this error
        thisJob = errorDict["jobId"]
        if (int(jobId) != int(thisJob)):
            # This row is for a different job, skip it
            continue
        fieldName = errorDict["fieldName"]
        try:
            # If last part of key is an int, it's one of our prestored messages
            errorType = int(errorDict["errorType"])
        except ValueError:
            # For rule failures, it will hold the error message
            errorMsg = errorDict["errorType"]
            ruleFailedId = self.getTypeId("rule_failed")
            errorRow = ErrorData(job_id=thisJob, filename=errorDict["filename"],
                                 field_name=fieldName, error_type_id=ruleFailedId,
                                 rule_failed=errorMsg,
                                 occurrences=errorDict["numErrors"],
                                 first_row=errorDict["firstRow"])
        else:
            # This happens if cast to int was successful
            errorString = ValidationError.getErrorTypeString(errorType)
            errorId = self.getTypeId(errorString)
            # Create error data with the canned message for this type
            errorRow = ErrorData(
                job_id=thisJob, filename=errorDict["filename"], field_name=fieldName,
                error_type_id=errorId, occurrences=errorDict["numErrors"],
                first_row=errorDict["firstRow"],
                rule_failed=ValidationError.getErrorMessage(errorType))
        self.session.add(errorRow)
    # Commit the session to write all rows
    self.session.commit()
    # Clear the dictionary
    self.rowErrors = {}
def write_file_error(job_id, filename, error_type, extra_info=None):
    """ Write a file-level error to the file table.

        Args:
            job_id: ID of job in job tracker
            filename: name of error report in S3
            error_type: type of error, value will be mapped to ValidationError class
            extra_info: dict of extra information to be included in file
                (keys used here: 'missing_headers', 'duplicated_headers')

        Raises:
            ValueError: if job_id cannot be converted to an int
    """
    sess = GlobalDB.db().session
    try:
        # Validate job_id is numeric before doing any database work
        int(job_id)
    except Exception:
        # Log the bad ID with structured context, then re-raise as ValueError
        logger.error({
            'message': 'Bad job_id: {}'.format(job_id),
            'message_type': 'CoreError',
            'job_id': job_id,
            'function': 'write_file_error'
        })
        raise ValueError('Bad job_id: {}'.format(job_id))
    # Get File object for this job ID or create it if it doesn't exist
    file_rec = create_file_if_needed(job_id, filename)
    # Mark error type and add header info if present
    file_rec.file_status_id = FILE_STATUS_DICT[
        ValidationError.get_error_type_string(error_type)]
    if extra_info is not None:
        if 'missing_headers' in extra_info:
            file_rec.headers_missing = extra_info['missing_headers']
        if 'duplicated_headers' in extra_info:
            file_rec.headers_duplicated = extra_info['duplicated_headers']
    sess.add(file_rec)
    sess.commit()
def writeFileError(job_id, filename, error_type, extra_info=None):
    """ Write a file-level error to the file table.

        Args:
            job_id: ID of job in job tracker
            filename: name of error report in S3
            error_type: type of error, value will be mapped to ValidationError class
            extra_info: dict of extra information to be included in file
                (keys used here: 'missing_headers', 'duplicated_headers')

        Raises:
            ValueError: if job_id cannot be converted to an int
    """
    sess = GlobalDB.db().session
    # Fail fast on a malformed job ID. Catch only what int() can raise:
    # the previous bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        int(job_id)
    except (TypeError, ValueError):
        raise ValueError("".join(["Bad jobId: ", str(job_id)]))
    # Get File object for this job ID or create it if it doesn't exist
    fileRec = createFileIfNeeded(job_id, filename)
    # Mark error type and add header info if present
    fileRec.file_status_id = FILE_STATUS_DICT[
        ValidationError.getErrorTypeString(error_type)]
    if extra_info is not None:
        if "missing_headers" in extra_info:
            fileRec.headers_missing = extra_info["missing_headers"]
        if "duplicated_headers" in extra_info:
            fileRec.headers_duplicated = extra_info["duplicated_headers"]
    sess.add(fileRec)
    sess.commit()
def writeFileError(self, jobId, filename, errorType, extraInfo=None):
    """ Write a file-level error to the file table.

        Args:
            jobId: ID of job in job tracker
            filename: name of error report in S3
            errorType: type of error, value will be mapped to ValidationError class
            extraInfo: dict of extra header information to record, if any
                (keys used here: 'missing_headers', 'duplicated_headers')

        Returns:
            True if successful

        Raises:
            ValueError: if jobId cannot be converted to an int
    """
    # Fail fast on a malformed job ID. Catch only what int() can raise:
    # the previous bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        int(jobId)
    except (TypeError, ValueError):
        raise ValueError("".join(["Bad jobId: ", str(jobId)]))
    # Get File object for this job ID or create it if it doesn't exist
    fileRec = self.createFileIfNeeded(jobId, filename)
    # Mark error type and add header info if present
    fileRec.file_status_id = self.getFileStatusId(
        ValidationError.getErrorTypeString(errorType))
    if extraInfo is not None:
        if "missing_headers" in extraInfo:
            fileRec.headers_missing = extraInfo["missing_headers"]
        if "duplicated_headers" in extraInfo:
            fileRec.headers_duplicated = extraInfo["duplicated_headers"]
    self.session.add(fileRec)
    self.session.commit()
    return True
def write_file_error(job_id, filename, error_type, extra_info=None):
    """ Write a file-level error to the file table.

        Args:
            job_id: ID of job in job tracker
            filename: name of error report in S3
            error_type: type of error, value will be mapped to ValidationError class
            extra_info: dict of extra information to be included in file
                (keys used here: 'missing_headers', 'duplicated_headers')

        Raises:
            ValueError: if job_id cannot be converted to an int
    """
    sess = GlobalDB.db().session
    # Fail fast on a malformed job ID. Catch only what int() can raise:
    # the previous bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        int(job_id)
    except (TypeError, ValueError):
        raise ValueError("".join(["Bad jobId: ", str(job_id)]))
    # Get File object for this job ID or create it if it doesn't exist
    file_rec = create_file_if_needed(job_id, filename)
    # Mark error type and add header info if present
    file_rec.file_status_id = FILE_STATUS_DICT[
        ValidationError.get_error_type_string(error_type)]
    if extra_info is not None:
        if "missing_headers" in extra_info:
            file_rec.headers_missing = extra_info["missing_headers"]
        if "duplicated_headers" in extra_info:
            file_rec.headers_duplicated = extra_info["duplicated_headers"]
    sess.add(file_rec)
    sess.commit()
def writeFileError(self, jobId, filename, errorType, extraInfo=None):
    """ Write a file-level error to the file status table.

        Args:
            jobId: ID of job in job tracker
            filename: name of error report in S3
            errorType: type of error, value will be mapped to ValidationError class
            extraInfo: dict of extra header information to record, if any
                (keys used here: 'missing_headers', 'duplicated_headers')

        Returns:
            True if successful

        Raises:
            ValueError: if jobId cannot be converted to an int
    """
    # Fail fast on a malformed job ID. Catch only what int() can raise:
    # the previous bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        int(jobId)
    except (TypeError, ValueError):
        raise ValueError("".join(["Bad jobId: ", str(jobId)]))
    # Get File Status for this job ID or create it if it doesn't exist
    fileStatus = self.createFileStatusIfNeeded(jobId, filename)
    # Mark error type and add header info if present
    fileStatus.status_id = self.getStatusId(
        ValidationError.getErrorTypeString(errorType))
    if extraInfo is not None:
        if "missing_headers" in extraInfo:
            fileStatus.headers_missing = extraInfo["missing_headers"]
        if "duplicated_headers" in extraInfo:
            fileStatus.headers_duplicated = extraInfo["duplicated_headers"]
    self.session.add(fileStatus)
    self.session.commit()
    return True
def run_sql_validations(self, job, file_type, short_colnames, writer, warning_writer, row_number, error_list):
    """ Run all SQL rules for this file type.

        Args:
            job: Current job
            file_type: Type of file for current job
            short_colnames: Dict mapping short field names to long
            writer: CsvWriter object for fatal errors
            warning_writer: CsvWriter for warnings
            row_number: Current row number (recorded for every failure)
            error_list: instance of ErrorInterface to keep track of errors

        Returns:
            a list of the row numbers that failed one of the sql-based validations
    """
    job_id = job.job_id
    error_rows = []
    sql_failures = validate_file_by_sql(
        job, file_type, self.short_to_long_dict[job.file_type_id])
    for failure in sql_failures:
        # convert shorter, machine friendly column names used in the
        # SQL validation queries back to their long names
        if failure.field_name in short_colnames:
            field_name = short_colnames[failure.field_name]
        else:
            field_name = failure.field_name
        # Only fatal failures mark the row itself as failed
        if failure.severity_id == RULE_SEVERITY_DICT['fatal']:
            error_rows.append(failure.row)
        try:
            # If error is an int, it's one of our prestored messages
            error_type = int(failure.error)
            error_msg = ValidationError.get_error_message(error_type)
        except ValueError:
            # If not, treat it literally
            error_msg = failure.error
        if failure.severity_id == RULE_SEVERITY_DICT['fatal']:
            writer.writerow([field_name, error_msg, str(failure.row),
                             failure.failed_value, failure.original_label])
        elif failure.severity_id == RULE_SEVERITY_DICT['warning']:
            # write to warnings file
            warning_writer.writerow([field_name, error_msg, str(failure.row),
                                     failure.failed_value, failure.original_label])
        # NOTE(review): row_number (the reader's current row) is recorded here,
        # not failure.row — confirm that is the intended metadata value
        error_list.record_row_error(job_id, job.filename, field_name, failure.error,
                                    row_number, failure.original_label,
                                    failure.file_type_id, failure.target_file_id,
                                    failure.severity_id)
    return error_rows
def writeErrors(self, failures, job, short_colnames, writer, warning_writer, row_number, error_list):
    """ Write errors to error database.

        Args:
            failures: List of errors to be written; each failure is a sequence of
                (field, error, failed value, original rule label, severity name)
            job: Current job
            short_colnames: Dict mapping short names to long names
            writer: CsvWriter object for fatal errors
            warning_writer: CsvWriter object for warnings
            row_number: Current row number
            error_list: instance of ErrorInterface to keep track of errors

        Returns:
            True if any fatal errors were found, False if only warnings are present
    """
    job_id = job.job_id
    fatal_error_found = False
    # For each failure, record it in error report and metadata
    for failure in failures:
        # map short column names back to long names
        if failure[0] in short_colnames:
            field_name = short_colnames[failure[0]]
        else:
            field_name = failure[0]
        error = failure[1]
        failed_value = failure[2]
        original_rule_label = failure[3]
        severityId = RULE_SEVERITY_DICT[failure[4]]
        try:
            # If error is an int, it's one of our prestored messages
            error_type = int(error)
            error_msg = ValidationError.getErrorMessage(error_type)
        except ValueError:
            # If not, treat it literally
            error_msg = error
        if failure[4] == "fatal":
            fatal_error_found = True
            writer.write([field_name, error_msg, str(row_number), failed_value,
                          original_rule_label])
        elif failure[4] == "warning":
            # write to warnings file
            warning_writer.write([field_name, error_msg, str(row_number), failed_value,
                                  original_rule_label])
        # Every failure (fatal or warning) is recorded in the error metadata
        error_list.recordRowError(job_id, job.filename, field_name, error, row_number,
                                  original_rule_label, severity_id=severityId)
    return fatal_error_found
def writeErrors(self, failures, interfaces, jobId, shortColnames, writer, warningWriter, rowNumber):
    """ Write errors to error database.

        Args:
            failures: List of errors to be written; each failure is a sequence of
                (field, error, failed value, original rule label, severity name)
            interfaces: InterfaceHolder object
            jobId: ID of current job
            shortColnames: Dict mapping short names to long names
            writer: CsvWriter object for fatal errors
            warningWriter: CsvWriter object for warnings
            rowNumber: Current row number

        Returns:
            True if any fatal errors were found, False if only warnings are present
    """
    fatalErrorFound = False
    errorInterface = interfaces.errorDb
    # For each failure, record it in error report and metadata
    for failure in failures:
        # map short column names back to long names
        if failure[0] in shortColnames:
            fieldName = shortColnames[failure[0]]
        else:
            fieldName = failure[0]
        error = failure[1]
        failedValue = failure[2]
        originalRuleLabel = failure[3]
        severityId = interfaces.validationDb.getRuleSeverityId(failure[4])
        try:
            # If error is an int, it's one of our prestored messages
            errorType = int(error)
            errorMsg = ValidationError.getErrorMessage(errorType)
        except ValueError:
            # If not, treat it literally
            errorMsg = error
        if failure[4] == "fatal":
            fatalErrorFound = True
            writer.write([fieldName, errorMsg, str(rowNumber), failedValue,
                          originalRuleLabel])
        elif failure[4] == "warning":
            # write to warnings file
            warningWriter.write([fieldName, errorMsg, str(rowNumber), failedValue,
                                 originalRuleLabel])
        # Every failure (fatal or warning) is recorded in the error metadata
        errorInterface.recordRowError(jobId, self.filename, fieldName, error, rowNumber,
                                      originalRuleLabel, severity_id=severityId)
    return fatalErrorFound
def run_sql_validations(self, job, file_type, short_colnames, writer, warning_writer, row_number, error_list):
    """ Run all SQL rules for this file type.

        Args:
            job: Current job
            file_type: Type of file for current job
            short_colnames: Dict mapping short field names to long
            writer: CsvWriter object for fatal errors
            warning_writer: CsvWriter for warnings
            row_number: Current row number (recorded for every failure)
            error_list: instance of ErrorInterface to keep track of errors

        Returns:
            a list of the row numbers that failed one of the sql-based validations
    """
    job_id = job.job_id
    error_rows = []
    sql_failures = validate_file_by_sql(job, file_type, self.short_to_long_dict)
    for failure in sql_failures:
        # convert shorter, machine friendly column names used in the
        # SQL validation queries back to their long names
        if failure.field_name in short_colnames:
            field_name = short_colnames[failure.field_name]
        else:
            field_name = failure.field_name
        # Only fatal failures mark the row itself as failed
        if failure.severity_id == RULE_SEVERITY_DICT['fatal']:
            error_rows.append(failure.row)
        try:
            # If error is an int, it's one of our prestored messages
            error_type = int(failure.error)
            error_msg = ValidationError.get_error_message(error_type)
        except ValueError:
            # If not, treat it literally
            error_msg = failure.error
        if failure.severity_id == RULE_SEVERITY_DICT['fatal']:
            writer.write([field_name, error_msg, str(failure.row), failure.failed_value,
                          failure.original_label])
        elif failure.severity_id == RULE_SEVERITY_DICT['warning']:
            # write to warnings file
            warning_writer.write([field_name, error_msg, str(failure.row),
                                  failure.failed_value, failure.original_label])
        # NOTE(review): row_number (the reader's current row) is recorded here,
        # not failure.row — confirm that is the intended metadata value
        error_list.record_row_error(job_id, job.filename, field_name, failure.error,
                                    row_number, failure.original_label,
                                    failure.file_type_id, failure.target_file_id,
                                    failure.severity_id)
    return error_rows
def run_sql_validations(self, short_colnames, writer, warning_writer):
    """ Run all SQL rules for this file type.

        Args:
            short_colnames: Dict mapping short field names to long
            writer: CsvWriter object for error file
            warning_writer: CsvWriter object for warning file

        Returns:
            a list of the row numbers that failed one of the sql-based validations
    """
    error_rows = []
    sql_failures = validate_file_by_sql(
        self.job, self.file_type.name,
        self.short_to_long_dict[self.file_type.file_type_id])
    for failure in sql_failures:
        # convert shorter, machine friendly column names used in the
        # SQL validation queries back to their long names
        if failure.field_name in short_colnames:
            field_name = short_colnames[failure.field_name]
        else:
            field_name = failure.field_name
        # Only fatal failures mark the row itself as failed
        if failure.severity_id == RULE_SEVERITY_DICT['fatal']:
            error_rows.append(failure.row)
        try:
            # If error is an int, it's one of our prestored messages
            error_type = int(failure.error)
            error_msg = ValidationError.get_error_message(error_type)
        except ValueError:
            # If not, treat it literally
            error_msg = failure.error
        if failure.severity_id == RULE_SEVERITY_DICT['fatal']:
            writer.writerow([failure.unique_id, field_name, error_msg,
                             failure.failed_value, failure.expected_value,
                             failure.difference, failure.flex_fields,
                             str(failure.row), failure.original_label])
        elif failure.severity_id == RULE_SEVERITY_DICT['warning']:
            # write to warnings file
            warning_writer.writerow([failure.unique_id, field_name, error_msg,
                                     failure.failed_value, failure.expected_value,
                                     failure.difference, failure.flex_fields,
                                     str(failure.row), failure.original_label])
        # labeled errors: recorded against the running row total, not failure.row
        self.error_list.record_row_error(self.job.job_id, self.file_name, field_name,
                                         failure.error, self.total_rows,
                                         failure.original_label, failure.file_type_id,
                                         failure.target_file_id, failure.severity_id)
    return error_rows
def write_errors(failures, job, short_colnames, writer, warning_writer, row_number, error_list):
    """ Record each Failure in the error report and error metadata.

        Args:
            failures: List of Failures to be written
            job: Current job
            short_colnames: Dict mapping short names to long names
            writer: CsvWriter object for fatal errors
            warning_writer: CsvWriter object for warnings
            row_number: Current row number
            error_list: instance of ErrorInterface to keep track of errors

        Returns:
            True if any fatal errors were found, False if only warnings are present
    """
    fatal_error_found = False
    for failure in failures:
        # translate the short (machine-friendly) column name back to its long form
        field_name = short_colnames.get(failure.field, failure.field)
        severity_id = RULE_SEVERITY_DICT[failure.severity]
        try:
            # a numeric description refers to one of our prestored messages
            error_msg = ValidationError.get_error_message(int(failure.description))
        except ValueError:
            # otherwise the description IS the message
            error_msg = failure.description
        report_row = [field_name, error_msg, str(row_number), failure.value, failure.label]
        if failure.severity == 'fatal':
            fatal_error_found = True
            writer.write(report_row)
        elif failure.severity == 'warning':
            # warnings go to their own report
            warning_writer.write(report_row)
        # every failure, fatal or warning, lands in the error metadata
        error_list.record_row_error(job.job_id, job.filename, field_name,
                                    failure.description, row_number, failure.label,
                                    severity_id=severity_id)
    return fatal_error_found
def writeAllRowErrors(self, jobId):
    """ Writes all recorded errors to database.

        Args:
            jobId: ID to write errors for

        Note: despite the historical docstring claiming "True if successful",
        this method has no return statement and returns None. It also clears
        self.rowErrors for ALL jobs at the end, not just jobId.
    """
    for key in self.rowErrors.keys():
        errorDict = self.rowErrors[key]
        # Set info for this error
        thisJob = errorDict["jobId"]
        if (int(jobId) != int(thisJob)):
            # This row is for a different job, skip it
            continue
        fieldName = errorDict["fieldName"]
        try:
            # If last part of key is an int, it's one of our prestored messages
            errorType = int(errorDict["errorType"])
        except ValueError:
            # For rule failures, it will hold the error message
            errorMsg = errorDict["errorType"]
            ruleFailedId = self.getTypeId("rule_failed")
            errorRow = ErrorMetadata(job_id=thisJob, filename=errorDict["filename"],
                                     field_name=fieldName, error_type_id=ruleFailedId,
                                     rule_failed=errorMsg,
                                     occurrences=errorDict["numErrors"],
                                     first_row=errorDict["firstRow"])
        else:
            # This happens if cast to int was successful
            errorString = ValidationError.getErrorTypeString(errorType)
            errorId = self.getTypeId(errorString)
            # Create error metadata with the canned message for this type
            errorRow = ErrorMetadata(job_id=thisJob, filename=errorDict["filename"],
                                     field_name=fieldName, error_type_id=errorId,
                                     occurrences=errorDict["numErrors"],
                                     first_row=errorDict["firstRow"],
                                     rule_failed=ValidationError.getErrorMessage(errorType))
        self.session.add(errorRow)
    # Commit the session to write all rows
    self.session.commit()
    # Clear the dictionary
    self.rowErrors = {}
def write_errors(failures, job, short_colnames, writer, warning_writer, row_number, error_list, flex_cols):
    """ Write errors to error database.

        Args:
            failures: List of Failures to be written
            job: Current job
            short_colnames: Dict mapping short names to long names
            writer: CsvWriter object for fatal errors
            warning_writer: CsvWriter object for warnings
            row_number: Current row number
            error_list: instance of ErrorInterface to keep track of errors
            flex_cols: all flex columns for this row

        Returns:
            True if any fatal errors were found, False if only warnings are present
    """
    fatal_error_found = False
    # prepare flex cols for all the errors for this row
    flex_col_headers = []
    flex_col_cells = []
    if flex_cols:
        for flex_col in flex_cols:
            flex_col_headers.append(flex_col.header)
            # empty flex cells are rendered as "header: " with no value
            flex_val = flex_col.cell if flex_col.cell else ""
            flex_col_cells.append(flex_col.header + ": " + flex_val)
    # For each failure, record it in error report and metadata
    for failure in failures:
        # map short column names back to long names
        if failure.field in short_colnames:
            field_name = short_colnames[failure.field]
        else:
            field_name = failure.field
        severity_id = RULE_SEVERITY_DICT[failure.severity]
        try:
            # If error is an int, it's one of our prestored messages
            error_type = int(failure.description)
            error_msg = ValidationError.get_error_message(error_type)
        except ValueError:
            # If not, treat it literally
            error_msg = failure.description
        # get flex fields
        field_names = [field_name]
        flex_list = []
        # only add the value if there's something to add, otherwise our join will look bad
        if failure.value:
            flex_list = [field_name + ": " + failure.value]
        # append whatever list we made of flex columns to our existing field names and content list
        field_names.extend(flex_col_headers)
        flex_list.extend(flex_col_cells)
        # join the field names and flex column values so we have a list instead of a single value
        combined_field_names = ", ".join(field_names)
        fail_value = ", ".join(flex_list)
        if failure.severity == 'fatal':
            fatal_error_found = True
            writer.writerow([combined_field_names, error_msg, str(row_number),
                             fail_value, failure.label])
        elif failure.severity == 'warning':
            # write to warnings file
            warning_writer.writerow([combined_field_names, error_msg, str(row_number),
                                     fail_value, failure.label])
        # metadata records the combined (long + flex) field-name string
        error_list.record_row_error(job.job_id, job.filename, combined_field_names,
                                    failure.description, row_number, failure.label,
                                    severity_id=severity_id)
    return fatal_error_found
def runSqlValidations(self, job, file_type, short_colnames, writer, warning_writer, row_number, error_list):
    """ Run all SQL rules for this file type.

        Args:
            job: Current job
            file_type: Type of file for current job
            short_colnames: Dict mapping short field names to long
            writer: CsvWriter object for fatal errors
            warning_writer: CsvWriter for warnings
            row_number: Current row number (recorded for every failure)
            error_list: instance of ErrorInterface to keep track of errors

        Returns:
            a list of the row numbers that failed one of the sql-based validations
    """
    # NOTE(review): sess is never used in this method — candidate for removal
    sess = GlobalDB.db().session
    job_id = job.job_id
    error_rows = []
    sql_failures = Validator.validateFileBySql(
        job.submission_id, file_type, self.short_to_long_dict)
    for failure in sql_failures:
        # convert shorter, machine friendly column names used in the
        # SQL validation queries back to their long names
        if failure[0] in short_colnames:
            field_name = short_colnames[failure[0]]
        else:
            field_name = failure[0]
        # unpack the positional failure tuple
        error = failure[1]
        failed_value = failure[2]
        row = failure[3]
        original_label = failure[4]
        file_type_id = failure[5]
        target_file_id = failure[6]
        severity_id = failure[7]
        # Only fatal failures mark the row itself as failed
        if severity_id == RULE_SEVERITY_DICT['fatal']:
            error_rows.append(row)
        try:
            # If error is an int, it's one of our prestored messages
            error_type = int(error)
            error_msg = ValidationError.getErrorMessage(error_type)
        except ValueError:
            # If not, treat it literally
            error_msg = error
        if severity_id == RULE_SEVERITY_DICT['fatal']:
            writer.write([field_name, error_msg, str(row), failed_value, original_label])
        elif severity_id == RULE_SEVERITY_DICT['warning']:
            # write to warnings file
            warning_writer.write([field_name, error_msg, str(row), failed_value,
                                  original_label])
        # NOTE(review): row_number (the reader's current row) is recorded here,
        # not the failing row — confirm that is the intended metadata value
        error_list.recordRowError(job_id, job.filename, field_name, error, row_number,
                                  original_label, file_type_id=file_type_id,
                                  target_file_id=target_file_id,
                                  severity_id=severity_id)
    return error_rows
def runValidation(self, jobId, interfaces):
    """ Run validations for specified job.

        Args:
            jobId: Job to be validated
            interfaces: InterfaceHolder with job tracker / error / validation /
                staging database interfaces

        Returns:
            True if successful
    """
    jobTracker = interfaces.jobDb
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # If local, make the error report directory
    if(self.isLocal and not os.path.exists(self.directory)):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(jobTracker.getReportPath(jobId))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId,fileName)
    # Pull schema, field list, and rules for this file type
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType)
    rules = validationDB.getRulesByFile(fileType)
    reader = self.getReader()
    # Get file size and write to jobs table
    if(CONFIG_BROKER["use_aws"]):
        fileSize = s3UrlHandler.getFileSize("errors/"+jobTracker.getReportPath(jobId))
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    try:
        # Pull file
        reader.openFile(regionName, bucketName, fileName,fieldList,bucketName,errorFileName)
        # Create staging table
        tableName = interfaces.stagingDb.getTableName(jobId)
        tableObject = StagingTable(interfaces)
        tableObject.createTable(fileType,fileName,jobId,tableName)
        errorInterface = interfaces.errorDb
        # While not done, pull one row and put it into staging if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer:
            while(not reader.isFinished):
                rowNumber += 1
                #if (rowNumber % 1000) == 0:
                #    print("Validating row " + str(rowNumber))
                try:
                    record = FieldCleaner.cleanRow(reader.getNextRecord(), fileType, validationDB)
                    record["row"] = rowNumber
                    if(reader.isFinished and len(record) < 2):
                        # This is the last line and is empty, don't record an error
                        rowNumber -= 1 # Don't count this row
                        break
                except ResponseException as e:
                    if reader.isFinished and reader.extraLine:
                        # Last line may be blank; don't record an error.
                        # reader.extraLine flags the case where the last valid
                        # line had extra line breaks. Don't count the empty row.
                        rowNumber -= 1
                    else:
                        writer.write(["Formatting Error", ValidationError.readErrorMsg, str(rowNumber), ""])
                        errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.readError,rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                    continue
                valid, failures = Validator.validate(record,rules,csvSchema,fileType,interfaces)
                if(valid):
                    try:
                        tableObject.insert(record,fileType)
                    except ResponseException as e:
                        # Write failed, move to next record
                        writer.write(["Formatting Error", ValidationError.writeErrorMsg, str(rowNumber),""])
                        errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.writeError,rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                        continue
                else:
                    # For each failure, record it in error report and metadata
                    if failures:
                        errorInterface.setRowErrorsPresent(jobId, True)
                    for failure in failures:
                        fieldName = failure[0]
                        error = failure[1]
                        failedValue = failure[2]
                        try:
                            # If error is an int, it's one of our prestored messages
                            errorType = int(error)
                            errorMsg = ValidationError.getErrorMessage(errorType)
                        except ValueError:
                            # If not, treat it literally
                            errorMsg = error
                        writer.write([fieldName,errorMsg,str(rowNumber),failedValue])
                        errorInterface.recordRowError(jobId,self.filename,fieldName,error,rowNumber)
            # Write unfinished batch
            writer.finishBatch()
        # Write number of rows to job table
        jobTracker.setNumberOfRowsById(jobId,rowNumber)
        # Write leftover records
        tableObject.endBatch()
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId,"finished")
        errorInterface.writeAllRowErrors(jobId)
    finally:
        # ensure the file always closes
        reader.close()
    return True
def runValidation(self, jobId, interfaces):
    """ Run validations for specified job.

        Args:
            jobId: Job to be validated
            interfaces: InterfaceHolder with job tracker / error / validation /
                staging database interfaces

        Returns:
            True if successful
    """
    jobTracker = interfaces.jobDb
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # If local, make the error report directory
    if (self.isLocal and not os.path.exists(self.directory)):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(jobTracker.getReportPath(jobId))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId, fileName)
    # Pull schema, field list, and rules for this file type
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType)
    rules = validationDB.getRulesByFile(fileType)
    reader = self.getReader()
    # Get file size and write to jobs table
    if (CONFIG_BROKER["use_aws"]):
        fileSize = s3UrlHandler.getFileSize(
            "errors/" + jobTracker.getReportPath(jobId))
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    try:
        # Pull file
        reader.openFile(regionName, bucketName, fileName, fieldList,
                        bucketName, errorFileName)
        # Create staging table
        tableName = interfaces.stagingDb.getTableName(jobId)
        tableObject = StagingTable(interfaces)
        tableObject.createTable(fileType, fileName, jobId, tableName)
        errorInterface = interfaces.errorDb
        # While not done, pull one row and put it into staging if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer:
            while (not reader.isFinished):
                rowNumber += 1
                #if (rowNumber % 1000) == 0:
                #    print("Validating row " + str(rowNumber))
                try:
                    record = FieldCleaner.cleanRow(reader.getNextRecord(), fileType, validationDB)
                    record["row"] = rowNumber
                    if (reader.isFinished and len(record) < 2):
                        # This is the last line and is empty, don't record an error
                        rowNumber -= 1  # Don't count this row
                        break
                except ResponseException as e:
                    if reader.isFinished and reader.extraLine:
                        # Last line may be blank; don't record an error.
                        # reader.extraLine flags the case where the last valid
                        # line had extra line breaks. Don't count the empty row.
                        rowNumber -= 1
                    else:
                        writer.write([
                            "Formatting Error", ValidationError.readErrorMsg,
                            str(rowNumber), ""
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, "Formatting Error",
                            ValidationError.readError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                    continue
                valid, failures = Validator.validate(
                    record, rules, csvSchema, fileType, interfaces)
                if (valid):
                    try:
                        tableObject.insert(record, fileType)
                    except ResponseException as e:
                        # Write failed, move to next record
                        writer.write([
                            "Formatting Error", ValidationError.writeErrorMsg,
                            str(rowNumber), ""
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, "Formatting Error",
                            ValidationError.writeError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                        continue
                else:
                    # For each failure, record it in error report and metadata
                    if failures:
                        errorInterface.setRowErrorsPresent(jobId, True)
                    for failure in failures:
                        fieldName = failure[0]
                        error = failure[1]
                        failedValue = failure[2]
                        try:
                            # If error is an int, it's one of our prestored messages
                            errorType = int(error)
                            errorMsg = ValidationError.getErrorMessage(
                                errorType)
                        except ValueError:
                            # If not, treat it literally
                            errorMsg = error
                        writer.write([
                            fieldName, errorMsg, str(rowNumber), failedValue
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, fieldName, error, rowNumber)
            # Write unfinished batch
            writer.finishBatch()
        # Write number of rows to job table
        jobTracker.setNumberOfRowsById(jobId, rowNumber)
        # Write leftover records
        tableObject.endBatch()
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId, "finished")
        errorInterface.writeAllRowErrors(jobId)
    finally:
        # ensure the file always closes
        reader.close()
    return True
def runSqlValidations(self, interfaces, jobId, fileType, shortColnames, writer, warningWriter, rowNumber):
    """ Run all SQL rules for this file type.

        Args:
            interfaces: InterfaceHolder object
            jobId: ID of current job
            fileType: Type of file for current job
            shortColnames: Dict mapping short field names to long
            writer: CsvWriter object for fatal errors
            warningWriter: CsvWriter for warnings
            rowNumber: Current row number (recorded for every failure)

        Returns:
            a list of the row numbers that failed one of the sql-based validations
    """
    errorInterface = interfaces.errorDb
    errorRows = []
    sqlFailures = Validator.validateFileBySql(
        interfaces.jobDb.getSubmissionId(jobId), fileType, interfaces)
    for failure in sqlFailures:
        # convert shorter, machine friendly column names used in the
        # SQL validation queries back to their long names
        if failure[0] in shortColnames:
            fieldName = shortColnames[failure[0]]
        else:
            fieldName = failure[0]
        # unpack the positional failure tuple
        error = failure[1]
        failedValue = failure[2]
        row = failure[3]
        original_label = failure[4]
        fileTypeId = failure[5]
        targetFileId = failure[6]
        severityId = failure[7]
        # Only fatal failures mark the row itself as failed
        if severityId == interfaces.validationDb.getRuleSeverityId(
                "fatal"):
            errorRows.append(row)
        try:
            # If error is an int, it's one of our prestored messages
            errorType = int(error)
            errorMsg = ValidationError.getErrorMessage(errorType)
        except ValueError:
            # If not, treat it literally
            errorMsg = error
        if severityId == interfaces.validationDb.getRuleSeverityId(
                "fatal"):
            writer.write([
                fieldName, errorMsg, str(row), failedValue, original_label
            ])
        elif severityId == interfaces.validationDb.getRuleSeverityId(
                "warning"):
            # write to warnings file
            warningWriter.write([
                fieldName, errorMsg, str(row), failedValue, original_label
            ])
        # NOTE(review): rowNumber (the reader's current row) is recorded here,
        # not the failing row — confirm that is the intended metadata value
        errorInterface.recordRowError(jobId, self.filename, fieldName, error,
                                      rowNumber, original_label,
                                      file_type_id=fileTypeId,
                                      target_file_id=targetFileId,
                                      severity_id=severityId)
    return errorRows