def writeErrors(self, failures, job, short_colnames, writer,
                warning_writer, row_number, error_list):
        """ Report every failure of one row to the CSV reports and error_list

        Args:
            failures: Iterable of failures, each indexed as
                [0] column name, [1] error code or message, [2] failed value,
                [3] original rule label, [4] severity name
            job: Current job (job_id and filename are read from it)
            short_colnames: Dict mapping short names to long names
            writer: CsvWriter object that receives "fatal" failures
            warning_writer: CsvWriter object that receives "warning" failures
            row_number: Current row number
            error_list: instance of ErrorInterface to keep track of errors
        Returns:
            True if any fatal errors were found, False otherwise
        """
        current_job_id = job.job_id
        saw_fatal = False
        for entry in failures:
            # Prefer the long column name whenever a mapping exists
            column = entry[0]
            long_name = short_colnames.get(column, column)
            raw_error = entry[1]
            bad_value = entry[2]
            rule_label = entry[3]
            severity_name = entry[4]
            severity_id = RULE_SEVERITY_DICT[severity_name]

            try:
                # An int-like error refers to one of the prestored messages
                message = ValidationError.getErrorMessage(int(raw_error))
            except ValueError:
                # Anything else is already the message text
                message = raw_error

            report_row = [
                long_name, message,
                str(row_number), bad_value, rule_label
            ]
            if severity_name == "fatal":
                saw_fatal = True
                writer.write(report_row)
            elif severity_name == "warning":
                # Warnings go to their own report file
                warning_writer.write(report_row)

            # Every failure is recorded, whatever its severity
            error_list.recordRowError(current_job_id,
                                      job.filename,
                                      long_name,
                                      raw_error,
                                      row_number,
                                      rule_label,
                                      severity_id=severity_id)
        return saw_fatal
Beispiel #2
0
    def writeErrors(self, failures, interfaces, jobId, shortColnames, writer,
                    warningWriter, rowNumber):
        """ Report every failure of one row to the CSV reports and error database

        Args:
            failures: Iterable of failures, each indexed as
                [0] column name, [1] error code or message, [2] failed value,
                [3] original rule label, [4] severity name
            interfaces: InterfaceHolder object (errorDb and validationDb are used)
            jobId: ID of current job
            shortColnames: Dict mapping short names to long names
            writer: CsvWriter object that receives "fatal" failures
            warningWriter: CsvWriter object that receives "warning" failures
            rowNumber: Current row number
        Returns:
            True if any fatal errors were found, False otherwise
        """
        errorDb = interfaces.errorDb
        sawFatal = False
        for entry in failures:
            # Prefer the long column name whenever a mapping exists
            column = entry[0]
            longName = shortColnames.get(column, column)
            rawError = entry[1]
            badValue = entry[2]
            ruleLabel = entry[3]
            severityName = entry[4]
            severityId = interfaces.validationDb.getRuleSeverityId(severityName)

            try:
                # An int-like error refers to one of the prestored messages
                message = ValidationError.getErrorMessage(int(rawError))
            except ValueError:
                # Anything else is already the message text
                message = rawError

            reportRow = [
                longName, message,
                str(rowNumber), badValue, ruleLabel
            ]
            if severityName == "fatal":
                sawFatal = True
                writer.write(reportRow)
            elif severityName == "warning":
                # Warnings go to their own report file
                warningWriter.write(reportRow)

            # Every failure is recorded, whatever its severity
            errorDb.recordRowError(jobId,
                                   self.filename,
                                   longName,
                                   rawError,
                                   rowNumber,
                                   ruleLabel,
                                   severity_id=severityId)
        return sawFatal
    def writeAllRowErrors(self, job_id):
        """ Writes all recorded errors for one job to database

        Entries in self.rowErrors that belong to a different job are not
        written; they are kept in self.rowErrors so they can still be written
        by a later call for their own job. Entries for job_id are removed once
        the session has been committed.

        Args:
            job_id: ID to write errors for
        """
        sess = GlobalDB.db().session
        # Entries skipped because they belong to another job
        otherJobs = {}
        for key, errorDict in self.rowErrors.items():
            # Set info for this error
            thisJob = errorDict["jobId"]
            if int(job_id) != int(thisJob):
                # This row is for a different job, keep it for later
                otherJobs[key] = errorDict
                continue
            field_name = errorDict["fieldName"]
            rawType = errorDict["errorType"]
            try:
                # If the type is an int, it's one of our prestored messages
                error_type = int(rawType)
            except ValueError:
                # For rule failures, it will hold the error message
                if "Field must be no longer than specified limit" in rawType:
                    errorId = ERROR_TYPE_DICT['length_error']
                else:
                    errorId = ERROR_TYPE_DICT['rule_failed']
                ruleFailed = rawType
            else:
                # This happens if cast to int was successful
                errorString = ValidationError.getErrorTypeString(error_type)
                errorId = ERROR_TYPE_DICT[errorString]
                ruleFailed = ValidationError.getErrorMessage(error_type)

            # Create error metadata; only the type id and rule text differ
            # between prestored errors and rule failures
            errorRow = ErrorMetadata(job_id=thisJob, filename=errorDict["filename"], field_name=field_name,
                                     error_type_id=errorId, rule_failed=ruleFailed,
                                     occurrences=errorDict["numErrors"], first_row=errorDict["firstRow"],
                                     original_rule_label=errorDict["originalRuleLabel"],
                                     file_type_id=errorDict["fileTypeId"],
                                     target_file_type_id=errorDict["targetFileId"],
                                     severity_id=errorDict["severity"])
            sess.add(errorRow)

        # Commit the session to write all rows
        sess.commit()
        # Drop only what was written; errors of other jobs stay recorded
        self.rowErrors = otherJobs
Beispiel #4
0
    def writeAllRowErrors(self, jobId):
        """ Writes all recorded errors for one job to database

        Entries in self.rowErrors that belong to a different job are not
        written; they are kept in self.rowErrors so they can still be written
        by a later call for their own job. Entries for jobId are removed once
        the session has been committed.

        Args:
            jobId: ID to write errors for

        Returns:
            True if successful
        """
        # Entries skipped because they belong to another job
        otherJobs = {}
        for key, errorDict in self.rowErrors.items():
            # Set info for this error
            thisJob = errorDict["jobId"]
            if int(jobId) != int(thisJob):
                # This row is for a different job, keep it for later
                otherJobs[key] = errorDict
                continue
            fieldName = errorDict["fieldName"]
            rawType = errorDict["errorType"]
            try:
                # If the type is an int, it's one of our prestored messages
                errorType = int(rawType)
            except ValueError:
                # For rule failures, it will hold the error message
                errorId = self.getTypeId("rule_failed")
                ruleFailed = rawType
            else:
                # This happens if cast to int was successful
                errorString = ValidationError.getErrorTypeString(errorType)
                errorId = self.getTypeId(errorString)
                ruleFailed = ValidationError.getErrorMessage(errorType)

            # Create error data; only the type id and rule text differ
            # between prestored errors and rule failures
            errorRow = ErrorData(job_id=thisJob,
                                 filename=errorDict["filename"],
                                 field_name=fieldName,
                                 error_type_id=errorId,
                                 rule_failed=ruleFailed,
                                 occurrences=errorDict["numErrors"],
                                 first_row=errorDict["firstRow"])
            self.session.add(errorRow)

        # Commit the session to write all rows
        self.session.commit()
        # Drop only what was written; errors of other jobs stay recorded
        self.rowErrors = otherJobs
        return True
    def writeAllRowErrors(self, jobId):
        """ Writes all recorded errors for one job to database

        Entries in self.rowErrors that belong to a different job are not
        written; they are kept in self.rowErrors so they can still be written
        by a later call for their own job. Entries for jobId are removed once
        the session has been committed.

        Args:
            jobId: ID to write errors for

        Returns:
            True if successful
        """
        # Entries skipped because they belong to another job
        otherJobs = {}
        for key, errorDict in self.rowErrors.items():
            # Set info for this error
            thisJob = errorDict["jobId"]
            if int(jobId) != int(thisJob):
                # This row is for a different job, keep it for later
                otherJobs[key] = errorDict
                continue
            fieldName = errorDict["fieldName"]
            rawType = errorDict["errorType"]
            try:
                # If the type is an int, it's one of our prestored messages
                errorType = int(rawType)
            except ValueError:
                # For rule failures, it will hold the error message
                errorId = self.getTypeId("rule_failed")
                ruleFailed = rawType
            else:
                # This happens if cast to int was successful
                errorString = ValidationError.getErrorTypeString(errorType)
                errorId = self.getTypeId(errorString)
                ruleFailed = ValidationError.getErrorMessage(errorType)

            # Create error metadata; only the type id and rule text differ
            # between prestored errors and rule failures
            errorRow = ErrorMetadata(job_id=thisJob,
                                     filename=errorDict["filename"],
                                     field_name=fieldName,
                                     error_type_id=errorId,
                                     rule_failed=ruleFailed,
                                     occurrences=errorDict["numErrors"],
                                     first_row=errorDict["firstRow"])
            self.session.add(errorRow)

        # Commit the session to write all rows
        self.session.commit()
        # Drop only what was written; errors of other jobs stay recorded
        self.rowErrors = otherJobs
        return True
    def runValidation(self, jobId, interfaces):
        """ Run validations for specified job

        Reads the job's file record by record, inserts records that pass
        validation into a staging table, and writes every failure both to the
        error report and to the error database interface. The reader is
        closed in all cases; exceptions other than the ResponseExceptions
        handled per record propagate to the caller.

        Args:
            jobId: Job to be validated
            interfaces: Holder whose jobDb, errorDb, validationDb and
                stagingDb attributes are used here
        Returns:
            True if successful
        """
        jobTracker = interfaces.jobDb
        # Incremented before each read, so the first record read is row 2
        # (presumably row 1 is the header line -- confirm against the reader)
        rowNumber = 1
        fileType = jobTracker.getFileType(jobId)
        # If local, make the error report directory
        if(self.isLocal and not os.path.exists(self.directory)):
            os.makedirs(self.directory)
        # Get bucket name and file name
        fileName = jobTracker.getFileName(jobId)
        # Remembered on the instance; used below when recording row errors
        self.filename = fileName
        bucketName = CONFIG_BROKER['aws_bucket']
        regionName = CONFIG_BROKER['aws_region']

        errorFileName = self.getFileName(jobTracker.getReportPath(jobId))

        # Create File Status object
        interfaces.errorDb.createFileIfNeeded(jobId,fileName)

        # Schema and rules that apply to this file type
        validationDB = interfaces.validationDb
        fieldList = validationDB.getFieldsByFileList(fileType)
        csvSchema  = validationDB.getFieldsByFile(fileType)
        rules = validationDB.getRulesByFile(fileType)

        reader = self.getReader()

        # Get file size and write to jobs table
        # NOTE(review): the AWS branch measures "errors/" + report path while
        # the local branch measures the submitted file -- confirm intended
        if(CONFIG_BROKER["use_aws"]):
            fileSize =  s3UrlHandler.getFileSize("errors/"+jobTracker.getReportPath(jobId))
        else:
            fileSize = os.path.getsize(jobTracker.getFileName(jobId))
        jobTracker.setFileSizeById(jobId, fileSize)


        try:
            # Pull file
            reader.openFile(regionName, bucketName, fileName,fieldList,bucketName,errorFileName)
            # Look up the staging table name for this job

            tableName = interfaces.stagingDb.getTableName(jobId)
            # Create staging table
            tableObject = StagingTable(interfaces)
            tableObject.createTable(fileType,fileName,jobId,tableName)
            errorInterface = interfaces.errorDb

            # While not done, pull one row and put it into staging if it passes
            # the Validator
            with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer:
                while(not reader.isFinished):
                    rowNumber += 1
                    #if (rowNumber % 1000) == 0:
                    #    print("Validating row " + str(rowNumber))
                    try :
                        record = FieldCleaner.cleanRow(reader.getNextRecord(), fileType, validationDB)
                        record["row"] = rowNumber
                        # After adding "row", a record with fewer than two
                        # keys carried no data of its own
                        if(reader.isFinished and len(record) < 2):
                            # This is the last line and is empty, don't record an error
                            rowNumber -= 1 # Don't count this row
                            break
                    except ResponseException as e:
                        if reader.isFinished and reader.extraLine:
                            #Last line may be blank don't record an error, reader.extraLine indicates a case where the last valid line has extra line breaks
                            # Don't count last row if empty
                            rowNumber -= 1
                        else:
                            # Unreadable row: report it and flag the job
                            writer.write(["Formatting Error", ValidationError.readErrorMsg, str(rowNumber), ""])
                            errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.readError,rowNumber)
                            errorInterface.setRowErrorsPresent(jobId, True)
                        # Either way there is no record to validate
                        continue
                    valid, failures = Validator.validate(record,rules,csvSchema,fileType,interfaces)
                    if(valid) :
                        try:
                            tableObject.insert(record,fileType)
                        except ResponseException as e:
                            # Write failed, move to next record
                            writer.write(["Formatting Error", ValidationError.writeErrorMsg, str(rowNumber),""])
                            errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.writeError,rowNumber)
                            errorInterface.setRowErrorsPresent(jobId, True)
                            continue

                    else:
                        # For each failure, record it in error report and metadata
                        if failures:
                            errorInterface.setRowErrorsPresent(jobId, True)
                        for failure in failures:
                            # failure is indexed as (field name, error, failed value)
                            fieldName = failure[0]
                            error = failure[1]
                            failedValue = failure[2]
                            try:
                                # If error is an int, it's one of our prestored messages
                                errorType = int(error)
                                errorMsg = ValidationError.getErrorMessage(errorType)
                            except ValueError:
                                # If not, treat it literally
                                errorMsg = error
                            # The report gets the message text, the error
                            # interface gets the original error value
                            writer.write([fieldName,errorMsg,str(rowNumber),failedValue])
                            errorInterface.recordRowError(jobId,self.filename,fieldName,error,rowNumber)
                # Write unfinished batch
                writer.finishBatch()

            # Write number of rows to job table
            jobTracker.setNumberOfRowsById(jobId,rowNumber)
            # Write leftover records
            tableObject.endBatch()
            # Mark validation as finished in job tracker
            jobTracker.markJobStatus(jobId,"finished")
            # Persist the row errors recorded above for this job
            errorInterface.writeAllRowErrors(jobId)
        finally:
            #ensure the file always closes
            reader.close()
        return True
    def runValidation(self, jobId, interfaces):
        """ Run validations for specified job

        Reads the job's file record by record, inserts records that pass
        validation into a staging table, and writes every failure both to the
        error report and to the error database interface. The reader is
        closed in all cases; exceptions other than the ResponseExceptions
        handled per record propagate to the caller.

        Args:
            jobId: Job to be validated
            interfaces: Holder whose jobDb, errorDb, validationDb and
                stagingDb attributes are used here
        Returns:
            True if successful
        """
        jobTracker = interfaces.jobDb
        # Incremented before each read, so the first record read is row 2
        # (presumably row 1 is the header line -- confirm against the reader)
        rowNumber = 1
        fileType = jobTracker.getFileType(jobId)
        # If local, make the error report directory
        if (self.isLocal and not os.path.exists(self.directory)):
            os.makedirs(self.directory)
        # Get bucket name and file name
        fileName = jobTracker.getFileName(jobId)
        # Remembered on the instance; used below when recording row errors
        self.filename = fileName
        bucketName = CONFIG_BROKER['aws_bucket']
        regionName = CONFIG_BROKER['aws_region']

        errorFileName = self.getFileName(jobTracker.getReportPath(jobId))

        # Create File Status object
        interfaces.errorDb.createFileIfNeeded(jobId, fileName)

        # Schema and rules that apply to this file type
        validationDB = interfaces.validationDb
        fieldList = validationDB.getFieldsByFileList(fileType)
        csvSchema = validationDB.getFieldsByFile(fileType)
        rules = validationDB.getRulesByFile(fileType)

        reader = self.getReader()

        # Get file size and write to jobs table
        # NOTE(review): the AWS branch measures "errors/" + report path while
        # the local branch measures the submitted file -- confirm intended
        if (CONFIG_BROKER["use_aws"]):
            fileSize = s3UrlHandler.getFileSize(
                "errors/" + jobTracker.getReportPath(jobId))
        else:
            fileSize = os.path.getsize(jobTracker.getFileName(jobId))
        jobTracker.setFileSizeById(jobId, fileSize)

        try:
            # Pull file
            reader.openFile(regionName, bucketName, fileName, fieldList,
                            bucketName, errorFileName)
            # Look up the staging table name for this job

            tableName = interfaces.stagingDb.getTableName(jobId)
            # Create staging table
            tableObject = StagingTable(interfaces)
            tableObject.createTable(fileType, fileName, jobId, tableName)
            errorInterface = interfaces.errorDb

            # While not done, pull one row and put it into staging if it passes
            # the Validator
            with self.getWriter(regionName, bucketName, errorFileName,
                                self.reportHeaders) as writer:
                while (not reader.isFinished):
                    rowNumber += 1
                    #if (rowNumber % 1000) == 0:
                    #    print("Validating row " + str(rowNumber))
                    try:
                        record = FieldCleaner.cleanRow(reader.getNextRecord(),
                                                       fileType, validationDB)
                        record["row"] = rowNumber
                        # After adding "row", a record with fewer than two
                        # keys carried no data of its own
                        if (reader.isFinished and len(record) < 2):
                            # This is the last line and is empty, don't record an error
                            rowNumber -= 1  # Don't count this row
                            break
                    except ResponseException as e:
                        if reader.isFinished and reader.extraLine:
                            #Last line may be blank don't record an error, reader.extraLine indicates a case where the last valid line has extra line breaks
                            # Don't count last row if empty
                            rowNumber -= 1
                        else:
                            # Unreadable row: report it and flag the job
                            writer.write([
                                "Formatting Error",
                                ValidationError.readErrorMsg,
                                str(rowNumber), ""
                            ])
                            errorInterface.recordRowError(
                                jobId, self.filename, "Formatting Error",
                                ValidationError.readError, rowNumber)
                            errorInterface.setRowErrorsPresent(jobId, True)
                        # Either way there is no record to validate
                        continue
                    valid, failures = Validator.validate(
                        record, rules, csvSchema, fileType, interfaces)
                    if (valid):
                        try:
                            tableObject.insert(record, fileType)
                        except ResponseException as e:
                            # Write failed, move to next record
                            writer.write([
                                "Formatting Error",
                                ValidationError.writeErrorMsg,
                                str(rowNumber), ""
                            ])
                            errorInterface.recordRowError(
                                jobId, self.filename, "Formatting Error",
                                ValidationError.writeError, rowNumber)
                            errorInterface.setRowErrorsPresent(jobId, True)
                            continue

                    else:
                        # For each failure, record it in error report and metadata
                        if failures:
                            errorInterface.setRowErrorsPresent(jobId, True)
                        for failure in failures:
                            # failure is indexed as (field name, error, failed value)
                            fieldName = failure[0]
                            error = failure[1]
                            failedValue = failure[2]
                            try:
                                # If error is an int, it's one of our prestored messages
                                errorType = int(error)
                                errorMsg = ValidationError.getErrorMessage(
                                    errorType)
                            except ValueError:
                                # If not, treat it literally
                                errorMsg = error
                            # The report gets the message text, the error
                            # interface gets the original error value
                            writer.write([
                                fieldName, errorMsg,
                                str(rowNumber), failedValue
                            ])
                            errorInterface.recordRowError(
                                jobId, self.filename, fieldName, error,
                                rowNumber)
                # Write unfinished batch
                writer.finishBatch()

            # Write number of rows to job table
            jobTracker.setNumberOfRowsById(jobId, rowNumber)
            # Write leftover records
            tableObject.endBatch()
            # Mark validation as finished in job tracker
            jobTracker.markJobStatus(jobId, "finished")
            # Persist the row errors recorded above for this job
            errorInterface.writeAllRowErrors(jobId)
        finally:
            #ensure the file always closes
            reader.close()
        return True
Beispiel #8
0
    def runSqlValidations(self, interfaces, jobId, fileType, shortColnames,
                          writer, warningWriter, rowNumber):
        """ Run every SQL rule for this file type and report the failures

        Args:
            interfaces: InterfaceHolder object (errorDb, jobDb and
                validationDb are used)
            jobId: ID of current job
            fileType: Type of file for current job
            shortColnames: Dict mapping short field names to long
            writer: CsvWriter object that receives fatal failures
            warningWriter: CsvWriter that receives warning failures
            rowNumber: Current row number, passed on to recordRowError

        Returns:
            a list of the row numbers that failed one of the fatal sql-based
            validations
        """
        errorDb = interfaces.errorDb
        failedRows = []
        submissionId = interfaces.jobDb.getSubmissionId(jobId)
        for result in Validator.validateFileBySql(submissionId, fileType,
                                                  interfaces):
            # The SQL queries use short, machine friendly column names;
            # switch back to the long name when one is known
            shortName = result[0]
            longName = shortColnames.get(shortName, shortName)
            rawError = result[1]
            badValue = result[2]
            failedRow = result[3]
            ruleLabel = result[4]
            sourceFileTypeId = result[5]
            targetFileTypeId = result[6]
            severity = result[7]

            # Only fatal failures count towards the returned rows
            if severity == interfaces.validationDb.getRuleSeverityId("fatal"):
                failedRows.append(failedRow)

            try:
                # An int-like error refers to one of the prestored messages
                message = ValidationError.getErrorMessage(int(rawError))
            except ValueError:
                # Anything else is already the message text
                message = rawError

            reportRow = [
                longName, message,
                str(failedRow), badValue, ruleLabel
            ]
            if severity == interfaces.validationDb.getRuleSeverityId("fatal"):
                writer.write(reportRow)
            elif severity == interfaces.validationDb.getRuleSeverityId(
                    "warning"):
                # Warnings go to their own report file
                warningWriter.write(reportRow)

            # NOTE(review): the reports above use the failing row, while this
            # call records rowNumber -- confirm that is intended
            errorDb.recordRowError(jobId,
                                   self.filename,
                                   longName,
                                   rawError,
                                   rowNumber,
                                   ruleLabel,
                                   file_type_id=sourceFileTypeId,
                                   target_file_id=targetFileTypeId,
                                   severity_id=severity)
        return failedRows
    def runSqlValidations(self, job, file_type, short_colnames, writer,
                          warning_writer, row_number, error_list):
        """ Run every SQL rule for this file type and report the failures

        Args:
            job: Current job (job_id, submission_id and filename are read)
            file_type: Type of file for current job
            short_colnames: Dict mapping short field names to long
            writer: CsvWriter object that receives fatal failures
            warning_writer: CsvWriter that receives warning failures
            row_number: Current row number, passed on to recordRowError
            error_list: instance of ErrorInterface to keep track of errors

        Returns:
            a list of the row numbers that failed one of the fatal sql-based
            validations
        """
        # NOTE(review): sess is never used below; the lookup is kept because
        # GlobalDB.db() may have side effects -- confirm before removing
        sess = GlobalDB.db().session
        current_job_id = job.job_id
        failed_rows = []
        for result in Validator.validateFileBySql(job.submission_id,
                                                  file_type,
                                                  self.short_to_long_dict):
            # The SQL queries use short, machine friendly column names;
            # switch back to the long name when one is known
            short_name = result[0]
            long_name = short_colnames.get(short_name, short_name)
            raw_error = result[1]
            bad_value = result[2]
            failed_row = result[3]
            rule_label = result[4]
            source_file_type_id = result[5]
            target_file_type_id = result[6]
            severity = result[7]

            # Only fatal failures count towards the returned rows
            if severity == RULE_SEVERITY_DICT['fatal']:
                failed_rows.append(failed_row)

            try:
                # An int-like error refers to one of the prestored messages
                message = ValidationError.getErrorMessage(int(raw_error))
            except ValueError:
                # Anything else is already the message text
                message = raw_error

            report_row = [
                long_name, message,
                str(failed_row), bad_value, rule_label
            ]
            if severity == RULE_SEVERITY_DICT['fatal']:
                writer.write(report_row)
            elif severity == RULE_SEVERITY_DICT['warning']:
                # Warnings go to their own report file
                warning_writer.write(report_row)

            # NOTE(review): the reports above use the failing row, while this
            # call records row_number -- confirm that is intended
            error_list.recordRowError(current_job_id,
                                      job.filename,
                                      long_name,
                                      raw_error,
                                      row_number,
                                      rule_label,
                                      file_type_id=source_file_type_id,
                                      target_file_id=target_file_type_id,
                                      severity_id=severity)
        return failed_rows