def validate_threaded():
    """Start the validation process on a new thread.

    Looks up the job ID from the flask request, verifies the job is ready
    to run, marks it "running" in the job tracker, then kicks off the real
    validation work on a background thread.  Returns a JsonResponse
    describing success or the specific failure encountered.
    """
    @copy_current_request_context
    def ThreadedFunction(arg):
        """The new thread."""
        threadedManager = ValidationManager(local, error_report_path)
        threadedManager.threadedValidateJob(arg)

    try:
        interfaces = InterfaceHolder()
        jobTracker = interfaces.jobDb
    except ResponseException as e:
        # BUG FIX: use "with" so the error-log handle is always closed
        # (the original open(...).write(...) leaked file handles).
        with open("errorLog", "a") as errorLog:
            errorLog.write(str(e) + "\n")
        return JsonResponse.error(e, e.status, table="cannot connect to job database")
    except Exception as e:
        with open("errorLog", "a") as errorLog:
            errorLog.write(str(e) + "\n")
        exc = ResponseException(str(e), StatusCode.INTERNAL_ERROR, type(e))
        return JsonResponse.error(exc, exc.status, table="cannot connect to job database")

    jobId = None
    manager = ValidationManager(local, error_report_path)

    try:
        jobId = manager.getJobID(request)
    except ResponseException as e:
        # NOTE(review): jobId is still None at this point, so markJob is
        # called with jobId=None (preserves original behavior) -- confirm
        # markJob tolerates a missing job ID.
        manager.markJob(jobId, jobTracker, "invalid", interfaces.errorDb, manager.filename)
        CloudLogger.logError(str(e), e, traceback.extract_tb(sys.exc_info()[2]))
        return JsonResponse.error(e, e.status, table="")
    except Exception as e:
        exc = ResponseException(str(e), StatusCode.CLIENT_ERROR, type(e))
        manager.markJob(jobId, jobTracker, "invalid", interfaces.errorDb, manager.filename)
        CloudLogger.logError(str(e), exc, traceback.extract_tb(sys.exc_info()[2]))
        return JsonResponse.error(exc, exc.status, table="")

    try:
        manager.testJobID(jobId, interfaces)
    except ResponseException as e:
        with open("errorLog", "a") as errorLog:
            errorLog.write(str(e) + "\n")
        # Job is not ready to run according to job tracker, do not change
        # status of job in job tracker
        interfaces.errorDb.writeFileError(jobId, manager.filename, ValidationError.jobError)
        return JsonResponse.error(e, e.status, table="")
    except Exception as e:
        with open("errorLog", "a") as errorLog:
            errorLog.write(str(e) + "\n")
        exc = ResponseException(str(e), StatusCode.CLIENT_ERROR, type(e))
        interfaces.errorDb.writeFileError(jobId, manager.filename, ValidationError.jobError)
        return JsonResponse.error(exc, exc.status, table="")

    thread = Thread(target=ThreadedFunction, args=(jobId,))

    try:
        # Mark the job as running before releasing our own connections and
        # starting the worker thread (which builds its own connections).
        jobTracker.markJobStatus(jobId, "running")
    except Exception as e:
        with open("errorLog", "a") as errorLog:
            errorLog.write(str(e) + "\n")
        exc = ResponseException(str(e), StatusCode.INTERNAL_ERROR, type(e))
        return JsonResponse.error(exc, exc.status, table="could not start job")

    interfaces.close()
    thread.start()
    return JsonResponse.create(StatusCode.OK, {"table": "job" + str(jobId)})
def validate():
    """Start the validation process on the same thread.

    Creates database sessions for this route, delegates the work to the
    validation manager, and guarantees the sessions are closed afterwards.
    """
    interfaces = InterfaceHolder()  # Create sessions for this route
    try:
        # NOTE(review): validationManager is not defined in this function;
        # it is presumably a module-level ValidationManager instance --
        # confirm it exists at module scope.
        return validationManager.validateJob(request, interfaces)
    except Exception as e:
        # Something went wrong getting the flask request
        # BUG FIX: use "with" so the error-log handle is always closed
        # (the original open(...).write(...) leaked file handles).
        with open("errorLog", "a") as errorLog:
            errorLog.write(str(e) + "\n")
        exc = ResponseException(str(e), StatusCode.INTERNAL_ERROR, type(e))
        return JsonResponse.error(exc, exc.status, table="")
    finally:
        interfaces.close()
def threadedValidateJob(self, jobId):
    """Run a validation job on this (new) thread.

    Args:
        jobId -- (Integer) a valid jobId

    This method runs on a new thread, so no error messages reach the
    caller other than the job status being updated in the tracker.
    """
    # As this is the start of a new thread, first generate new connections
    # to the databases
    interfaces = InterfaceHolder()
    self.filename = ""
    jobTracker = interfaces.jobDb
    errorDb = interfaces.errorDb
    try:
        jobType = interfaces.jobDb.checkJobType(jobId)
        if jobType == interfaces.jobDb.getJobTypeId("csv_record_validation"):
            self.runValidation(jobId, interfaces)
        elif jobType == interfaces.jobDb.getJobTypeId("validation"):
            self.runCrossValidation(jobId, interfaces)
        else:
            raise ResponseException("Bad job type for validator",
                                    StatusCode.INTERNAL_ERROR)
        # BUG FIX: removed a stray unconditional self.runValidation(jobId,
        # interfaces) call that followed this branch; it re-ran single-file
        # validation a second time (and ran it after cross-validation too).
        errorDb.markFileComplete(jobId, self.filename)
        return
    except ResponseException as e:
        CloudLogger.logError(str(e), e, traceback.extract_tb(sys.exc_info()[2]))
        self.markJob(jobId, jobTracker, "invalid", errorDb, self.filename,
                     e.errorType, e.extraInfo)
    except ValueError as e:
        CloudLogger.logError(str(e), e, traceback.extract_tb(sys.exc_info()[2]))
        self.markJob(jobId, jobTracker, "invalid", errorDb, self.filename,
                     ValidationError.unknownError)
    except Exception as e:
        # Something unknown happened, we may need to try again!
        CloudLogger.logError(str(e), e, traceback.extract_tb(sys.exc_info()[2]))
        self.markJob(jobId, jobTracker, "failed", errorDb, self.filename,
                     ValidationError.unknownError)
    finally:
        interfaces.close()
def setUpClass(cls):
    """Set up resources to be shared within a test class"""
    # TODO: refactor into a pytest class fixtures and inject as necessary
    # Point the application's db config at randomized per-suite test
    # databases so unittests never touch real data.
    suite = cls.__name__.lower()
    config = dataactcore.config.CONFIG_DB
    cls.num = randint(1, 9999)
    db_suffixes = (
        ('error_db_name', 'error_data'),
        ('job_db_name', 'job_tracker'),
        ('user_db_name', 'user_manager'),
        ('validator_db_name', 'validator'),
        ('staging_db_name', 'staging'),
    )
    for config_key, suffix in db_suffixes:
        config[config_key] = 'unittest{}_{}_{}'.format(cls.num, suite, suffix)
    dataactcore.config.CONFIG_DB = config

    app = createApp()
    app.config['TESTING'] = True
    cls.app = TestApp(app)

    # Allow us to augment default test failure msg w/ more detail
    cls.longMessage = True
    # Flag for each route call to launch a new thread
    cls.useThreads = False
    # Upload files to S3 (False = skip re-uploading on subsequent runs)
    cls.uploadFiles = True
    # Run tests for local broker or not
    cls.local = CONFIG_BROKER['local']
    # This needs to be set to the local directory for error reports if local is True
    cls.local_file_directory = CONFIG_SERVICES['error_report_path']

    # suppress INFO-level logging from Alembic migrations
    logging.disable(logging.WARN)
    # drop and re-create the test job tracker, error, staging, and
    # validation databases/tables
    setupJobTrackerDB()
    setupErrorDB()
    setupStagingDB()
    setupValidationDB(True)
    # reset logging defaults
    logging.disable(logging.NOTSET)

    cls.interfaces = InterfaceHolder()
    cls.jobTracker = cls.interfaces.jobDb
    cls.stagingDb = cls.interfaces.stagingDb
    cls.errorInterface = cls.interfaces.errorDb
    cls.validationDb = cls.interfaces.validationDb
    cls.userId = 1
def threadedValidateJob(self, jobId):
    """Run a validation job on this (new) thread.

    Args:
        jobId -- (Integer) a valid jobId

    This method runs on a new thread, so no error messages reach the
    caller other than the job status being updated in the tracker.
    """
    # As this is the start of a new thread, first generate new connections
    # to the databases
    interfaces = InterfaceHolder()
    self.filename = ""
    jobTracker = interfaces.jobDb
    errorDb = interfaces.errorDb
    try:
        jobType = interfaces.jobDb.checkJobType(jobId)
        if jobType == interfaces.jobDb.getJobTypeId("csv_record_validation"):
            self.runValidation(jobId, interfaces)
        elif jobType == interfaces.jobDb.getJobTypeId("validation"):
            self.runCrossValidation(jobId, interfaces)
        else:
            raise ResponseException("Bad job type for validator",
                                    StatusCode.INTERNAL_ERROR)
        # BUG FIX: removed a stray unconditional self.runValidation(jobId,
        # interfaces) call that followed this branch; it re-ran single-file
        # validation a second time (and ran it after cross-validation too).
        errorDb.markFileComplete(jobId, self.filename)
        return
    except ResponseException as e:
        CloudLogger.logError(str(e), e, traceback.extract_tb(sys.exc_info()[2]))
        self.markJob(jobId, jobTracker, "invalid", errorDb, self.filename,
                     e.errorType, e.extraInfo)
    except ValueError as e:
        CloudLogger.logError(str(e), e, traceback.extract_tb(sys.exc_info()[2]))
        self.markJob(jobId, jobTracker, "invalid", errorDb, self.filename,
                     ValidationError.unknownError)
    except Exception as e:
        # Something unknown happened, we may need to try again!
        CloudLogger.logError(str(e), e, traceback.extract_tb(sys.exc_info()[2]))
        self.markJob(jobId, jobTracker, "failed", errorDb, self.filename,
                     ValidationError.unknownError)
    finally:
        interfaces.close()
def evaluateCrossFileRule(cls, rule, submissionId, record = None):
    """ Evaluate specified rule against all records to which it applies

    Args:
        rule -- rule object (multi_field_rule_type, file_type, rule_text_1/2)
        submissionId -- submission whose staging tables are checked
        record -- optional single record (dict or staging row) to restrict
            the evaluation to; required for "greater" rules

    Returns:
        Tuple (rulePassed, failures): rulePassed is False if any record
        failed; failures is a list of [fields, description, values] entries.
    """
    failures = []  # Can get multiple failures for these rule types
    rulePassed = True  # Set to false on first failure
    # Get rule type
    ruleType = rule.multi_field_rule_type.name.lower()
    fileType = rule.file_type.name
    interfaces = InterfaceHolder()
    try:
        stagingDb = interfaces.stagingDb
        if ruleType == "field_match":
            targetType = rule.rule_text_2
            # Get ORM objects for source and target staging tables
            sourceTable = cls.getTable(submissionId, fileType, stagingDb)
            targetTable = cls.getTable(submissionId, targetType, stagingDb)
            # TODO Could try to do a join and see what doesn't match, or otherwise improve performance by avoiding a
            # TODO new query against second table for every record in first table, possibly index second table at start
            # Can apply rule to a specified record or all records in first table
            sourceRecords = cls.getRecordsIfNone(sourceTable, stagingDb, record)
            fieldsToCheck = cls.cleanSplit(rule.rule_text_1, True)
            # For each entry, check for the presence of matching values in second table
            for thisRecord in sourceRecords:
                # Build query to filter for each field to match
                matchDict = {}
                query = stagingDb.session.query(targetTable)
                for field in fieldsToCheck:
                    # Have to get file column IDs for source and target tables
                    targetColId = interfaces.validationDb.getColumnId(field, targetType)
                    if isinstance(thisRecord, dict):
                        matchDict[field] = thisRecord[str(field)]
                    else:
                        sourceColId = interfaces.validationDb.getColumnId(field, fileType)
                        matchDict[field] = getattr(thisRecord, str(sourceColId))
                    query = query.filter(getattr(targetTable.c, str(targetColId)) == matchDict[field])
                # Make sure at least one record in target table matches
                if not query.first():
                    # Fields don't match target file, add to failures
                    rulePassed = False
                    dictString = str(matchDict)[1:-1]  # Remove braces
                    failures.append([", ".join(fieldsToCheck), rule.description, dictString])
        elif ruleType == "rule_if":
            # Get all records from source table
            sourceTable = cls.getTable(submissionId, fileType, stagingDb)
            colNames = []
            for column in sourceTable.columns:
                try:
                    int(column.name)
                except ValueError:
                    # Each staging table has a primary key field that is not an int, just include this directly
                    colNames.append(column.name)
                else:
                    # If it is an int, treat it as a column id
                    colNames.append(interfaces.validationDb.getFieldNameByColId(column.name))
            # Can apply rule to a specified record or all records in first table
            sourceRecords = cls.getRecordsIfNone(sourceTable, stagingDb, record)
            # Get both rules, condition to check and rule to apply based on condition
            condition = interfaces.validationDb.getMultiFieldRuleByLabel(rule.rule_text_2)
            conditionalRule = interfaces.validationDb.getMultiFieldRuleByLabel(rule.rule_text_1)
            # Apply first rule for all records that pass second rule.
            # BUG FIX: loop variable renamed from "record", which shadowed
            # the record parameter of this function.
            for sourceRecord in sourceRecords:
                # Record is a tuple, we need it to be a dict with field names as keys
                recordDict = dict(zip(colNames, list(sourceRecord)))
                if cls.evaluateCrossFileRule(condition, submissionId, recordDict)[0]:
                    result = cls.evaluateCrossFileRule(conditionalRule, submissionId, recordDict)
                    if not result[0]:
                        # Record that we have seen a failure
                        rulePassed = False
                        failures.extend(result[1])
        elif ruleType == "greater":
            if not record:
                # Must provide a record for this rule
                raise ValueError("Cannot apply greater rule without a record")
            rulePassed = int(record[rule.rule_text_2]) > int(rule.rule_text_1)
        return rulePassed, failures
    finally:
        # BUG FIX: close the per-call database connections; the original
        # never closed this InterfaceHolder, leaking sessions on every
        # evaluation (and once per record via the recursive calls above).
        interfaces.close()