def checkType(self, meta_attribute_definition: dict, meta_attribute_key: str, value: str, primary_key_value: str):
    # field type check
    is_valid_type = True

    if (MetaUtils.exists(meta_attribute_definition, "Type")):
        # if a default value has been specified then ignore the type check if the value matches the default
        if (MetaUtils.exists(meta_attribute_definition, "Default")):
            if (value == meta_attribute_definition["Default"]):
                is_valid_type = False

        if (meta_attribute_definition["Type"] in ["int", "integer"]):
            if ((MetaUtils.isBlankOrNull(value)) or (not MetaUtils.isInt(value))):
                if (not MetaUtils.isAllowBlank(meta_attribute_definition)):
                    self.addDataQualityError(DataQualityError(
                        meta_attribute_key,
                        error_dimension=DataQualityDimension.METADATACOMPLIANCE.value,
                        description="Error: Value '" + value + "' is not an int. An int was expected",
                        primary_key_value=primary_key_value))
                    is_valid_type = False
        elif (meta_attribute_definition["Type"] in ["float", "number"]):
            if ((MetaUtils.isBlankOrNull(value)) or (not MetaUtils.isFloat(value))):
                if (not MetaUtils.isAllowBlank(meta_attribute_definition)):
                    self.addDataQualityError(DataQualityError(
                        meta_attribute_key,
                        error_dimension=DataQualityDimension.METADATACOMPLIANCE.value,
                        description="Error: Value '" + value + "' is not a float. A float was expected",
                        primary_key_value=primary_key_value))
                    is_valid_type = False
        elif (meta_attribute_definition["Type"] in ["bool", "boolean"]):
            if ((MetaUtils.isBlankOrNull(value)) or (not value.lower() in ["false", "true", "f", "t", "n", "y", "no", "yes", "0", "1"])):
                if (not MetaUtils.isAllowBlank(meta_attribute_definition)):
                    self.addDataQualityError(DataQualityError(
                        meta_attribute_key,
                        error_dimension=DataQualityDimension.METADATACOMPLIANCE.value,
                        description="Error: Value '" + value + "' is not a boolean. A boolean was expected",
                        primary_key_value=primary_key_value))
                    is_valid_type = False

        # given that min and max checks only apply to int and float values we may as well test for them now
        if (is_valid_type):
            self.checkMinMax(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
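# Illustrative sketch (not part of the validator): the shape of a metadata entry that
# checkType acts on, using only the keys referenced above ("Type", "Default"); the
# instance name `validator` and the attribute key "age" are hypothetical.
#
#   age_definition = {"Type": "int", "Default": "-1"}
#
#   validator.checkType(age_definition, "age", "abc", "Row: 7")
#       -> records a METADATACOMPLIANCE DataQualityError ("'abc' is not an int") and skips checkMinMax
#   validator.checkType(age_definition, "age", "-1", "Row: 8")
#       -> "-1" matches the Default, so is_valid_type is cleared and checkMinMax is not called
#   validator.checkType(age_definition, "age", "42", "Row: 9")
#       -> valid int, so checkMinMax is invoked to apply any range rules in the definition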
def checkMandatory(self, meta_attribute_definition: dict, meta_attribute_key: str, value: str, primary_key_value: str):
    # mandatory field check
    if (MetaUtils.isTrue(meta_attribute_definition, "Mandatory")):
        if ((MetaUtils.isBlankOrNull(value)) and (not MetaUtils.isAllowBlank(meta_attribute_definition))):
            self.addDataQualityError(DataQualityError(
                meta_attribute_key,
                error_dimension=DataQualityDimension.COMPLETENESSMANDATORY.value,
                description="Error: Mandatory field is BLANK or NULL. A value is required.",
                primary_key_value=primary_key_value))
    else:
        # optional field check. According to LANG, optional fields should contain some sort of default
        # value, i.e. no field should ever be blank or NULL.
        if ((MetaUtils.isBlankOrNull(value)) and (not MetaUtils.isAllowBlank(meta_attribute_definition))):
            self.addDataQualityError(DataQualityError(
                meta_attribute_key,
                error_dimension=DataQualityDimension.COMPLETENESSOPTIONAL.value,
                description="Error: Optional field is BLANK or NULL. A default value is required.",
                primary_key_value=primary_key_value))
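# Illustrative sketch (the truthy representation for "Mandatory" is assumed to be
# whatever MetaUtils.isTrue accepts; the instance name `validator` is hypothetical):
#
#   validator.checkMandatory({"Mandatory": "True"}, "surname", "", "Row: 3")
#       -> blank mandatory value: records a COMPLETENESSMANDATORY error
#   validator.checkMandatory({"Mandatory": "False"}, "nickname", "", "Row: 3")
#       -> blank optional value: records a COMPLETENESSOPTIONAL error (a default value is expected)
#
# Either error is suppressed when the definition allows blanks (MetaUtils.isAllowBlank).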
def validate(self: object, customValidator: str = None):
    """ Validate a resultset against predefined metadata based on the LANG rules of data quality. """
    if (self.metadata is None):
        raise ValidationError("LANG Exception: meta-data has not been set", None)
    elif (self.dataset is None):
        raise ValidationError("LANG Exception: resultset has not been set", None)

    """ Change request: find and output the primary key in the error report file if specified """
    primary_key = ""
    primary_key_values = None

    for key, item in self.metadata.items():
        if (MetaUtils.isTrue(item, "PrimaryKey")):
            primary_key = key
            primary_key_values = self.dataset[primary_key]
            break

    """
    Execute a series of validations against the supplied column of data and the metadata for the column.
    Which validation is run is determined by entries in the metadata.
    """
    for meta_attribute_key, meta_attribute_definition in self.metadata.items():
        if (meta_attribute_key in self.dataset):
            print("Validating attribute \t'" + meta_attribute_key + "'...", end='\r')
            attribute = self.dataset[meta_attribute_key]

            for row_count in range(len(attribute)):
                value = attribute[row_count]

                """
                If a primary key tag has been found then output the value so that the user has a reference
                to search for the record in the source system. If there is no primary key attribute set
                then output the row count.
                """
                if (not primary_key_values is None):
                    primary_key_value = primary_key_values[row_count]
                else:
                    primary_key_value = "Row: " + str(row_count+1)

                self.checkMandatory(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkSize(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkType(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkEnum(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkStartsWith(meta_attribute_definition, meta_attribute_key, value, primary_key_value)

            # format check (must provide a regex)
            if (MetaUtils.exists(meta_attribute_definition, "Format")):
                re.purge()
                regex = re.compile(meta_attribute_definition["Format"])

                for row_count in range(len(attribute)):
                    value = attribute[row_count]

                    if (not primary_key_values is None):
                        primary_key_value = primary_key_values[row_count]
                    else:
                        primary_key_value = "Row: " + str(row_count+1)

                    isMatch = (not regex.match(value) is None)

                    if ((not isMatch) and (not MetaUtils.isAllowBlank(meta_attribute_definition))):
                        self.addDataQualityError(DataQualityError(
                            meta_attribute_key,
                            error_dimension=DataQualityDimension.FORMATCONSISTENCY.value,
                            description="Error: Value '" + value + "' does not match regex #'" + meta_attribute_definition["Format"] + "'",
                            primary_key_value=primary_key_value))

            # unique field check
            if (MetaUtils.isTrue(meta_attribute_definition, "Unique")):
                # track the values already seen in the column; only the first occurrence of a value is
                # accepted, any repeat is flagged as a uniqueness error
                seen = set()

                for row_count in range(len(attribute)):
                    value = attribute[row_count]

                    if (not primary_key_values is None):
                        primary_key_value = primary_key_values[row_count]
                    else:
                        primary_key_value = "Row: " + str(row_count+1)

                    if (not value in seen):
                        seen.add(value)  # only process a value once
                    else:
                        self.addDataQualityError(DataQualityError(
                            meta_attribute_key,
                            error_dimension=DataQualityDimension.UNIQUENESS.value,
                            description="Error: Value '" + value + "' is not UNIQUE. A unique value was expected.",
                            primary_key_value=primary_key_value))

            self.checkComposite(meta_attribute_definition, meta_attribute_key)

            # expression evaluation is different to processing field specific validations as it could
            # link in other columns from the resultset
            self.evaluateExpression(meta_attribute_definition, meta_attribute_key)

            print("Validating attribute \t'" + meta_attribute_key + "'...\t\t..Complete.")
        else:
            self.addDataQualityError(DataQualityError(
                meta_attribute_key,
                error_dimension=DataQualityDimension.METADATACOMPLIANCE.value,
                description="Error: Attribute '" + meta_attribute_key + "' was not found in the dataset."))

    # only invoke the custom validator if one has been provided
    if (not customValidator is None and len(customValidator) > 0):
        self.customValidator(customValidator)
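# Illustrative sketch of driving a full validation run; how the validator instance is
# constructed and the exact truthy representation are assumptions, while the metadata
# keys ("PrimaryKey", "Unique", "Type", "Mandatory", "Format") come from the checks above.
#
#   validator.metadata = {
#       "customer_id": {"PrimaryKey": "True", "Unique": "True", "Type": "int", "Mandatory": "True"},
#       "email":       {"Mandatory": "True", "Format": r"^[^@]+@[^@]+$"},
#   }
#   validator.dataset = {
#       "customer_id": ["1", "2", "2"],             # duplicate "2" -> UNIQUENESS error
#       "email":       ["a@example.com", "", "x"],  # blank -> COMPLETENESSMANDATORY, "x" -> FORMATCONSISTENCY
#   }
#   validator.validate()                 # errors accumulate via addDataQualityError()
#   validator.validate("my_validator")   # additionally invokes self.customValidator("my_validator")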