Ejemplo n.º 1
0
    def checkType(self, meta_attribute_definition:dict, meta_attribute_key:str, value:str, primary_key_value:str):
        """
        Check a single value against the "Type" declared in its metadata.

        Recognised type names are "int"/"integer", "float"/"number" and
        "bool"/"boolean". A value that is blank/null or does not conform to the
        declared type is recorded as a METADATACOMPLIANCE error via
        ``self.addDataQualityError`` unless the definition allows blanks.
        When no type error was recorded, the value is handed on to
        ``self.checkMinMax``.

        Args:
            meta_attribute_definition: metadata entry for the attribute; the
                keys read here are "Type" and "Default".
            meta_attribute_key: attribute name, recorded on any error raised.
            value: the cell value to check.
            primary_key_value: record reference stored on any error raised.
        """
        # Nothing to check when the metadata declares no type.
        if not MetaUtils.exists(meta_attribute_definition, "Type"):
            return

        # A value equal to the declared default is accepted as-is: it is exempt
        # from the type check (and therefore from the min/max check as well).
        # Previously only the min/max check was skipped and a default such as
        # "N/A" on an int column was still reported as a type error.
        if (MetaUtils.exists(meta_attribute_definition, "Default")
                and value == meta_attribute_definition["Default"]):
            return

        declared_type = meta_attribute_definition["Type"]
        conforms = None
        expectation = None

        if declared_type in ("int", "integer"):
            conforms = MetaUtils.isInt
            expectation = "is not an int. An int was expected"
        elif declared_type in ("float", "number"):
            conforms = MetaUtils.isFloat
            expectation = "is not a float. A float was expected"
        elif declared_type in ("bool", "boolean"):
            # str() keeps the check from crashing on a non-string value.
            conforms = lambda candidate: str(candidate).lower() in ("false", "true", "f", "t", "n", "y", "no", "yes", "0", "1")
            expectation = "is not a boolean. A boolean was expected"

        is_valid_type = True

        # Unrecognised type names get no type check and fall through to min/max.
        if conforms is not None:
            if MetaUtils.isBlankOrNull(value) or not conforms(value):
                if not MetaUtils.isAllowBlank(meta_attribute_definition):
                    # The f-string renders None safely; "+" concatenation
                    # raised TypeError for a null value.
                    self.addDataQualityError(DataQualityError(meta_attribute_key,error_dimension=DataQualityDimension.METADATACOMPLIANCE.value, description=f"Error: Value '{value}' {expectation}",primary_key_value=primary_key_value))
                    is_valid_type = False

        # given that min and max checks only apply to int and float values we may as well test for them now
        if is_valid_type:
            self.checkMinMax(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
Ejemplo n.º 2
0
 def checkMandatory(self, meta_attribute_definition: dict,
                    meta_attribute_key: str, value: str,
                    primary_key_value: str):
     """
     Record a completeness error when a value is blank or null.

     Fields flagged "Mandatory" in the metadata are reported under the
     COMPLETENESSMANDATORY dimension; all other fields are reported under
     COMPLETENESSOPTIONAL, because according to LANG an optional field
     should still carry some default value rather than be blank or NULL.
     Nothing is recorded when the value is populated or when the
     definition allows blanks.
     """
     is_mandatory = MetaUtils.isTrue(meta_attribute_definition, "Mandatory")

     # Populated values, and definitions that permit blanks, never fail.
     if not MetaUtils.isBlankOrNull(value):
         return
     if MetaUtils.isAllowBlank(meta_attribute_definition):
         return

     if is_mandatory:
         dimension = DataQualityDimension.COMPLETENESSMANDATORY.value
         message = "Error: Mandatory field is BLANK or NULL. A value is required."
     else:
         dimension = DataQualityDimension.COMPLETENESSOPTIONAL.value
         message = "Error: Optional field is BLANK or NULL. A default value is required."

     problem = DataQualityError(meta_attribute_key,
                                error_dimension=dimension,
                                description=message,
                                primary_key_value=primary_key_value)
     self.addDataQualityError(problem)
Ejemplo n.º 3
0
    def validate(self:object, customValidator:str=None):
        """
        Validate a resultset against predefined metadata based on the LANG rules of data quality.

        For every attribute named in ``self.metadata`` that is present in
        ``self.dataset`` the per-value checks (mandatory, size, type, enum,
        starts-with) are run, followed by the column-level format, uniqueness,
        composite and expression checks. Attributes missing from the dataset
        are recorded as METADATACOMPLIANCE errors. Findings are collected
        through ``self.addDataQualityError``; nothing is returned.

        Args:
            customValidator: optional custom validator reference; when it is
                not None and not empty it is passed to ``self.customValidator``
                after all attributes have been processed.

        Raises:
            ValidationError: if the metadata or the dataset has not been set.
        """
        if self.metadata is None:
            raise ValidationError("LANG Exception: meta-data has not been set", None)
        elif self.dataset is None:
            raise ValidationError("LANG Exception: resultset has not been set", None)

        # Change request: find the primary key column (if one is flagged) so
        # its values can be written to the error report.
        primary_key_values = None

        for key, item in self.metadata.items():
            if MetaUtils.isTrue(item, "PrimaryKey"):
                # A primary key column absent from the dataset is reported as
                # a missing attribute by the main loop below; fall back to row
                # numbers here instead of failing on the lookup.
                if key in self.dataset:
                    primary_key_values = self.dataset[key]
                break

        def row_reference(row_count):
            # Give the user something to search for in the source system: the
            # primary key value when a primary key column is available,
            # otherwise the 1-based row number.
            if primary_key_values is not None:
                return primary_key_values[row_count]
            return "Row: " + str(row_count + 1)

        # Execute a series of validations against each column of data and the
        # metadata for that column. Which validations run is determined by the
        # entries in the metadata.
        for meta_attribute_key, meta_attribute_definition in self.metadata.items():
            if meta_attribute_key not in self.dataset:
                self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.METADATACOMPLIANCE.value, description="Error: Attribute '" + meta_attribute_key + "' was not found in the dataset."))
                continue

            print("Validating attribute \t'" + meta_attribute_key + "'...", end='\r')

            attribute = self.dataset[meta_attribute_key]

            # NOTE: rows are addressed by index (not iterated directly) because
            # the column container type is not visible from this method.
            for row_count in range(len(attribute)):
                value = attribute[row_count]
                primary_key_value = row_reference(row_count)

                self.checkMandatory(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkSize(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkType(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkEnum(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkStartsWith(meta_attribute_definition, meta_attribute_key, value, primary_key_value)

            # format check (must provide a regex)
            if MetaUtils.exists(meta_attribute_definition, "Format"):
                pattern = meta_attribute_definition["Format"]
                re.purge()
                regex = re.compile(pattern)

                for row_count in range(len(attribute)):
                    value = attribute[row_count]

                    # A null value cannot match; regex.match(None) would raise.
                    is_match = value is not None and regex.match(value) is not None

                    if (not is_match) and (not MetaUtils.isAllowBlank(meta_attribute_definition)):
                        self.addDataQualityError(DataQualityError(meta_attribute_key,error_dimension=DataQualityDimension.FORMATCONSISTENCY.value, description=f"Error: Value '{value}' does not match regex #'{pattern}'", primary_key_value=row_reference(row_count)))

            # unique field check
            if MetaUtils.isTrue(meta_attribute_definition, "Unique"):
                # Track the values already encountered; every repeat of a
                # previously seen value is reported.
                seen = set()

                for row_count in range(len(attribute)):
                    value = attribute[row_count]

                    if value not in seen:
                        seen.add(value)
                    else:
                        self.addDataQualityError(DataQualityError(meta_attribute_key,error_dimension=DataQualityDimension.UNIQUENESS.value, description=f"Error: Value '{value}' is not UNIQUE. A unique value was expected.", primary_key_value=row_reference(row_count)))

            self.checkComposite(meta_attribute_definition, meta_attribute_key)

            # expression evaluation is different to processing field specific validations as it could link in other columns from the resultset
            self.evaluateExpression(meta_attribute_definition, meta_attribute_key)

            print("Validating attribute \t'" + meta_attribute_key + "'...\t\t..Complete.")

        # only invoke the custom validator if one has been provided
        if customValidator is not None and len(customValidator) > 0:
            self.customValidator(customValidator)