Example #1
    def checkStartsWith(self, meta_attribute_definition:dict, meta_attribute_key:str, value:str, primary_key_value:str):
        # starts-with (prefix) check
        if (MetaUtils.exists(meta_attribute_definition, "StartsWith")):
            # StartsWith is expected to be a list of allowed prefixes
            startsWith = meta_attribute_definition["StartsWith"]

            # Check that the value begins with one of the provided prefixes. If the value is blank then ignore it,
            # as it should already have been picked up by the mandatory/optional test
            # (i.e. if the field is optional but a value has been provided then we check it against the supplied list).
            if ( (len(value) > 0) and (value != "(Null)") ):
                found = False
                for s in startsWith:
                    if value.startswith(s):
                        found = True
                        break

                if (not found):
                    self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.FORMATCONSISTENCY.value, description="Error: Value '" + value + "' does not begin with any of: '" + str(startsWith) + "'", primary_key_value=primary_key_value))
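
To see the prefix rule in isolation: a minimal standalone sketch, assuming a hypothetical metadata entry (MetaUtils, DataQualityError and the validator class come from the surrounding library and are not needed here):

    # Hypothetical metadata entry: "StartsWith" holds the list of allowed prefixes
    meta = {"StartsWith": ["ACC-", "INV-"]}

    def has_valid_prefix(value: str, definition: dict) -> bool:
        # blank and "(Null)" values are ignored, mirroring the method above
        if len(value) == 0 or value == "(Null)":
            return True
        # str.startswith accepts a tuple of candidate prefixes
        return value.startswith(tuple(definition["StartsWith"]))

    assert has_valid_prefix("ACC-0001", meta)
    assert not has_valid_prefix("XYZ-0001", meta)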
Example #2
    def checkComposite(self, meta_attribute_definition: dict,
                       meta_attribute_key: str):
        # composite (multi-column) uniqueness check
        if (MetaUtils.exists(meta_attribute_definition, "Composite")):
            # Composite lists the attribute keys that together form the composite key;
            # %1 is a placeholder for the current attribute's name
            list_of_attribute_keys = meta_attribute_definition["Composite"]
            attribute_keys = '+'.join(map(str, list_of_attribute_keys))
            attribute_keys = attribute_keys.replace("%1", meta_attribute_key)

            # populate a dictionary of just the columns that are required to build the composite key
            attribute_data = {}
            for col in list_of_attribute_keys:
                col = col.replace("%1", meta_attribute_key)
                attribute_data[col] = SQLTools.getColValues(self.dataset, col)

            seen = set()
            # convert the dictionary of columns into a list of per-row dicts
            fields = [
                dict(zip(attribute_data, col))
                for col in zip(*attribute_data.values())
            ]

            # check whether there are any duplicates, in the order of the attribute_keys provided
            for row in fields:
                # join the values from the columns that make up the composite key to form a single value
                composite_key = '|'.join(map(str, row.values()))
                if (composite_key in seen):
                    self.addDataQualityError(
                        DataQualityError(
                            meta_attribute_key,
                            error_dimension=DataQualityDimension.UNIQUENESS.value,
                            description="Error: Duplicate composite meta_attribute_key: '"
                            + attribute_keys + "', values: '" + composite_key + "'",
                            primary_key_value=composite_key))
                else:
                    seen.add(composite_key)
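
The heart of checkComposite is the pivot from a dict of columns to a list of rows via zip, followed by a seen-set scan. A self-contained sketch with made-up column data:

    # Hypothetical column store: attribute name -> list of values, one per row
    attribute_data = {
        "country": ["NZ", "NZ", "AU"],
        "city":    ["Wellington", "Wellington", "Sydney"],
    }

    # pivot the columns into rows: one {column: value} dict per row
    rows = [dict(zip(attribute_data, col)) for col in zip(*attribute_data.values())]

    seen, duplicates = set(), []
    for row in rows:
        composite_key = '|'.join(map(str, row.values()))
        if composite_key in seen:
            duplicates.append(composite_key)  # the method above raises a DataQualityError here
        else:
            seen.add(composite_key)

    print(duplicates)  # ['NZ|Wellington']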
Example #3
    def evaluateExpression(self, meta_attribute_definition:dict, meta_attribute_key:str):
        # evaluate any custom expressions
        if (MetaUtils.exists(meta_attribute_definition, "Expression")):
            expr = meta_attribute_definition["Expression"]

            # %1 is a placeholder for the name of the column owning the expression (it's just a shortcut)
            expr = expr.replace("%1", "[" + meta_attribute_key + "]")
            exp = ExpressionBuilder()

            fields = exp.parseExpr(expr)
            colData = dict()

            # grab all of the columns that we need and store them in a local dict
            for field in fields:

                # grab the column data out of the resultset
                values = SQLTools.getCol(self.dataset, field)

                # if the column couldn't be found then we have a configuration issue, so raise an exception
                if (values is None):
                    raise ValidationError("Error evaluating expression: '" + expr + "'. Unable to find column '" + field + "' in the resultset", None)

                colData.update(values)

            # convert the separate columns into an array of name/value pairs
            pairs = [dict(zip(colData, col)) for col in zip(*colData.values())]

            for pair in pairs:
                result = None
                ev = exp.merge(expr, pair)

                try:
                    # note: eval() executes arbitrary Python, so expressions must come from trusted metadata
                    result = eval(ev)
                except Exception as e:
                    self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.BUSINESSRULECOMPLIANCE.value, description="Error: Expression '" + ev + "' returned an error '" + str(e) + "'"))
                    result = None

                if ( (result is not None) and (result == False) ):
                    self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.BUSINESSRULECOMPLIANCE.value, description="Error: Expression '" + ev + "' returned FALSE"))
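
evaluateExpression ultimately substitutes row values into the expression string and runs it through eval(). A simplified sketch of that merge-and-eval cycle, with a hand-rolled placeholder substitution standing in for ExpressionBuilder.merge() (whose exact behaviour is assumed from context):

    # hypothetical rule: the quantity column must be non-negative
    expr = "[quantity] >= 0"
    rows = [{"quantity": "5"}, {"quantity": "-2"}]

    for row in rows:
        ev = expr
        for name, value in row.items():
            # replace each [column] placeholder with the row's value, as exp.merge() is assumed to do
            ev = ev.replace("[" + name + "]", value)
        try:
            result = eval(ev)  # eval() executes arbitrary code; only safe for trusted metadata
        except Exception:
            result = None      # reported as a BUSINESSRULECOMPLIANCE error in the method above
        if result is False:
            print("Rule failed for row:", row)  # prints for {'quantity': '-2'}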
Example #4
    def validate(self:object, customValidator:str=None):
        """
        Validate a resultset against predefined metadata based on the LANG rules of data quality.
        """
        if (self.metadata is None):
            raise ValidationError("LANG Exception: meta-data has not been set", None)
        elif (self.dataset is None):
            raise ValidationError("LANG Exception: resultset has not been set", None)

        """
        Change request: find and output the primary key in the error report file if specified
        """
        primary_key = ""
        primary_key_values = None
        
        for key, item in self.metadata.items():                
            if (MetaUtils.isTrue(item, "PrimaryKey")):
                primary_key = key
                primary_key_values = self.dataset[primary_key]
                break
                
        """
        Execute a series of validations against the supplied column of data and the metadata for the column.
        Which validation is run is determined by entries in the metadata.
        """         
        for meta_attribute_key, meta_attribute_definition in self.metadata.items():                
            if (meta_attribute_key in self.dataset):
                print("Validating attribute \t'" + meta_attribute_key + "'...", end='\r')
                                
                attribute = self.dataset[meta_attribute_key]
                                
                for row_count in range(len(attribute)):
                    value = attribute[row_count]
                    
                    """ 
                    If a primarykey tag has been found then output the value so that the user 
                     has a reference to search for the record in the source system. 
                     If there is no primary key attribute set then output the row count 
                    """
                    
                    if (not primary_key_values is None):
                        primary_key_value = primary_key_values[row_count]
                    else:
                        primary_key_value = "Row: " + str(row_count+1)
                    
                    self.checkMandatory(meta_attribute_definition, meta_attribute_key, value, primary_key_value)                  
                    self.checkSize(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                    self.checkType(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                    self.checkEnum(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                    self.checkStartsWith(meta_attribute_definition, meta_attribute_key, value, primary_key_value)

                
                # format check (must provide a regex)
                if (MetaUtils.exists(meta_attribute_definition, "Format")):
                    re.purge()
                    regex = re.compile(meta_attribute_definition["Format"])

                    for row_count in range(len(attribute)):
                        # fall back to the row number when no primary key attribute has been defined
                        if (primary_key_values is not None):
                            primary_key_value = primary_key_values[row_count]
                        else:
                            primary_key_value = "Row: " + str(row_count+1)
                        value = attribute[row_count]

                        isMatch = (regex.match(value) is not None)

                        if ( (not isMatch) and (not MetaUtils.isAllowBlank(meta_attribute_definition)) ):
                            self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.FORMATCONSISTENCY.value, description="Error: Value '" + value + "' does not match regex #'" + meta_attribute_definition["Format"] + "'", primary_key_value=primary_key_value))

                   
                # unique field check
                if (MetaUtils.isTrue(meta_attribute_definition, "Unique")):
                    # track the values already seen in a set and report an error the second time a value appears
                    seen = set()

                    for row_count in range(len(attribute)):
                        if (primary_key_values is not None):
                            primary_key_value = primary_key_values[row_count]
                        else:
                            primary_key_value = "Row: " + str(row_count+1)
                        value = attribute[row_count]

                        if (value not in seen):
                            seen.add(value)  # only process a value once
                        else:
                            self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.UNIQUENESS.value, description="Error: Value '" + value + "' is not UNIQUE. A unique value was expected.", primary_key_value=primary_key_value))
                self.checkComposite(meta_attribute_definition, meta_attribute_key)
                
                # expression evaluation is different to processing field specific validations as it could link in other columns from the resultset
                self.evaluateExpression(meta_attribute_definition, meta_attribute_key)

                print("Validating attribute \t'" + meta_attribute_key + "'...\t\t..Complete.")
            else:
                self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.METADATACOMPLIANCE.value, description="Error: Attribute '" + meta_attribute_key + "' was not found in the dataset."))
        
        # only invoke the custom validator if one has been provided
        if (customValidator is not None and len(customValidator) > 0):
            self.customValidator(customValidator)
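
For orientation, validate() expects self.metadata keyed by attribute name and self.dataset as a column store. A hypothetical illustration of the shapes involved, with option names inferred from the checks above (the Validator constructor shown is assumed, not taken from the library):

    metadata = {
        "customer_id": {"PrimaryKey": "True", "Mandatory": "True", "Unique": "True"},
        "email":       {"Format": r"^[^@]+@[^@]+$", "Size": "254"},
        "status":      {"Enum": ["ACTIVE", "CLOSED"], "Default": "ACTIVE"},
    }
    dataset = {
        "customer_id": ["C001", "C002"],
        "email":       ["a@example.com", "not-an-email"],
        "status":      ["ACTIVE", "CLOSED"],
    }
    # validator = Validator(metadata, dataset)  # hypothetical constructor
    # validator.validate()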
Example #5
    def checkSize(self, meta_attribute_definition:dict, meta_attribute_key:str, value:str, primary_key_value:str):
        # field length check
        if (MetaUtils.exists(meta_attribute_definition, "Size")):
            if ( (len(value) > int(meta_attribute_definition["Size"])) and (not MetaUtils.isBlankOrNull(value)) ):
                self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.METADATACOMPLIANCE.value, description="Error: Value '" + value + "' is longer than size '" + str(meta_attribute_definition["Size"]) + "'", primary_key_value=primary_key_value))
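
The same length rule in isolation; note that the int() cast implies "Size" may be stored as a string. A minimal sketch with a hypothetical definition:

    meta = {"Size": "10"}  # hypothetical metadata entry

    def exceeds_size(value: str, definition: dict) -> bool:
        return len(value) > int(definition["Size"])

    print(exceeds_size("short", meta))                 # False
    print(exceeds_size("much longer than ten", meta))  # True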
Example #6
    def profileData(self, meta_attribute_definition:dict, colData:list, key:str) ->dict:
        """
        For a given column, calculate a variety of statistics.
        """

        if (colData is None):
            raise ValidationError("LANG Exception: DataSet has not been set", None)
        
        if (meta_attribute_definition is None):
            raise ValidationError("LANG Exception: metadata has not been set", None)
        
        self.attribute = key
        self.count = len(colData)

        if (MetaUtils.exists(meta_attribute_definition, "Type")):
            self.type = meta_attribute_definition["Type"]

        if (self.type == "string"):
            # vectorise the cleaned strings and compute the cosine similarity of the first two rows
            cleaned = list(map(self.clean_string, colData))
            try:
                v = CountVectorizer().fit_transform(cleaned)
                vectors = v.toarray()
                self.csim = round(self.cosine_sim_vectors(vectors[0], vectors[1]), 3)

            except Exception:
                # vectorisation can fail (e.g. on an empty vocabulary); leave csim unset in that case
                pass
        
        try:
            # most frequent value via scipy.stats.mode (older scipy API returning arrays)
            mode = stats.mode(colData)
            if (len(mode[0]) > 0):
                self.most_frequent_value = mode.mode[0]
                self.most_frequent_count = mode.count[0]

            # least frequent value: scan the reversed list so ties resolve to the last occurrence
            counter = collections.Counter(colData)
            self.least_frequent_value = min(colData[::-1], key=counter.get)
            self.least_frequent_count = counter[self.least_frequent_value]

        except Exception as e:
            # on very large datasets, NumPy may throw an out-of-memory error
            mode = -1
            print(e)
            
        vals = []

        # the set of distinct non-blank values doubles as a crude pattern inventory
        s = set(colData)
        s.discard("")
        self.patterns = str(sorted(s))
        self.patternCount = len(s)
        
        if (MetaUtils.exists(meta_attribute_definition, "Default")):
            self.default_value = meta_attribute_definition["Default"]
        else:
            self.default_value = "<Unspecified>"
    
        for value in colData:
            self.memory += len(value)

            if (len(value) == 0):
                self.blankCount += 1
            elif (value == self.default_value):
                self.default_count += 1
            elif (value == "(Null)"):
                self.nullCount += 1

            if (len(value) < self.min_len or self.min_len == -1):
                self.min_len = len(value)

            if (len(value) > self.max_len or self.max_len == -1):
                self.max_len = len(value)

            # attempt a numeric conversion based on the declared type; non-numeric values stay NaN
            val = math.nan

            try:
                if (self.type in ["int","integer"]):
                    val = int(value)
                elif (self.type in ["float","number"]):
                    val = float(value)
            except Exception:
                pass

            if (not math.isnan(val)):
                self.negative_count += (val < 0)
                self.sum += val
                vals.append(val)

                if (val < self.min_val or self.min_val == -1):
                    self.min_val = val

                if (val > self.max_val or self.max_val == -1):
                    self.max_val = val
                    
                        
        if (len(vals)>0):                  
            self.mean = statistics.mean(vals)                
            self.median = statistics.median(vals)
        
        if (len(vals)>=2):
            self.stddev = statistics.stdev(vals)
            self.variance = statistics.variance(vals)
        
        return self.to_dict()
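
A condensed, dependency-free sketch of the same profiling idea over a single column. The scipy mode and cosine-similarity steps are left out; only the stdlib statistics path is shown, and the column values are made up:

    import statistics

    col = ["5", "7", "", "(Null)", "7"]

    vals = []
    blank_count = null_count = 0
    for value in col:
        if len(value) == 0:
            blank_count += 1
        elif value == "(Null)":
            null_count += 1
        else:
            vals.append(int(value))  # assumes Type is "int", as in the method above

    profile = {
        "count": len(col), "blanks": blank_count, "nulls": null_count,
        "min": min(vals), "max": max(vals),
        "mean": statistics.mean(vals), "median": statistics.median(vals),
        "stdev": statistics.stdev(vals) if len(vals) >= 2 else None,
    }
    print(profile)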