def checkStartsWith(self, meta_attribute_definition:dict, meta_attribute_key:str, value:str, primary_key_value:str):
    # prefix check
    if (MetaUtils.exists(meta_attribute_definition, "StartsWith")):
        # StartsWith is expected to be a list of allowed prefixes
        startsWith = meta_attribute_definition["StartsWith"]

        # check that the value begins with one of the provided prefixes. If the value is blank then ignore it,
        # as we should have picked it up in the mandatory/optional test anyway
        # (i.e. if the field is optional but a value has been provided then we check it against the supplied list)
        if ( (len(value) > 0) and (value != "(Null)") ):
            found = False

            for s in startsWith:
                if (value.startswith(s)):
                    found = True
                    break

            if (not found):
                self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.FORMATCONSISTENCY.value, description="Error: Value '" + value + "' does not begin with any of: '" + str(startsWith) + "'", primary_key_value=primary_key_value))
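# Illustrative StartsWith metadata (the attribute name and prefixes below are
# hypothetical, not taken from a real schema):
#
#   "invoice_id": { "Type": "string", "StartsWith": ["INV-", "CRN-"] }
#
# With this definition a value of "PO-1234" is reported under FORMATCONSISTENCY,
# while a blank or "(Null)" value is skipped and left to the mandatory/optional checks.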
def checkComposite(self, meta_attribute_definition:dict, meta_attribute_key:str):
    # composite (multi-column) uniqueness check
    if (MetaUtils.exists(meta_attribute_definition, "Composite")):
        # Composite is expected to be a list of attribute keys that together must be unique
        list_of_attribute_keys = meta_attribute_definition["Composite"]
        attribute_keys = '+'.join(map(str, list_of_attribute_keys))
        attribute_keys = attribute_keys.replace("%1", meta_attribute_key)

        # populate a dictionary of just the values that are required to create the composite key
        attribute_data = {}
        for col in list_of_attribute_keys:
            col = col.replace("%1", meta_attribute_key)
            attribute_data[col] = SQLTools.getColValues(self.dataset, col)

        seen = set()

        # convert the dictionary of columns into a list of per-row dictionaries
        fields = [dict(zip(attribute_data, col)) for col in zip(*attribute_data.values())]

        # check to see if there are any duplicates, in the order the attribute keys were provided
        for row in fields:
            # join the values from the columns that make up the composite key to form a single value
            composite_key = '|'.join(map(str, row.values()))

            if (composite_key in seen):
                self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.UNIQUENESS.value, description="Error: Duplicate composite key: '" + attribute_keys + "', values: '" + composite_key + "'", primary_key_value=composite_key))
            else:
                seen.add(composite_key)
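# Illustrative Composite metadata (hypothetical attribute names): "%1" is replaced
# with the attribute that owns the rule, so
#
#   "order_id": { "Type": "string", "Composite": ["%1", "line_number"] }
#
# requires each (order_id, line_number) pair to be unique; the second occurrence of a
# pair such as "1001|3" is reported under UNIQUENESS with the joined pair as its key.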
def evaluateExpression(self, meta_attribute_definition:dict, meta_attribute_key:str):
    # evaluate any custom expressions
    if (MetaUtils.exists(meta_attribute_definition, "Expression")):
        expr = meta_attribute_definition["Expression"]

        # %1 is a placeholder for the name of the column owning the expression (it's just a shortcut)
        expr = expr.replace("%1", "[" + meta_attribute_key + "]")

        exp = ExpressionBuilder()
        fields = exp.parseExpr(expr)
        colData = dict()

        # grab all of the columns that we need and store them in a local dict
        for field in fields:
            # grab the column data out of the resultset
            values = SQLTools.getCol(self.dataset, field)

            # if the column couldn't be found then we have a configuration issue, so raise an exception
            if (values is None):
                raise ValidationError("Error evaluating expression: '" + expr + "'. Unable to find column '" + field + "' in the resultset", None)

            colData.update(values)

        # convert the separate columns into a list of name/value pair dictionaries, one per row
        pairs = [dict(zip(colData, col)) for col in zip(*colData.values())]

        for pair in pairs:
            result = None
            ev = exp.merge(expr, pair)

            try:
                result = eval(ev)
            except Exception as e:
                self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.BUSINESSRULECOMPLIANCE.value, description="Error: Expression '" + ev + "' returned an error '" + str(e) + "'"))
                result = None

            if ( (result is not None) and (result == False) ):
                self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.BUSINESSRULECOMPLIANCE.value, description="Error: Expression '" + ev + "' returned FALSE"))
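# Illustrative Expression metadata (hypothetical attributes): columns are referenced in
# square brackets and "%1" expands to the owning attribute, so
#
#   "net_price": { "Type": "float", "Expression": "float([%1]) <= float([gross_price])" }
#
# is merged row by row and passed to eval(); any row where the expression returns False
# is reported under BUSINESSRULECOMPLIANCE. Because the merged string is eval()'d,
# expressions should only ever come from trusted metadata.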
def validate(self:object, customValidator:str=None):
    """ Validate a resultset against predefined metadata based on the LANG rules of data quality. """
    if (self.metadata is None):
        raise ValidationError("LANG Exception: meta-data has not been set", None)
    elif (self.dataset is None):
        raise ValidationError("LANG Exception: resultset has not been set", None)

    """ Change request: find and output the primary key in the error report file if specified """
    primary_key = ""
    primary_key_values = None

    for key, item in self.metadata.items():
        if (MetaUtils.isTrue(item, "PrimaryKey")):
            primary_key = key
            primary_key_values = self.dataset[primary_key]
            break

    """
    Execute a series of validations against the supplied column of data and the metadata for the column.
    Which validation is run is determined by entries in the metadata.
    """
    for meta_attribute_key, meta_attribute_definition in self.metadata.items():
        if (meta_attribute_key in self.dataset):
            print("Validating attribute \t'" + meta_attribute_key + "'...", end='\r')
            attribute = self.dataset[meta_attribute_key]

            for row_count in range(len(attribute)):
                value = attribute[row_count]

                """
                If a PrimaryKey tag has been found then output the value so that the user has a reference
                to search for the record in the source system. If there is no primary key attribute set
                then output the row count instead.
                """
                if (primary_key_values is not None):
                    primary_key_value = primary_key_values[row_count]
                else:
                    primary_key_value = "Row: " + str(row_count + 1)

                self.checkMandatory(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkSize(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkType(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkEnum(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkStartsWith(meta_attribute_definition, meta_attribute_key, value, primary_key_value)

            # format check (must provide a regex)
            if (MetaUtils.exists(meta_attribute_definition, "Format")):
                re.purge()
                regex = re.compile(meta_attribute_definition["Format"])

                for row_count in range(len(attribute)):
                    if (primary_key_values is not None):
                        primary_key_value = primary_key_values[row_count]
                    else:
                        primary_key_value = "Row: " + str(row_count + 1)

                    value = attribute[row_count]
                    isMatch = (regex.match(value) is not None)

                    if ( (not isMatch) and (not MetaUtils.isAllowBlank(meta_attribute_definition)) ):
                        self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.FORMATCONSISTENCY.value, description="Error: Value '" + value + "' does not match regex '" + meta_attribute_definition["Format"] + "'", primary_key_value=primary_key_value))

            # unique field check
            if (MetaUtils.isTrue(meta_attribute_definition, "Unique")):
                # track the values already seen and report a duplicate the first time a value
                # reappears; this is faster than counting occurrences with list.count(value)
                seen = set()

                for row_count in range(len(attribute)):
                    if (primary_key_values is not None):
                        primary_key_value = primary_key_values[row_count]
                    else:
                        primary_key_value = "Row: " + str(row_count + 1)

                    value = attribute[row_count]

                    if (not value in seen):
                        seen.add(value)  # only process a value once
                    else:
                        self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.UNIQUENESS.value, description="Error: Value '" + value + "' is not UNIQUE. A unique value was expected.", primary_key_value=primary_key_value))

            self.checkComposite(meta_attribute_definition, meta_attribute_key)

            # expression evaluation is handled differently to the field-specific validations as it can reference other columns from the resultset
            self.evaluateExpression(meta_attribute_definition, meta_attribute_key)

            print("Validating attribute \t'" + meta_attribute_key + "'...\t\t..Complete.")
        else:
            self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.METADATACOMPLIANCE.value, description="Error: Attribute '" + meta_attribute_key + "' was not found in the dataset."))

    # only invoke the custom validator if one has been provided
    if (customValidator is not None and len(customValidator) > 0):
        self.customValidator(customValidator)
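# A minimal usage sketch (illustrative only; the class name and the way the metadata
# and dataset are loaded below are assumptions, not taken from this module):
#
#   validator = Validator()
#   validator.metadata = json.load(open("metadata.json"))  # per-attribute rules: Size, Format, Unique, ...
#   validator.dataset = resultset                          # columnar dict of attribute -> list of values
#   validator.validate()
#
# validate() never raises on a failing row: each check appends a DataQualityError via
# addDataQualityError(), so a single pass reports every violation it finds.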
def checkSize(self, meta_attribute_definition:dict, meta_attribute_key:str, value:str, primary_key_value:str):
    # field length check
    if (MetaUtils.exists(meta_attribute_definition, "Size")):
        if ( (len(value) > int(meta_attribute_definition["Size"])) and (not MetaUtils.isBlankOrNull(value)) ):
            self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.METADATACOMPLIANCE.value, description="Error: Value '" + value + "' is longer than size '" + str(meta_attribute_definition["Size"]) + "'", primary_key_value=primary_key_value))
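# Illustrative Size metadata (the attribute below is hypothetical): the check only
# fires when the value is neither blank nor "(Null)", leaving empty values to the
# mandatory/optional checks.
#
#   "country_code": { "Type": "string", "Size": 2 }
#
# With this definition, "GBR" (length 3) is reported under METADATACOMPLIANCE while
# "" passes through untouched.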
def profileData(self, meta_attribute_definition:dict, colData:list, key:str) -> dict:
    """ For a given column, calculate a variety of statistics. """
    if (colData is None):
        raise ValidationError("LANG Exception: DataSet has not been set", None)

    if (meta_attribute_definition is None):
        raise ValidationError("LANG Exception: metadata has not been set", None)

    self.attribute = key
    self.count = len(colData)

    if (MetaUtils.exists(meta_attribute_definition, "Type")):
        self.type = meta_attribute_definition["Type"]

    if (self.type == "string"):
        cleaned = list(map(self.clean_string, colData))

        try:
            v = CountVectorizer().fit_transform(cleaned)
            vectors = v.toarray()
            self.csim = round(self.cosine_sim_vectors(vectors[0], vectors[1]), 3)
        except Exception as e:
            pass

    try:
        mode = stats.mode(colData)
        if (len(mode[0]) > 0):
            self.most_frequent_value = mode.mode[0]
            self.most_frequent_count = mode.count[0]

        counter = collections.Counter(colData)
        self.least_frequent_value = min(colData[::-1], key=counter.get)
        self.least_frequent_count = counter[self.least_frequent_value]
    except Exception as e:
        # on very large datasets, NumPy may throw an out-of-memory error
        mode = -1
        print(e)

    vals = []
    s = set(colData)
    s.discard("")
    self.patterns = str(sorted(s))
    self.patternCount = len(s)

    if (MetaUtils.exists(meta_attribute_definition, "Default")):
        self.default_value = meta_attribute_definition["Default"]
    else:
        self.default_value = "<Unspecified>"

    for value in colData:
        self.memory += len(value)

        if (len(value) == 0):
            self.blankCount += 1
        elif (value == self.default_value):
            self.default_count += 1
        elif (value == "(Null)"):
            self.nullCount += 1

        if (len(value) < self.min_len or self.min_len == -1):
            self.min_len = len(value)

        if (len(value) > self.max_len or self.max_len == -1):
            self.max_len = len(value)

        val = math.nan

        try:
            if (self.type in ["int","integer"]):
                val = int(value)
            elif (self.type in ["float","number"]):
                val = float(value)
        except Exception as e:
            pass

        if (not math.isnan(val)):
            self.negative_count += (val < 0)
            self.sum += val
            vals.append(val)

            if (val < self.min_val or self.min_val == -1):
                self.min_val = val

            if (val > self.max_val or self.max_val == -1):
                self.max_val = val

    if (len(vals) > 0):
        self.mean = statistics.mean(vals)
        self.median = statistics.median(vals)

    if (len(vals) >= 2):
        self.stddev = statistics.stdev(vals)
        self.variance = statistics.variance(vals)

    return self.to_dict()
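# A sketch of how profileData might be called for a numeric column (the attribute
# name and metadata below are hypothetical, and the exact shape of the result
# depends on to_dict()):
#
#   profile = self.profileData({"Type": "int", "Default": "0"}, ["1", "2", "2", ""], "quantity")
#
# This would count one blank value, compute min/max/sum/mean/median over [1, 2, 2],
# and record "2" as the most frequent value. Columns typed "string" additionally get
# a cosine-similarity score (csim) over the first two vectorised values.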