def normalize_data(df, target_fields):
    """Apply alias replacement to reference-backed columns before cleansing.

    For every ``field_code -> field_data`` entry of ``target_fields`` whose
    ``field_data`` carries a truthy ``"ref_type_id"``, the reference values
    of that type are fetched with ``CheckerDocument.get_ref_value`` (field
    ``"code"``, ``alias=True``) and ``replace_alias`` is applied to each cell
    of ``df[field_code]``, receiving those reference values as its extra
    argument.

    Args:
        df: Table whose columns are keyed by field code. The affected
            columns are reassigned on this object.
        target_fields: Mapping of field code to a dict of field settings.

    Returns:
        None. The result is written back into ``df``.
    """
    document = CheckerDocument()
    for column, settings in target_fields.items():
        type_id = settings.get("ref_type_id")
        if not type_id:
            # Field is not tied to a reference type: leave its column as is.
            continue
        references = document.get_ref_value(
            {"ref_type_id": type_id}, "code", alias=True
        )
        df[column] = df[column].apply(replace_alias, args=(references,))
def get_ref_list(self, check, ref_type_id):
    """Return the values a check must be validated against.

    A truthy ``check["values"]`` is returned as is. Otherwise the values are
    fetched through ``CheckerDocument.get_ref_value`` for the field named by
    ``check["field_name"]``, filtered on ``ref_type_id`` plus any extra
    ``check["conditions"]``.

    Args:
        check: Dict describing the check. Keys read here are ``"values"``,
            ``"field_name"`` and ``"conditions"``, all optional.
        ref_type_id: Reference type used as the base filter.

    Returns:
        The inline values, or whatever ``get_ref_value`` returns.
    """
    inline_values = check.get("values")
    if inline_values:
        return inline_values

    # Extra conditions are merged last, so they override the base filter.
    filters = {"ref_type_id": ref_type_id}
    filters.update(check.get("conditions", {}))
    return CheckerDocument().get_ref_value(filters, check.get("field_name"))
def check_column(self, df, column, empty_column, *args, **kwargs):
    """Flag which rows of ``column`` hold an accepted value.

    A row passes when it is already flagged in ``empty_column`` or when its
    value is among the accepted ones. The accepted values are the truthy
    ``check["values"]`` (compared as is) or, failing that, the reference
    values fetched through ``CheckerDocument.get_ref_value`` (compared after
    lower-casing both sides).

    Args:
        df: Table holding the column to check.
        column: Name of the column in ``df``.
        empty_column: Mask combined with ``|`` into the result; presumably a
            boolean Series marking empty cells -- confirm against callers.
        *args: Unused.
        **kwargs: ``check`` (dict with optional ``"values"``,
            ``"field_name"`` and ``"conditions"``) and ``ref_type_id``.

    Returns:
        The combined mask, or ``None`` when every entry of ``empty_column``
        is truthy.
    """
    if empty_column.all():
        # Every row is flagged: no mask is built and the caller gets None.
        return None

    rule = kwargs.get("check")
    allowed = rule.get("values")
    if allowed:
        # Inline values are matched case-sensitively.
        matches = df[column].isin(allowed)
        return empty_column | matches

    # Extra conditions are merged last, so they override the base filter.
    filters = {"ref_type_id": kwargs.get("ref_type_id")}
    filters.update(rule.get("conditions", {}))
    references = CheckerDocument().get_ref_value(filters, rule.get("field_name"))

    lowered_column = df[column].str.lower()
    lowered_references = {reference.lower() for reference in references}
    return empty_column | lowered_column.isin(lowered_references)