def test_singlecolumns_partial(self):
    data = pd.DataFrame()

    # create and assign columns to df
    l1 = [chr(i) for i in range(100)]
    l2 = [i for i in range(100)]
    l3 = [i / 0.7 for i in range(100)]
    # blank out the first 20 entries of each column, then shuffle
    for i in range(20):
        l1[i] = ""
        l2[i] = 0
        l3[i] = 0.
    random.shuffle(l1)
    random.shuffle(l2)
    random.shuffle(l3)
    data["c1"] = l1
    data["c2"] = l2
    data["c3"] = l3
    df = self.spark.createDataFrame(data)

    # turn the placeholders into actual nulls, leaving 80 of 100 non-null values per column
    df = df.withColumn("c1", replace_empty_with_null(df["c1"]))
    df = df.withColumn("c2", replace_0_with_null(df["c2"]))
    df = df.withColumn("c3", replace_0dot_with_null(df["c3"]))

    r1, r2, r3 = completeness(["c1", "c2", "c3"], df)
    self.assertEqual(r1, 80.0)
    self.assertEqual(r2, 80.0)
    self.assertEqual(r3, 80.0)
def test_wholetable_full(self):
    data = pd.DataFrame()
    data["c1"] = [chr(i) for i in range(100)]
    data["c2"] = [i for i in range(100)]
    data["c3"] = [i / 0.7 for i in range(100)]
    df = self.spark.createDataFrame(data)

    r = completeness(df=df)[0]
    self.assertEqual(r, 100.0)
def test_wholetable_empty(self):
    data = pd.DataFrame()
    data["c1"] = []
    data["c2"] = []
    schema = [StructField("c1", IntegerType(), True), StructField("c2", StringType(), True)]
    df = self.spark.createDataFrame(data, StructType(schema))

    r = completeness(df=df)[0]
    self.assertEqual(r, 100.)
def test_singlecolumns_empty(self):
    data = pd.DataFrame()
    data["c1"] = []
    data["c2"] = []
    schema = [StructField("c1", IntegerType(), True), StructField("c2", StringType(), True)]
    df = self.spark.createDataFrame(data, StructType(schema))

    r1, r2 = completeness(["c1", "c2"], df)
    self.assertEqual(r1, 100.)
    self.assertEqual(r2, 100.)
def test_singlecolumns_full(self):
    data = pd.DataFrame()
    data["c1"] = [chr(i) for i in range(100)]
    data["c2"] = [i for i in range(100)]
    data["c3"] = [i / 0.7 for i in range(100)]
    df = self.spark.createDataFrame(data)

    r1, r2, r3 = completeness(["c1", "c2", "c3"], df)
    self.assertEqual(r1, 100.0)
    self.assertEqual(r2, 100.0)
    self.assertEqual(r3, 100.0)
def test_wholetable_allnull(self):
    data = pd.DataFrame()
    data["c1"] = [chr(i) for i in range(100)]
    data["c2"] = [i for i in range(100)]
    data["c3"] = [i / 0.7 for i in range(100)]
    df = self.spark.createDataFrame(data)

    # null out every value so the whole-table completeness drops to 0%
    df = df.withColumn("c1", replace_every_string_with_null(df["c1"]))
    df = df.withColumn("c2", replace_every_int_with_null(df["c2"]))
    df = df.withColumn("c3", replace_every_float_with_null(df["c3"]))

    r = completeness(df=df)[0]
    self.assertEqual(r, 0.0)
def test_singlecolumns_allnull(self):
    data = pd.DataFrame()
    data["c1"] = [chr(i) for i in range(100)]
    data["c2"] = [i for i in range(100)]
    data["c3"] = [i / 0.7 for i in range(100)]
    df = self.spark.createDataFrame(data)

    df = df.withColumn("c1", replace_every_string_with_null(df["c1"]))
    df = df.withColumn("c2", replace_every_int_with_null(df["c2"]))
    df = df.withColumn("c3", replace_every_float_with_null(df["c3"]))

    r1, r2, r3 = completeness(["c1", "c2", "c3"], df)
    self.assertEqual(r1, 0.0)
    self.assertEqual(r2, 0.0)
    self.assertEqual(r3, 0.0)
#!/usr/bin/python3

from pyspark.sql import SparkSession

from haychecker.dhc.metrics import completeness

spark = SparkSession.builder.appName("completeness_example").getOrCreate()

df = spark.read.format("csv").option("header", "true").load("examples/resources/employees.csv")
df.show()

# passing the dataframe computes the metric right away
r1, r2 = completeness(["region", "reportsTo"], df)
print("Completeness region: {}, completeness reportsTo: {}".format(r1, r2))

# omitting the dataframe returns a task; tasks can be chained and run later
task1 = completeness(["region", "reportsTo"])
task2 = completeness(["city"])
task3 = task1.add(task2)

result = task3.run(df)
r1, r2 = result[0]["scores"]
r3 = result[1]["scores"][0]
print("Completeness region: {}, completeness reportsTo: {}, completeness city: {}".format(r1, r2, r3))
def test_grouping_multiple_columns(self):
    data = pd.DataFrame()
    c1 = [0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5]
    c2 = ["a", "a", "b", "b", "c", "c", "d", "d", "d", "d", "a", "a", "a"]
    c3 = [0.0, 0.0, 0.1, 0.1, 2.2, 2.2, 2.2, 3.1, 3.2, 3.3, 40, 40, 50]
    c4 = [10.0, 20.0, 10.0, 20.0, 10.0, 20.0, 10.0, 20.0, 10.0, 20.0, 10.0, 20.0, 10.0]
    c5 = ["09:10:10" for _ in range(10)]
    c5.extend(["00:11:10" for _ in range(3)])
    data["c1"] = c1
    data["c2"] = c2
    data["c3"] = c3
    data["c4"] = c4
    data["c5"] = c5
    df = self.spark.createDataFrame(data)

    task = Task()
    task.add(completeness())
    task.add(completeness([0, 1, 2]))
    task.add(deduplication([0, 1]))
    task.add(deduplication())
    task.add(timeliness(["c5"], value="10:10:10", timeFormat="ss:mm:HH"))
    task.add(completeness())

    condition1 = {"column": "c3", "operator": "lt", "value": 50}
    condition2 = {"column": "c3", "operator": "gt", "value": 1.0}
    conditions = [condition1, condition2]
    task.add(rule(conditions))

    condition1 = {"column": "c5", "operator": "eq", "value": "00:11:10"}
    conditions = [condition1]
    task.add(rule(conditions))

    condition1 = {"column": "c3", "operator": "lt", "value": 50}
    condition2 = {"column": "c3", "operator": "gt", "value": 1.0}
    conditions = [condition1, condition2]
    having1 = {"column": "*", "operator": "gt", "value": 1, "aggregator": "count"}
    having2 = {"column": "c4", "operator": "eq", "value": 50 / 3, "aggregator": "avg"}
    havings = [having1, having2]
    task.add(grouprule([0, "c2"], havings, conditions))

    result = task.run(df)

    # c1
    r = result[0]["scores"][0]
    self.assertEqual(r, 100.)
    # c2
    r1, r2, r3 = result[1]["scores"]
    self.assertEqual(r1, 100.)
    self.assertEqual(r2, 100.)
    self.assertEqual(r3, 100.)
    # d1
    r1, r2 = result[2]["scores"]
    self.assertEqual(r1, (6 / 13) * 100)
    self.assertEqual(r2, (4 / 13) * 100)
    # d2
    r = result[3]["scores"][0]
    self.assertEqual(r, 100.)
    # t
    r = result[4]["scores"][0]
    self.assertEqual(r, (10 / 13) * 100)
    # c3
    r = result[5]["scores"][0]
    self.assertEqual(r, 100.)
    # r1
    r = result[6]["scores"][0]
    self.assertEqual(r, (8 / 13) * 100)
    # r2
    r = result[7]["scores"][0]
    self.assertEqual(r, (3 / 13) * 100)
    # gr1
    r = result[8]["scores"][0]
    self.assertEqual(r, 25.0)