Example 1
    def test_singlecolumns_partialnullspartialdistinct(self):
        data = pd.DataFrame()
        # create and assign columns to df
        l1 = [chr(i) for i in range(100)]
        l2 = [i for i in range(100)]
        l3 = [i / 0.7 for i in range(100)]
        for i in range(20):
            l1[i] = ""
            l2[i] = 0
            l3[i] = 0.
        for i in range(20, 40):
            l1[i] = "zzzzz"
            l2[i] = 500
            l3[i] = 402.2

        random.shuffle(l1)
        random.shuffle(l2)
        random.shuffle(l3)
        data["c1"] = l1
        data["c2"] = l2
        data["c3"] = l3

        df = self.spark.createDataFrame(data)
        df = df.withColumn("c1", replace_empty_with_null(df["c1"]))
        df = df.withColumn("c2", replace_0_with_null(df["c2"]))
        df = df.withColumn("c3", replace_0dot_with_null(df["c3"]))

        r1, r2, r3 = deduplication_approximated(["c1", "c2", "c3"], df)
        self.assertAlmostEqual(r1, 61.0, delta=3.)
        self.assertAlmostEqual(r2, 61.0, delta=3.)
        self.assertAlmostEqual(r3, 61.0, delta=3.)
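The replace_*_with_null helpers used above are not defined in this snippet; a minimal sketch, assuming they are ordinary PySpark UDFs that map a sentinel value ("" / 0 / 0.0) to null, could look like this:

# Hypothetical sketch (not part of the original test file): UDFs that turn a
# sentinel value into null and pass every other value through unchanged.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, LongType, DoubleType

replace_empty_with_null = udf(lambda s: None if s == "" else s, StringType())
replace_0_with_null = udf(lambda n: None if n == 0 else n, LongType())
replace_0dot_with_null = udf(lambda x: None if x == 0. else x, DoubleType())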
Example 2
    def test_singlecolumns_alldifferent(self):
        data = pd.DataFrame()
        data["c1"] = [chr(i) for i in range(100)]
        data["c2"] = [i for i in range(100)]
        data["c3"] = [i / 0.7 for i in range(100)]
        df = self.spark.createDataFrame(data)

        r1, r2, r3 = deduplication_approximated(["c1", "c2", "c3"], df)
        self.assertAlmostEqual(r1, 100.0, delta=5.)
        self.assertAlmostEqual(r2, 100.0, delta=5.)
        self.assertAlmostEqual(r3, 100.0, delta=5.)
Example 3
    def test_singlecolumns_allsame(self):
        data = pd.DataFrame()
        data["c1"] = [chr(0) for _ in range(100)]
        data["c2"] = [10 for _ in range(100)]
        data["c3"] = [20 / 0.7 for _ in range(100)]
        df = self.spark.createDataFrame(data)

        r1, r2, r3 = deduplication_approximated(["c1", "c2", "c3"], df)
        self.assertAlmostEqual(r1, 1.0, delta=0.05)
        self.assertAlmostEqual(r2, 1.0, delta=0.05)
        self.assertAlmostEqual(r3, 1.0, delta=0.05)
Example 4
    def test_singlecolumns_empty(self):
        data = pd.DataFrame()
        data["c1"] = []
        data["c2"] = []
        schema = [
            StructField("c1", IntegerType(), True),
            StructField("c2", StringType(), True)
        ]
        df = self.spark.createDataFrame(data, StructType(schema))

        r1, r2 = deduplication_approximated(["c1", "c2"], df)
        self.assertAlmostEqual(r1, 100., delta=5.)
        self.assertAlmostEqual(r2, 100., delta=5.)
Example 5
    def test_singlecolumns_allnull(self):
        data = pd.DataFrame()
        data["c1"] = [chr(i) for i in range(100)]
        data["c2"] = [i for i in range(100)]
        data["c3"] = [i / 0.7 for i in range(100)]
        df = self.spark.createDataFrame(data)
        df = df.withColumn("c1", replace_every_string_with_null(df["c1"]))
        df = df.withColumn("c2", replace_every_int_with_null(df["c2"]))
        df = df.withColumn("c3", replace_every_float_with_null(df["c3"]))

        r1, r2, r3 = deduplication_approximated(["c1", "c2", "c3"], df)
        self.assertEqual(r1, 0.0)
        self.assertEqual(r2, 0.0)
        self.assertEqual(r3, 0.0)
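The replace_every_*_with_null helpers are likewise not shown here; assuming they simply map every value to a typed null (so the whole column ends up null), a sketch might be:

# Hypothetical sketch (not from the original test file): UDFs that return null
# for every input while keeping the column's declared type.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, LongType, DoubleType

replace_every_string_with_null = udf(lambda s: None, StringType())
replace_every_int_with_null = udf(lambda n: None, LongType())
replace_every_float_with_null = udf(lambda x: None, DoubleType())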
Example 6
    def test_singlecolumns_partial(self):
        data = pd.DataFrame()
        # create and assign columns to df
        l1 = [chr(i) for i in range(100)]
        l2 = [i for i in range(100)]
        l3 = [i / 0.7 for i in range(100)]
        for i in range(20):
            l1[i] = ""
            l2[i] = 0
            l3[i] = 0.
        random.shuffle(l1)
        random.shuffle(l2)
        random.shuffle(l3)
        data["c1"] = l1
        data["c2"] = l2
        data["c3"] = l3

        df = self.spark.createDataFrame(data)
        r1, r2, r3 = deduplication_approximated(["c1", "c2", "c3"], df)
        self.assertAlmostEqual(r1, 81.0, delta=4.)
        self.assertAlmostEqual(r2, 81.0, delta=4.)
        self.assertAlmostEqual(r3, 81.0, delta=4.)
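The standalone script below (a full usage example rather than a unit test) reads examples/resources/employees.csv, computes the metric with a direct call on the DataFrame, and then repeats it through the Task interface, where tasks are combined with add() and executed together with run().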
#!/usr/bin/python3
from pyspark.sql import SparkSession

from haychecker.dhc.metrics import deduplication_approximated

spark = SparkSession.builder.appName(
    "Deduplication_approximated_example").getOrCreate()

df = spark.read.format("csv").option(
    "header", "true").load("examples/resources/employees.csv")

df.show()

r1, r2 = deduplication_approximated(["title", "city"], df)

print(
    "Deduplication_approximated title: {}, deduplication_approximated city: {}"
    .format(r1, r2))

task1 = deduplication_approximated(["title", "city"])
task2 = deduplication_approximated(["lastName"])
task3 = task1.add(task2)

result = task3.run(df)

r1, r2 = result[0]["scores"]
r3 = result[1]["scores"][0]

print(
    "Deduplication_approximated title: {}, deduplication_approximated city: {}, "
    "deduplication_approximated lastName: {}".format(r1, r2, r3))