    def test_accumulated_report(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        reporter = EmailReporter("*****@*****.**", {"*****@*****.**"},
                                 accumulatedReport=True)
        check.run([reporter])
        reporter.sendAccumulatedReport()
        reporter.sendAccumulatedReport("111")
Example #2
    def test_passed_args(self):
        display_name = "display name"
        id = "id"
        cache_method = StorageLevel.DISK_ONLY
        check = Check(self.df, display_name, cache_method, id)

        # check wrapper
        self.assertEqual(check.name, display_name)
        self.assertEqual(check.id, id)
        self.assertEqual(check.cacheMethod, cache_method)

        # check jvm check
        self.assertEqual(check.jvmCheck.getClass().toString(),
                         "class de.frosner.ddq.core.Check")
        self.assertEqual(check.jvmCheck.name(), check.name)
        self.assertEqual(check.jvmCheck.id(), check.id)
        jvm_cache_method = check.jvmCheck.cacheMethod().get()
        self.assertEqual(jvm_cache_method.useDisk(), check.cacheMethod.useDisk)
        self.assertEqual(jvm_cache_method.useMemory(),
                         check.cacheMethod.useMemory)
        self.assertEqual(jvm_cache_method.useOffHeap(),
                         check.cacheMethod.useOffHeap)
        self.assertEqual(jvm_cache_method.deserialized(),
                         check.cacheMethod.deserialized)
        self.assertEqual(jvm_cache_method.replication(),
                         check.cacheMethod.replication)
Example #3
    def test_default_args(self):
        df = Mock()
        check = Check(df)
        ddq_check = check._jvm.de.frosner.ddq.core.Check
        ddq_check.assert_called_with(
            df._jdf,
            getattr(ddq_check, "apply$default$2")(),
            getattr(ddq_check, "apply$default$3")(),
            getattr(ddq_check, "apply$default$4")(),
            getattr(ddq_check, "apply$default$5")(),
        )
Example #4
    def test_default_args(self):
        check = Check(self.df)
        self.assertEqual(check.name, "DataFrame[_1: bigint, _2: string]")
        self.assertEqual(check.cacheMethod, None)
        try:
            UUID(check.id, version=4)
        except ValueError:
            self.fail("id is not a correct uuid4")

        self.assertEqual(check.jvmCheck.getClass().toString(),
                         "class de.frosner.ddq.core.Check")
Example #5
    def test_output(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        baos = ByteArrayOutputStream()
        reporter = ConsoleReporter(baos)
        check.run([reporter])
        expected_output = """
\x1b[34mChecking [_1: bigint, _2: string]\x1b[0m
\x1b[34mIt has a total number of 2 columns and 3 rows.\x1b[0m
\x1b[31m- Column _1 is not a key (1 non-unique tuple).\x1b[0m
\x1b[32m- Columns _1, _2 are a key.\x1b[0m
""".strip()
        self.assertEqual(baos.get_output(), expected_output)
Example #6
    def test_satisfies(self):
        df = self.spark.createDataFrame([(1, "a"), (2, "a"), (3, "a")])
        check = Check(df).satisfies("_1 > 0").satisfies(df._2 == 'a')
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Constraint _1 > 0 is satisfied.
- *SUCCESS*: Constraint (_2 = a) is satisfied.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #7
    def test_hasUniqueKey(self):
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 is not a key (1 non-unique tuple).
- *SUCCESS*: Columns _1, _2 are a key.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #8
    def test_output(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        baos = ByteArrayOutputStream()
        reporter = MarkdownReporter(baos)
        check.run([reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 is not a key (1 non-unique tuple).
- *SUCCESS*: Columns _1, _2 are a key.
""".strip()
        self.assertEqual(baos.get_output(), expected_output)
Example #9
    def test_hasNumRowsLessThan(self):
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).hasNumRowsLessThan(2).hasNumRowsLessThan(10)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: The actual number of rows 3 does not satisfy (count < 2).
- *SUCCESS*: The number of rows satisfies (count < 10).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #10
    def test_isNeverNull(self):
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).isNeverNull("_1").isNeverNull("_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 is never null.
- *FAILURE*: Column _2 contains 1 row that is null (should never be null).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #11
    def test_hasFunctionalDependency(self):
        df = self.spark.createDataFrame([(1, 2, 1, 1), (9, 9, 9, 2),
                                         (9, 9, 9, 3)])
        check = Check(df).hasFunctionalDependency(["_1", "_2"], ["_3"])
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint ... 2 more fields]**

It has a total number of 4 columns and 3 rows.

- *SUCCESS*: Column _3 is functionally dependent on _1, _2.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #12
    def test_isAnyOf(self):
        df = self.spark.createDataFrame([(1, "a"), (2, "b"), (3, "c")])
        check = Check(df).isAnyOf("_1", [1, 2]).isAnyOf("_2", ["a", "b", "c"])
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 contains 1 row that is not in Set(1, 2).
- *SUCCESS*: Column _2 contains only values in Set(a, b, c).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #13
    def test_passed_arguments(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        smtpServer = "*****@*****.**"
        to = {"*****@*****.**"}
        cc = {"*****@*****.**"}
        subjectPrefix = "my subject prefix: "
        smtpPort = 9000
        from_ = "test.ddq.io"
        usernameAndPassword = ("username", "password")
        reportOnlyOnFailure = True
        accumulatedReport = True
        reporter = EmailReporter(smtpServer, to, cc, subjectPrefix, smtpPort,
                                 from_, usernameAndPassword,
                                 reportOnlyOnFailure, accumulatedReport)
        check.run([reporter])
Example #14
    def test_isFormattedAsDate(self):
        df = self.spark.createDataFrame([("2000-11-23 11:50:10", ),
                                         ("2000-5-23 11:50:10", ),
                                         ("2000-02-23 11:11:11", )])
        check = Check(df).isFormattedAsDate("_1", "yyyy-MM-dd HH:mm:ss")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: string]**

It has a total number of 1 columns and 3 rows.

- *SUCCESS*: Column _1 is formatted by yyyy-MM-dd HH:mm:ss.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #15
    def test_isJoinableWith(self):
        base = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        ref = self.spark.createDataFrame([(1, 2, 100), (1, 3, 100)])
        columnTuple1 = ("_1", "_1")
        columnTuple2 = ("_2", "_2")
        check = Check(base).isJoinableWith(ref, columnTuple1, columnTuple2)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: Key _1->_1, _2->_2 can be used for joining. Join columns cardinality in base table: 2. Join columns cardinality after joining: 2 (100.00%).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #16
    def test_hasForeignKey(self):
        base = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        ref = self.spark.createDataFrame([(1, 2, 100), (1, 3, 100)])
        columnTuple1 = ("_1", "_1")
        columnTuple2 = ("_2", "_2")
        check = Check(base).hasForeignKey(ref, columnTuple1, columnTuple2)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: Columns _1->_1, _2->_2 define a foreign key pointing to the reference table [_1: bigint, _2: bigint ... 1 more field].
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #17
    def test_isConvertibleTo(self):
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df)\
                .isConvertibleTo("_1", t.IntegerType())\
                .isConvertibleTo("_1", t.ArrayType(t.IntegerType()))
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 can be converted from LongType to IntegerType.
- *ERROR*: Checking whether column _1 can be converted to ArrayType(IntegerType,true) failed: org.apache.spark.sql.AnalysisException: cannot resolve '`_1`' due to data type mismatch: cannot cast LongType to ArrayType(IntegerType,true);;
'Project [_1#477L, cast(_1#477L as array<int>) AS _1_casted#516]\n+- LogicalRDD [_1#477L, _2#478]
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #18
    def test_isEqualTo(self):
        df1 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        df2 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        df3 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (9, 9, 9),
                                          (10, 10, 10)])

        expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: It is equal to [_1: bigint, _2: bigint ... 1 more field].
- *FAILURE*: It is not equal (1 distinct count row is present in the checked dataframe but not in the other and 2 distinct count rows are present in the other dataframe but not in the checked one) to [_1: bigint, _2: bigint ... 1 more field].
""".strip()
        check = Check(df1).isEqualTo(df2).isEqualTo(df3)
        check.run([self.reporter])
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #19
    def test_isAlwaysNull(self):
        schema = t.StructType([
            t.StructField("_1", t.IntegerType()),
            t.StructField("_2", t.StringType()),
        ])
        df = self.spark.createDataFrame([(1, None), (1, None), (3, None)],
                                        schema)
        check = Check(df).isAlwaysNull("_1").isAlwaysNull("_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: int, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 contains 3 non-null rows (should always be null).
- *SUCCESS*: Column _2 is always null.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #20
    def test_passed_args(self):
        df = Mock()
        display_name = Mock()
        cache_method = Mock()
        id = Mock()

        df._sc._jvm.scala.Some.apply = Mock(
            side_effect=["Some(displayName)", "Some(cacheMethod)"]
        )
        check = Check(df, display_name, cache_method, id)
        ddq_check = check._jvm.de.frosner.ddq.core.Check

        ddq_check.assert_called_with(
            df._jdf,
            "Some(displayName)",
            "Some(cacheMethod)",
            getattr(ddq_check, "apply$default$4")(),
            id
        )
Example #21
    def test_isMatchingRegex(self):
        df = self.spark.createDataFrame([("Hello A", "world"),
                                         ("Hello B", None),
                                         ("Hello C", "World")])
        check = Check(df)\
                .isMatchingRegex("_1", "^Hello")\
                .isMatchingRegex("_2", "world$")

        check.run([self.reporter])
        expected_output = """
**Checking [_1: string, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 matches ^Hello
- *FAILURE*: Column _2 contains 1 row that does not match world$
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #22
    def test_output(self):
        with patch("pyddq.reporters.get_field") as get_field:
            baos = ByteArrayOutputStream()
            baos.jvm = self.df._sc._jvm

            get_field.return_value = baos.jvm_obj
            check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
            z = Mock()
            reporter = ZeppelinReporter(z)
            check.run([reporter])
            expected_output = """
%html
</p>
<h4>Checking [_1: bigint, _2: string]</h4>
<h5>It has a total number of 2 columns and 3 rows.</h5>
<table>
<tr><td style="padding:3px">&#10060;</td><td style="padding:3px">Column _1 is not a key (1 non-unique tuple).</td></tr>
<tr><td style="padding:3px">&#9989;</td><td style="padding:3px">Columns _1, _2 are a key.</td></tr>
</table>
<p hidden>
""".strip()
            self.assertEqual(baos.get_output(), expected_output)
Example #23
    def test_isConvertibleTo(self):
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df)\
                .isConvertibleTo("_1", t.IntegerType())\
                .isConvertibleTo("_1", t.ArrayType(t.IntegerType()))
        check.run([self.reporter])

        # instance ids are in the output
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 can be converted from LongType to IntegerType.
- *ERROR*: Checking whether column _1 can be converted to ArrayType(IntegerType,true) failed: org.apache.spark.sql.AnalysisException: cannot resolve '`_1`' due to data type mismatch: cannot cast LongType to ArrayType(IntegerType,true);;
'Project [
+- LogicalRDD [
""".strip()
        for actual, expected in zip(
                self.reporter.output_stream.get_output().split("\n"),
                expected_output.split("\n")):
            self.assertTrue(actual.startswith(expected))
Example #24
    def setUp(self):
        self.check = Check(Mock())
        self.jvmCheck = self.check.jvmCheck
Example #25
    def test_default_arguments(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        reporter = EmailReporter("*****@*****.**", {"*****@*****.**"})
        check.run([reporter])
Example #26
rawZoneInputPath = rawZoneAdlsPath + \
    'WorldWideImporters/orders/2020/04/07/*.parquet'
explorationZoneOutputPath = explorationZoneAdlsPath + \
    'deta-lake/WorldWideImporters/orders'

# Configure the Spark session that connects Python to Spark, with Hive support,
# the Delta Lake SQL extension, and the Azure log store used by Delta
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .enableHiveSupport() \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.AzureLogStore") \
    .getOrCreate()

# Read the Delta table back from the exploration zone and inspect its schema
test = spark.read.format("delta").load(explorationZoneOutputPath)
test.dtypes

# Run basic data quality checks on the orders table
Check(test) \
    .hasNumRowsGreaterThan(0) \
    .hasUniqueKey("OrderID") \
    .isNeverNull("CustomerID") \
    .run()
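
# Possible variation (a sketch): capture the same checks as a Markdown report
# instead of console output, reusing the MarkdownReporter and
# ByteArrayOutputStream classes shown in the test snippets above. The import
# paths pyddq.reporters and pyddq.streams are assumptions.
from pyddq.reporters import MarkdownReporter
from pyddq.streams import ByteArrayOutputStream

baos = ByteArrayOutputStream()
Check(test) \
    .hasNumRowsGreaterThan(0) \
    .hasUniqueKey("OrderID") \
    .isNeverNull("CustomerID") \
    .run([MarkdownReporter(baos)])
print(baos.get_output())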

Example #27
    def setUp(self):
        df = get_df()
        self.check = Check(df)
        self.jvmCheck = self.check.jvmCheck