    def test_accumulated_report(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        reporter = EmailReporter("*****@*****.**", {"*****@*****.**"},
                                 accumulatedReport=True)
        check.run([reporter])
        reporter.sendAccumulatedReport()
        reporter.sendAccumulatedReport("111")
    def test_output(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        baos = ByteArrayOutputStream()
        reporter = ConsoleReporter(baos)
        check.run([reporter])
        expected_output = """
\x1b[34mChecking [_1: bigint, _2: string]\x1b[0m
\x1b[34mIt has a total number of 2 columns and 3 rows.\x1b[0m
\x1b[31m- Column _1 is not a key (1 non-unique tuple).\x1b[0m
\x1b[32m- Columns _1, _2 are a key.\x1b[0m
""".strip()
        self.assertEqual(baos.get_output(), expected_output)
    def test_output(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        baos = ByteArrayOutputStream()
        reporter = MarkdownReporter(baos)
        check.run([reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 is not a key (1 non-unique tuple).
- *SUCCESS*: Columns _1, _2 are a key.
""".strip()
        self.assertEqual(baos.get_output(), expected_output)
Example #5
    def test_satisfies(self):
        df = self.spark.createDataFrame([(1, "a"), (2, "a"), (3, "a")])
        check = Check(df).satisfies("_1 > 0").satisfies(df._2 == 'a')
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Constraint _1 > 0 is satisfied.
- *SUCCESS*: Constraint (_2 = a) is satisfied.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #6
    def test_hasUniqueKey(self):
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 is not a key (1 non-unique tuple).
- *SUCCESS*: Columns _1, _2 are a key.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #7
    def test_isAnyOf(self):
        df = self.spark.createDataFrame([(1, "a"), (2, "b"), (3, "c")])
        check = Check(df).isAnyOf("_1", [1, 2]).isAnyOf("_2", ["a", "b", "c"])
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 contains 1 row that is not in Set(1, 2).
- *SUCCESS*: Column _2 contains only values in Set(a, b, c).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #8
    def test_isNeverNull(self):
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).isNeverNull("_1").isNeverNull("_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 is never null.
- *FAILURE*: Column _2 contains 1 row that is null (should never be null).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_hasNumRowsLessThan(self):
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).hasNumRowsLessThan(2).hasNumRowsLessThan(10)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: The actual number of rows 3 does not satisfy (count < 2).
- *SUCCESS*: The number of rows satisfies (count < 10).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #11
    def test_hasFunctionalDependency(self):
        df = self.spark.createDataFrame([(1, 2, 1, 1), (9, 9, 9, 2),
                                         (9, 9, 9, 3)])
        check = Check(df).hasFunctionalDependency(["_1", "_2"], ["_3"])
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint ... 2 more fields]**

It has a total number of 4 columns and 3 rows.

- *SUCCESS*: Column _3 is functionally dependent on _1, _2.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_passed_arguments(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        smtpServer = "*****@*****.**"
        to = {"*****@*****.**"}
        cc = {"*****@*****.**"}
        subjectPrefix = "my subject prefix: "
        smtpPort = 9000
        from_ = "test.ddq.io"
        usernameAndPassword = ("username", "password")
        reportOnlyOnFailure = True
        accumulatedReport = True
        reporter = EmailReporter(smtpServer, to, cc, subjectPrefix, smtpPort,
                                 from_, usernameAndPassword,
                                 reportOnlyOnFailure, accumulatedReport)
        check.run([reporter])
Example #13
    def test_isFormattedAsDate(self):
        df = self.spark.createDataFrame([("2000-11-23 11:50:10", ),
                                         ("2000-5-23 11:50:10", ),
                                         ("2000-02-23 11:11:11", )])
        check = Check(df).isFormattedAsDate("_1", "yyyy-MM-dd HH:mm:ss")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: string]**

It has a total number of 1 columns and 3 rows.

- *SUCCESS*: Column _1 is formatted by yyyy-MM-dd HH:mm:ss.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #14
    def test_isJoinableWith(self):
        base = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        ref = self.spark.createDataFrame([(1, 2, 100), (1, 3, 100)])
        columnTuple1 = ("_1", "_1")
        columnTuple2 = ("_2", "_2")
        check = Check(base).isJoinableWith(ref, columnTuple1, columnTuple2)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: Key _1->_1, _2->_2 can be used for joining. Join columns cardinality in base table: 2. Join columns cardinality after joining: 2 (100.00%).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #15
    def test_hasForeignKey(self):
        base = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        ref = self.spark.createDataFrame([(1, 2, 100), (1, 3, 100)])
        columnTuple1 = ("_1", "_1")
        columnTuple2 = ("_2", "_2")
        check = Check(base).hasForeignKey(ref, columnTuple1, columnTuple2)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: Columns _1->_1, _2->_2 define a foreign key pointing to the reference table [_1: bigint, _2: bigint ... 1 more field].
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_isNeverNull(self):
        df = self.sqlContext.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).isNeverNull("_1").isNeverNull("_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 is never null.
- *FAILURE*: Column _2 contains 1 row that is null (should never be null).
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
Example #18
    def test_passed_args(self):
        display_name = "display name"
        id = "id"
        cache_method = StorageLevel.DISK_ONLY
        check = Check(self.df, display_name, cache_method, id)

        # check wrapper
        self.assertEqual(check.name, display_name)
        self.assertEqual(check.id, id)
        self.assertEqual(check.cacheMethod, cache_method)

        # check jvm check
        self.assertEqual(check.jvmCheck.getClass().toString(),
                         "class de.frosner.ddq.core.Check")
        self.assertEqual(check.jvmCheck.name(), check.name)
        self.assertEqual(check.jvmCheck.id(), check.id)
        jvm_cache_method = check.jvmCheck.cacheMethod().get()
        self.assertEqual(jvm_cache_method.useDisk(), check.cacheMethod.useDisk)
        self.assertEqual(jvm_cache_method.useMemory(),
                         check.cacheMethod.useMemory)
        self.assertEqual(jvm_cache_method.useOffHeap(),
                         check.cacheMethod.useOffHeap)
        self.assertEqual(jvm_cache_method.deserialized(),
                         check.cacheMethod.deserialized)
        self.assertEqual(jvm_cache_method.replication(),
                         check.cacheMethod.replication)
    def test_isAnyOf(self):
        df = self.sqlContext.createDataFrame([(1, "a"), (2, "b"), (3, "c")])
        check = Check(df).isAnyOf("_1", [1, 2]).isAnyOf("_2", ["a", "b", "c"])
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 contains 1 row that is not in Set(1, 2).
- *SUCCESS*: Column _2 contains only values in Set(a, b, c).
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_hasUniqueKey(self):
        df = self.sqlContext.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 is not a key (1 non-unique tuple).
- *SUCCESS*: Columns _1, _2 are a key.
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
Example #21
    def test_isConvertibleTo(self):
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df)\
                .isConvertibleTo("_1", t.IntegerType())\
                .isConvertibleTo("_1", t.ArrayType(t.IntegerType()))
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 can be converted from LongType to IntegerType.
- *ERROR*: Checking whether column _1 can be converted to ArrayType(IntegerType,true) failed: org.apache.spark.sql.AnalysisException: cannot resolve '`_1`' due to data type mismatch: cannot cast LongType to ArrayType(IntegerType,true);;
'Project [_1#477L, cast(_1#477L as array<int>) AS _1_casted#516]\n+- LogicalRDD [_1#477L, _2#478]
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_isConvertibleTo(self):
        df = self.sqlContext.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df)\
                .isConvertibleTo("_1", t.IntegerType())\
                .isConvertibleTo("_1", t.ArrayType(t.IntegerType()))
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 can be converted from LongType to IntegerType.
- *ERROR*: Checking whether column _1 can be converted to ArrayType(IntegerType,true) failed: org.apache.spark.sql.AnalysisException: cannot resolve 'cast(_1 as array<int>)' due to data type mismatch: cannot cast LongType to ArrayType(IntegerType,true);
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_satisfies(self):
        df = self.sqlContext.createDataFrame([
            (1, "a"), (2, "a"), (3, "a")
        ])
        check = Check(df).satisfies("_1 > 0").satisfies("_2 = 'a'")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Constraint _1 > 0 is satisfied.
- *SUCCESS*: Constraint _2 = 'a' is satisfied.
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
Example #24
    def test_isEqualTo(self):
        df1 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        df2 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        df3 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (9, 9, 9),
                                          (10, 10, 10)])

        expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: It is equal to [_1: bigint, _2: bigint ... 1 more field].
- *FAILURE*: It is not equal (1 distinct count row is present in the checked dataframe but not in the other and 2 distinct count rows are present in the other dataframe but not in the checked one) to [_1: bigint, _2: bigint ... 1 more field].
""".strip()
        check = Check(df1).isEqualTo(df2).isEqualTo(df3)
        check.run([self.reporter])
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_hasFunctionalDependency(self):
        df = self.sqlContext.createDataFrame([
            (1, 2, 1, 1),
            (9, 9, 9, 2),
            (9, 9, 9, 3)
        ])
        check = Check(df).hasFunctionalDependency(["_1", "_2"], ["_3"])
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint, _3: bigint, _4: bigint]**

It has a total number of 4 columns and 3 rows.

- *SUCCESS*: Column _3 is functionally dependent on _1, _2.
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
Example #26
    def test_isAlwaysNull(self):
        schema = t.StructType([
            t.StructField("_1", t.IntegerType()),
            t.StructField("_2", t.StringType()),
        ])
        df = self.spark.createDataFrame([(1, None), (1, None), (3, None)],
                                        schema)
        check = Check(df).isAlwaysNull("_1").isAlwaysNull("_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: int, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 contains 3 non-null rows (should always be null).
- *SUCCESS*: Column _2 is always null.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
Example #27
    def test_isMatchingRegex(self):
        df = self.spark.createDataFrame([("Hello A", "world"),
                                         ("Hello B", None),
                                         ("Hello C", "World")])
        check = Check(df)\
                .isMatchingRegex("_1", "^Hello")\
                .isMatchingRegex("_2", "world$")

        check.run([self.reporter])
        expected_output = """
**Checking [_1: string, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 matches ^Hello
- *FAILURE*: Column _2 contains 1 row that does not match world$
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_isFormattedAsDate(self):
        df = self.sqlContext.createDataFrame([
            ("2000-11-23 11:50:10", ),
            ("2000-5-23 11:50:10", ),
            ("2000-02-23 11:11:11", )
        ])
        check = Check(df).isFormattedAsDate("_1", "yyyy-MM-dd HH:mm:ss")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: string]**

It has a total number of 1 columns and 3 rows.

- *SUCCESS*: Column _1 is formatted by yyyy-MM-dd HH:mm:ss.
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
Example #29
    def test_default_args(self):
        df = Mock()
        check = Check(df)
        ddq_check = check._jvm.de.frosner.ddq.core.Check
        ddq_check.assert_called_with(
            df._jdf,
            getattr(ddq_check, "apply$default$2")(),
            getattr(ddq_check, "apply$default$3")(),
            getattr(ddq_check, "apply$default$4")(),
            getattr(ddq_check, "apply$default$5")(),
        )
Example #30
    def test_default_args(self):
        check = Check(self.df)
        self.assertEqual(check.name, "DataFrame[_1: bigint, _2: string]")
        self.assertEqual(check.cacheMethod, None)
        try:
            UUID(check.id, version=4)
        except ValueError:
            self.fail("id is not a correct uuid4")

        self.assertEqual(check.jvmCheck.getClass().toString(),
                         "class de.frosner.ddq.core.Check")
    def test_output(self):
        with patch("pyddq.reporters.get_field") as get_field:
            baos = ByteArrayOutputStream()
            baos.jvm = self.df._sc._jvm

            get_field.return_value = baos.jvm_obj
            check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
            z = Mock()
            reporter = ZeppelinReporter(z)
            check.run([reporter])
            expected_output = """
%html
</p>
<h4>Checking [_1: bigint, _2: string]</h4>
<h5>It has a total number of 2 columns and 3 rows.</h5>
<table>
<tr><td style="padding:3px">&#10060;</td><td style="padding:3px">Column _1 is not a key (1 non-unique tuple).</td></tr>
<tr><td style="padding:3px">&#9989;</td><td style="padding:3px">Columns _1, _2 are a key.</td></tr>
</table>
<p hidden>
""".strip()
            self.assertEqual(baos.get_output(), expected_output)
    def test_isJoinableWith(self):
        base = self.sqlContext.createDataFrame([
            (1, 2, 3), (1, 2, 5), (1, 3, 3)
        ])
        ref = self.sqlContext.createDataFrame([
            (1, 2, 100), (1, 3, 100)
        ])
        columnTuple1 = ("_1", "_1")
        columnTuple2 = ("_2", "_2")
        check = Check(base).isJoinableWith(ref, columnTuple1, columnTuple2)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint, _3: bigint]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: Key _1->_1, _2->_2 can be used for joining. Join columns cardinality in base table: 2. Join columns cardinality after joining: 2 (100.00%).
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_isConvertibleTo(self):
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df)\
                .isConvertibleTo("_1", t.IntegerType())\
                .isConvertibleTo("_1", t.ArrayType(t.IntegerType()))
        check.run([self.reporter])

        # the query plan in the error message contains instance-specific
        # expression ids, so only the stable prefix of each line is compared
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 can be converted from LongType to IntegerType.
- *ERROR*: Checking whether column _1 can be converted to ArrayType(IntegerType,true) failed: org.apache.spark.sql.AnalysisException: cannot resolve '`_1`' due to data type mismatch: cannot cast LongType to ArrayType(IntegerType,true);;
'Project [
+- LogicalRDD [
""".strip()
        for actual, expected in zip(
                self.reporter.output_stream.get_output().split("\n"),
                expected_output.split("\n")):
            self.assertTrue(actual.startswith(expected))
    def test_hasForeignKey(self):
        base = self.sqlContext.createDataFrame([
            (1, 2, 3), (1, 2, 5), (1, 3, 3)
        ])
        ref = self.sqlContext.createDataFrame([
            (1, 2, 100), (1, 3, 100)
        ])
        columnTuple1 = ("_1", "_1")
        columnTuple2 = ("_2", "_2")
        check = Check(base).hasForeignKey(ref, columnTuple1, columnTuple2)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint, _3: bigint]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: Columns _1->_1, _2->_2 define a foreign key pointing to the reference table [_1: bigint, _2: bigint, _3: bigint].
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_isAlwaysNull(self):
        schema = t.StructType([
            t.StructField("_1", t.IntegerType()),
            t.StructField("_2", t.StringType()),
        ])
        df = self.sqlContext.createDataFrame(
            [(1, None), (1, None), (3, None)],
            schema
        )
        check = Check(df).isAlwaysNull("_1").isAlwaysNull("_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: int, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 contains 3 non-null rows (should always be null).
- *SUCCESS*: Column _2 is always null.
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_isMatchingRegex(self):
        df = self.sqlContext.createDataFrame([
            ("Hello A", "world"),
            ("Hello B", None),
            ("Hello C", "World")
        ])
        check = Check(df)\
                .isMatchingRegex("_1", "^Hello")\
                .isMatchingRegex("_2", "world$")

        check.run([self.reporter])
        expected_output = """
**Checking [_1: string, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 matches ^Hello
- *FAILURE*: Column _2 contains 1 row that does not match world$
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
Example #38
    def test_passed_args(self):
        df = Mock()
        display_name = Mock()
        cache_method = Mock()
        id = Mock()

        df._sc._jvm.scala.Some.apply = Mock(
            side_effect=["Some(displayName)", "Some(cacheMethod)"]
        )
        check = Check(df, display_name, cache_method, id)
        ddq_check = check._jvm.de.frosner.ddq.core.Check

        ddq_check.assert_called_with(
            df._jdf,
            "Some(displayName)",
            "Some(cacheMethod)",
            getattr(ddq_check, "apply$default$4")(),
            id
        )
    def test_default_arguments(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        reporter = EmailReporter("*****@*****.**", {"*****@*****.**"})
        check.run([reporter])
    def setUp(self):
        self.check = Check(Mock())
        self.jvmCheck = self.check.jvmCheck
Example #42
from pyspark.sql import SparkSession
from pyddq.core import Check

# rawZoneAdlsPath and explorationZoneAdlsPath are assumed to be defined
# earlier (ADLS base paths for the raw and exploration zones).
rawZoneInputPath = rawZoneAdlsPath + \
    'WorldWideImporters/orders/2020/04/07/*.parquet'
explorationZoneOutputPath = explorationZoneAdlsPath + \
    'deta-lake/WorldWideImporters/orders'

# Configure the SparkSession that connects Python to Spark, registering the
# Delta Lake SQL extension and the Azure log store for the Delta transaction log.
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .enableHiveSupport() \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.AzureLogStore") \
    .getOrCreate()

test = spark.read.format("delta").load(explorationZoneOutputPath)

# Inspect the schema of the loaded Delta table.
test.dtypes

# Run the data quality checks; run() without arguments reports to the console.
Check(test) \
    .hasNumRowsGreaterThan(0) \
    .hasUniqueKey("OrderID") \
    .isNeverNull("CustomerID") \
    .run()

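For reference, the same chain of checks can write its report to a stream instead of the console. The sketch below is illustrative only: it reuses the MarkdownReporter and ByteArrayOutputStream classes exercised in the tests above, and assumes pyddq's usual module layout (pyddq.reporters, pyddq.streams).

from pyddq.core import Check
from pyddq.reporters import MarkdownReporter
from pyddq.streams import ByteArrayOutputStream

# Collect the report in an in-memory stream rather than printing it.
baos = ByteArrayOutputStream()
reporter = MarkdownReporter(baos)

Check(test) \
    .hasNumRowsGreaterThan(0) \
    .hasUniqueKey("OrderID") \
    .isNeverNull("CustomerID") \
    .run([reporter])

# Markdown-formatted report, e.g. "- *SUCCESS*: Column CustomerID is never null."
print(baos.get_output())

The EmailReporter used in the examples above can be passed in the same reporter list to send the results by mail instead.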
    def setUp(self):
        df = get_df()
        self.check = Check(df)
        self.jvmCheck = self.check.jvmCheck
class ConstraintTest(unittest.TestCase):
    COLUMN_NAME = "column name"

    def setUp(self):
        df = get_df()
        self.check = Check(df)
        self.jvmCheck = self.check.jvmCheck

    def test_hasUniqueKey(self):
        column_names = ["a", "b"]
        jvm_column_names = Mock()
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                return_value=jvm_column_names
        )
        self.check.hasUniqueKey(self.COLUMN_NAME, column_names)
        self.jvmCheck.hasUniqueKey.assert_called_with(
            self.COLUMN_NAME, jvm_column_names
        )

    def test_hasNumRowsEqualTo(self):
        num_rows = 10
        self.check.hasNumRowsEqualTo(num_rows)
        self.check._jvm.de.frosner.ddq.constraints.NumberOfRowsConstraint.\
            equalTo.assert_called_with(num_rows)

    def test_hasNumRowsGreaterThan(self):
        num_rows = 10
        self.check.hasNumRowsGreaterThan(num_rows)
        self.check._jvm.de.frosner.ddq.constraints.NumberOfRowsConstraint.\
            greaterThan.assert_called_with(num_rows)

    def test_hasNumRowsLessThan(self):
        num_rows = 10
        self.check.hasNumRowsLessThan(num_rows)
        self.check._jvm.de.frosner.ddq.constraints.NumberOfRowsConstraint.\
            lessThan.assert_called_with(num_rows)

    def test_isNeverNull(self):
        self.check.isNeverNull(self.COLUMN_NAME)
        self.jvmCheck.isNeverNull.assert_called_with(self.COLUMN_NAME)

    def test_isAlwaysNull(self):
        self.check.isAlwaysNull(self.COLUMN_NAME)
        self.jvmCheck.isAlwaysNull.assert_called_with(self.COLUMN_NAME)

    def test_isConvertibleTo(self):
        target_type = Mock()
        target_type.json = Mock(return_value="json value")
        jvm_type = Mock()
        self.check._jvm.org.apache.spark.sql.types.DataType.fromJson = Mock(
            return_value=jvm_type
        )

        self.check.isConvertibleTo(self.COLUMN_NAME, target_type)

        target_type.json.assert_called()
        self.check._jvm.org.apache.spark.sql.types.DataType.fromJson.\
            assert_called_with("json value")
        self.jvmCheck.isConvertibleTo.assert_called_with(
            self.COLUMN_NAME,
            jvm_type
        )

    def test_isFormattedAsDate(self):
        date_format = "yyyy-MM-dd HH:mm:ss"
        self.check.isFormattedAsDate(self.COLUMN_NAME, date_format)
        self.jvmCheck.isFormattedAsDate.assert_called_with(self.COLUMN_NAME,
                                                           date_format)

    def test_isAnyOf(self):
        allowed = ("a", "b", "c")
        jvm_allowed = Mock()
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toSet = Mock(
                return_value=jvm_allowed
        )
        self.check.isAnyOf(self.COLUMN_NAME, allowed)
        self.jvmCheck.isAnyOf.assert_called_with(self.COLUMN_NAME, jvm_allowed)

    def test_isMatchingRegex(self):
        regex = "^regex$"
        self.check.isMatchingRegex(self.COLUMN_NAME, regex)
        self.jvmCheck.isMatchingRegex.assert_called_with(self.COLUMN_NAME, regex)

    def test_hasFunctionalDependency(self):
        determinant_set = ["column1", "column2"]
        dependent_set = ["column3", "column4"]

        jvm_determinant_set = Mock()
        jvm_dependent_set = Mock()
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                side_effect=[jvm_determinant_set, jvm_dependent_set]
        )

        self.check.hasFunctionalDependency(determinant_set, dependent_set)
        self.jvmCheck.hasFunctionalDependency.assert_called_with(
            jvm_determinant_set, jvm_dependent_set
        )

    def test_hasForeignKey(self):
        key_map1 = ("_1", "_1")
        key_map2 = ("_1", "_2")

        ref = Mock()
        jvm_key_map1 = Mock()
        jvm_key_map2 = Mock()

        self.check._jvm.scala.Tuple2 = Mock(
            side_effect=[jvm_key_map1, jvm_key_map2]
        )
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                return_value=[jvm_key_map2]
        )
        self.check.hasForeignKey(ref, key_map1, key_map2)
        self.jvmCheck.hasForeignKey.assert_called_with(
            ref._jdf, jvm_key_map1, [jvm_key_map2]
        )

    def test_isJoinableWith(self):
        key_map1 = ("_1", "_1")
        key_map2 = ("_1", "_2")

        ref = Mock()
        jvm_key_map1 = Mock()
        jvm_key_map2 = Mock()

        self.check._jvm.scala.Tuple2 = Mock(
            side_effect=[jvm_key_map1, jvm_key_map2]
        )
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                return_value=[jvm_key_map2]
        )
        self.check.isJoinableWith(ref, key_map1, key_map2)
        self.jvmCheck.isJoinableWith.assert_called_with(
            ref._jdf, jvm_key_map1, [jvm_key_map2]
        )

    def test_satisfies(self):
        constraint = "_1 > 10"
        self.check.satisfies(constraint)
        self.jvmCheck.satisfies.assert_called_with(constraint)

    def test_isEqualTo(self):
        df2 = Mock()
        self.check.isEqualTo(df2)
        self.jvmCheck.isEqualTo.assert_called_with(df2._jdf)