    def test_accumulated_report(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        reporter = EmailReporter("*****@*****.**", {"*****@*****.**"},
                                 accumulatedReport=True)
        check.run([reporter])
        reporter.sendAccumulatedReport()
        reporter.sendAccumulatedReport("111")
    def test_output(self):
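        # ConsoleReporter writes the check results as ANSI-coloured lines
        # (blue header, red for failures, green for successes) to the given
        # byte stream, as the expected output below shows.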
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        baos = ByteArrayOutputStream()
        reporter = ConsoleReporter(baos)
        check.run([reporter])
        expected_output = """
\x1b[34mChecking [_1: bigint, _2: string]\x1b[0m
\x1b[34mIt has a total number of 2 columns and 3 rows.\x1b[0m
\x1b[31m- Column _1 is not a key (1 non-unique tuple).\x1b[0m
\x1b[32m- Columns _1, _2 are a key.\x1b[0m
""".strip()
        self.assertEqual(baos.get_output(), expected_output)
    def test_hasUniqueKey(self):
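        # hasUniqueKey checks whether the given column(s) form a unique key:
        # "_1" alone has a duplicate value (1), so it fails; the pair
        # ("_1", "_2") is unique across all three rows, so it succeeds.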
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 is not a key (1 non-unique tuple).
- *SUCCESS*: Columns _1, _2 are a key.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_satisfies(self):
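        # satisfies accepts either a SQL expression string ("_1 > 0") or a
        # Column expression (df._2 == 'a'); both constraints hold for every row.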
        df = self.spark.createDataFrame([(1, "a"), (2, "a"), (3, "a")])
        check = Check(df).satisfies("_1 > 0").satisfies(df._2 == 'a')
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Constraint _1 > 0 is satisfied.
- *SUCCESS*: Constraint (_2 = a) is satisfied.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_hasNumRowsLessThan(self):
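        # hasNumRowsLessThan checks the row count against an upper bound:
        # 3 rows violates (count < 2) but satisfies (count < 10).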
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).hasNumRowsLessThan(2).hasNumRowsLessThan(10)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: The actual number of rows 3 does not satisfy (count < 2).
- *SUCCESS*: The number of rows satisfies (count < 10).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_hasFunctionalDependency(self):
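        # hasFunctionalDependency checks that the determinant columns
        # ("_1", "_2") uniquely determine the dependent column ("_3").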
        df = self.spark.createDataFrame([(1, 2, 1, 1), (9, 9, 9, 2),
                                         (9, 9, 9, 3)])
        check = Check(df).hasFunctionalDependency(["_1", "_2"], ["_3"])
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint ... 2 more fields]**

It has a total number of 4 columns and 3 rows.

- *SUCCESS*: Column _3 is functionally dependent on _1, _2.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_isNeverNull(self):
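        # isNeverNull fails as soon as the column contains a null value:
        # "_1" has no nulls, while "_2" has one null row.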
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).isNeverNull("_1").isNeverNull("_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 is never null.
- *FAILURE*: Column _2 contains 1 row that is null (should never be null).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_output(self):
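        # MarkdownReporter renders the same results as Markdown: a bold
        # header line plus one bullet per constraint, prefixed with
        # *SUCCESS* or *FAILURE*.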
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        baos = ByteArrayOutputStream()
        reporter = MarkdownReporter(baos)
        check.run([reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 is not a key (1 non-unique tuple).
- *SUCCESS*: Columns _1, _2 are a key.
""".strip()
        self.assertEqual(baos.get_output(), expected_output)
    def test_isAnyOf(self):
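        # isAnyOf checks that every value of the column lies in the given
        # set: "_1" has one value (3) outside {1, 2}, while all values of
        # "_2" are in {a, b, c}.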
        df = self.spark.createDataFrame([(1, "a"), (2, "b"), (3, "c")])
        check = Check(df).isAnyOf("_1", [1, 2]).isAnyOf("_2", ["a", "b", "c"])
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 contains 1 row that is not in Set(1, 2).
- *SUCCESS*: Column _2 contains only values in Set(a, b, c).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_passed_arguments(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        smtpServer = "*****@*****.**"
        to = {"*****@*****.**"}
        cc = {"*****@*****.**"}
        subjectPrefix = "my subject prefix: "
        smtpPort = 9000
        from_ = "test.ddq.io"
        usernameAndPassword = ("username", "password")
        reportOnlyOnFailure = True
        accumulatedReport = True
        reporter = EmailReporter(smtpServer, to, cc, subjectPrefix, smtpPort,
                                 from_, usernameAndPassword,
                                 reportOnlyOnFailure, accumulatedReport)
        check.run([reporter])
    def test_isFormattedAsDate(self):
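        # isFormattedAsDate checks that every value can be parsed with the
        # given date format; the single-digit month in the second row is
        # still accepted, so the constraint succeeds.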
        df = self.spark.createDataFrame([("2000-11-23 11:50:10", ),
                                         ("2000-5-23 11:50:10", ),
                                         ("2000-02-23 11:11:11", )])
        check = Check(df).isFormattedAsDate("_1", "yyyy-MM-dd HH:mm:ss")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: string]**

It has a total number of 1 columns and 3 rows.

- *SUCCESS*: Column _1 is formatted by yyyy-MM-dd HH:mm:ss.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_hasUniqueKey(self):
        df = self.sqlContext.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 is not a key (1 non-unique tuple).
- *SUCCESS*: Columns _1, _2 are a key.
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_isAnyOf(self):
        df = self.sqlContext.createDataFrame([(1, "a"), (2, "b"), (3, "c")])
        check = Check(df).isAnyOf("_1", [1, 2]).isAnyOf("_2", ["a", "b", "c"])
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 contains 1 row that is not in Set(1, 2).
- *SUCCESS*: Column _2 contains only values in Set(a, b, c).
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_isJoinableWith(self):
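        # isJoinableWith checks that the given column mapping can be used to
        # join against the reference DataFrame, reporting the join-key
        # cardinality before and after the join.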
        base = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        ref = self.spark.createDataFrame([(1, 2, 100), (1, 3, 100)])
        columnTuple1 = ("_1", "_1")
        columnTuple2 = ("_2", "_2")
        check = Check(base).isJoinableWith(ref, columnTuple1, columnTuple2)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: Key _1->_1, _2->_2 can be used for joining. Join columns cardinality in base table: 2. Join columns cardinality after joining: 2 (100.00%).
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_hasForeignKey(self):
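        # hasForeignKey checks that the mapped columns ("_1"->"_1",
        # "_2"->"_2") point to existing keys in the reference DataFrame.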
        base = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        ref = self.spark.createDataFrame([(1, 2, 100), (1, 3, 100)])
        columnTuple1 = ("_1", "_1")
        columnTuple2 = ("_2", "_2")
        check = Check(base).hasForeignKey(ref, columnTuple1, columnTuple2)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: Columns _1->_1, _2->_2 define a foreign key pointing to the reference table [_1: bigint, _2: bigint ... 1 more field].
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_isNeverNull(self):
        df = self.sqlContext.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df).isNeverNull("_1").isNeverNull("_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 is never null.
- *FAILURE*: Column _2 contains 1 row that is null (should never be null).
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_isConvertibleTo(self):
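        # isConvertibleTo checks whether a column can be cast to the given
        # type: bigint -> IntegerType succeeds, bigint -> ArrayType fails
        # with an AnalysisException. Note that the expected error message
        # below contains plan instance ids (e.g. _1#477L) that may differ
        # between Spark runs.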
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df)\
                .isConvertibleTo("_1", t.IntegerType())\
                .isConvertibleTo("_1", t.ArrayType(t.IntegerType()))
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 can be converted from LongType to IntegerType.
- *ERROR*: Checking whether column _1 can be converted to ArrayType(IntegerType,true) failed: org.apache.spark.sql.AnalysisException: cannot resolve '`_1`' due to data type mismatch: cannot cast LongType to ArrayType(IntegerType,true);;
'Project [_1#477L, cast(_1#477L as array<int>) AS _1_casted#516]\n+- LogicalRDD [_1#477L, _2#478]
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_isEqualTo(self):
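        # isEqualTo compares the checked DataFrame against another one based
        # on distinct row counts: df1 equals df2 but differs from df3.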
        df1 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        df2 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
        df3 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (9, 9, 9),
                                          (10, 10, 10)])

        expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: It is equal to [_1: bigint, _2: bigint ... 1 more field].
- *FAILURE*: It is not equal (1 distinct count row is present in the checked dataframe but not in the other and 2 distinct count rows are present in the other dataframe but not in the checked one) to [_1: bigint, _2: bigint ... 1 more field].
""".strip()
        check = Check(df1).isEqualTo(df2).isEqualTo(df3)
        check.run([self.reporter])
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_isConvertibleTo(self):
        df = self.sqlContext.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df)\
                .isConvertibleTo("_1", t.IntegerType())\
                .isConvertibleTo("_1", t.ArrayType(t.IntegerType()))
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 can be converted from LongType to IntegerType.
- *ERROR*: Checking whether column _1 can be converted to ArrayType(IntegerType,true) failed: org.apache.spark.sql.AnalysisException: cannot resolve 'cast(_1 as array<int>)' due to data type mismatch: cannot cast LongType to ArrayType(IntegerType,true);
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_satisfies(self):
        df = self.sqlContext.createDataFrame([
            (1, "a"), (2, "a"), (3, "a")
        ])
        check = Check(df).satisfies("_1 > 0").satisfies("_2 = 'a'")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Constraint _1 > 0 is satisfied.
- *SUCCESS*: Constraint _2 = 'a' is satisfied.
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_hasFunctionalDependency(self):
        df = self.sqlContext.createDataFrame([
            (1, 2, 1, 1),
            (9, 9, 9, 2),
            (9, 9, 9, 3)
        ])
        check = Check(df).hasFunctionalDependency(["_1", "_2"], ["_3"])
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint, _3: bigint, _4: bigint]**

It has a total number of 4 columns and 3 rows.

- *SUCCESS*: Column _3 is functionally dependent on _1, _2.
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_isFormattedAsDate(self):
        df = self.sqlContext.createDataFrame([
            ("2000-11-23 11:50:10", ),
            ("2000-5-23 11:50:10", ),
            ("2000-02-23 11:11:11", )
        ])
        check = Check(df).isFormattedAsDate("_1", "yyyy-MM-dd HH:mm:ss")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: string]**

It has a total number of 1 columns and 3 rows.

- *SUCCESS*: Column _1 is formatted by yyyy-MM-dd HH:mm:ss.
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_isAlwaysNull(self):
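        # isAlwaysNull requires every value of the column to be null; an
        # explicit schema is supplied here since "_2" contains only None
        # values.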
        schema = t.StructType([
            t.StructField("_1", t.IntegerType()),
            t.StructField("_2", t.StringType()),
        ])
        df = self.spark.createDataFrame([(1, None), (1, None), (3, None)],
                                        schema)
        check = Check(df).isAlwaysNull("_1").isAlwaysNull("_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: int, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 contains 3 non-null rows (should always be null).
- *SUCCESS*: Column _2 is always null.
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_isMatchingRegex(self):
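        # isMatchingRegex checks column values against a regular expression:
        # all "_1" values start with "Hello", while one "_2" row fails to
        # match "world$" (matching is case-sensitive).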
        df = self.spark.createDataFrame([("Hello A", "world"),
                                         ("Hello B", None),
                                         ("Hello C", "World")])
        check = Check(df)\
                .isMatchingRegex("_1", "^Hello")\
                .isMatchingRegex("_2", "world$")

        check.run([self.reporter])
        expected_output = """
**Checking [_1: string, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 matches ^Hello
- *FAILURE*: Column _2 contains 1 row that does not match world$
""".strip()
        self.assertEqual(self.reporter.output_stream.get_output(),
                         expected_output)
    def test_isConvertibleTo(self):
        df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
        check = Check(df)\
                .isConvertibleTo("_1", t.IntegerType())\
                .isConvertibleTo("_1", t.ArrayType(t.IntegerType()))
        check.run([self.reporter])

        # instance ids are in the output
        expected_output = """
**Checking [_1: bigint, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 can be converted from LongType to IntegerType.
- *ERROR*: Checking whether column _1 can be converted to ArrayType(IntegerType,true) failed: org.apache.spark.sql.AnalysisException: cannot resolve '`_1`' due to data type mismatch: cannot cast LongType to ArrayType(IntegerType,true);;
'Project [
+- LogicalRDD [
""".strip()
        for actual, expected in zip(
                self.reporter.output_stream.get_output().split("\n"),
                expected_output.split("\n")):
            self.assertTrue(actual.startswith(expected))
    def test_output(self):
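        # ZeppelinReporter emits the report as a %html paragraph; get_field
        # is patched so the output is captured in a local
        # ByteArrayOutputStream rather than a real Zeppelin context
        # (the z argument is mocked).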
        with patch("pyddq.reporters.get_field") as get_field:
            baos = ByteArrayOutputStream()
            baos.jvm = self.df._sc._jvm

            get_field.return_value = baos.jvm_obj
            check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
            z = Mock()
            reporter = ZeppelinReporter(z)
            check.run([reporter])
            expected_output = """
%html
</p>
<h4>Checking [_1: bigint, _2: string]</h4>
<h5>It has a total number of 2 columns and 3 rows.</h5>
<table>
<tr><td style="padding:3px">&#10060;</td><td style="padding:3px">Column _1 is not a key (1 non-unique tuple).</td></tr>
<tr><td style="padding:3px">&#9989;</td><td style="padding:3px">Columns _1, _2 are a key.</td></tr>
</table>
<p hidden>
""".strip()
            self.assertEqual(baos.get_output(), expected_output)
    def test_hasForeignKey(self):
        base = self.sqlContext.createDataFrame([
            (1, 2, 3), (1, 2, 5), (1, 3, 3)
        ])
        ref = self.sqlContext.createDataFrame([
            (1, 2, 100), (1, 3, 100)
        ])
        columnTuple1 = ("_1", "_1")
        columnTuple2 = ("_2", "_2")
        check = Check(base).hasForeignKey(ref, columnTuple1, columnTuple2)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint, _3: bigint]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: Columns _1->_1, _2->_2 define a foreign key pointing to the reference table [_1: bigint, _2: bigint, _3: bigint].
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_isJoinableWith(self):
        base = self.sqlContext.createDataFrame([
            (1, 2, 3), (1, 2, 5), (1, 3, 3)
        ])
        ref = self.sqlContext.createDataFrame([
            (1, 2, 100), (1, 3, 100)
        ])
        columnTuple1 = ("_1", "_1")
        columnTuple2 = ("_2", "_2")
        check = Check(base).isJoinableWith(ref, columnTuple1, columnTuple2)
        check.run([self.reporter])
        expected_output = """
**Checking [_1: bigint, _2: bigint, _3: bigint]**

It has a total number of 3 columns and 3 rows.

- *SUCCESS*: Key _1->_1, _2->_2 can be used for joining. Join columns cardinality in base table: 2. Join columns cardinality after joining: 2 (100.00%).
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_isMatchingRegex(self):
        df = self.sqlContext.createDataFrame([
            ("Hello A", "world"),
            ("Hello B", None),
            ("Hello C", "World")
        ])
        check = Check(df)\
                .isMatchingRegex("_1", "^Hello")\
                .isMatchingRegex("_2", "world$")

        check.run([self.reporter])
        expected_output = """
**Checking [_1: string, _2: string]**

It has a total number of 2 columns and 3 rows.

- *SUCCESS*: Column _1 matches ^Hello
- *FAILURE*: Column _2 contains 1 row that does not match world$
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_isAlwaysNull(self):
        schema = t.StructType([
            t.StructField("_1", t.IntegerType()),
            t.StructField("_2", t.StringType()),
        ])
        df = self.sqlContext.createDataFrame(
            [(1, None), (1, None), (3, None)],
            schema
        )
        check = Check(df).isAlwaysNull("_1").isAlwaysNull("_2")
        check.run([self.reporter])
        expected_output = """
**Checking [_1: int, _2: string]**

It has a total number of 2 columns and 3 rows.

- *FAILURE*: Column _1 contains 3 non-null rows (should always be null).
- *SUCCESS*: Column _2 is always null.
""".strip()
        self.assertEqual(
            self.reporter.output_stream.get_output(),
            expected_output
        )
    def test_default_arguments(self):
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        reporter = EmailReporter("*****@*****.**", {"*****@*****.**"})
        check.run([reporter])