Example #1
    @classmethod
    def setUpClass(cls):
        deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3"  # TODO: get Maven Coord from Configs
        # This package is excluded because it causes an error in the SparkSession config.
        f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all"
        cls.spark = (SparkSession.builder
                     .master('local[*]')
                     .config("spark.executor.memory", "2g")
                     .config("spark.jars.packages", deequ_maven_coord)
                     .config("spark.pyspark.python", "/usr/bin/python3")
                     .config("spark.pyspark.driver.python", "/usr/bin/python3")
                     .config("spark.jars.excludes", f2j_maven_coord)
                     .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
                     .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
                     .config("spark.sql.autoBroadcastJoinThreshold", "-1")
                     .appName('test-analyzers-local')
                     .getOrCreate())
        cls.pydeequ_session = PyDeequSession(cls.spark)
        cls.AnalysisRunner = cls.pydeequ_session.createAnalysisRunner()
        cls.ColumnProfilerRunner = ColumnProfilerRunner(cls.spark)
        cls.ConstraintSuggestionRunner = ConstraintSuggestionRunner(cls.spark)
        cls.sc = cls.spark.sparkContext
        data = [('foo', 1, True, 1.0, float('nan')),
                ('bar', 2, False, 2.0, float('nan'))]
        cls.pyspark_df = cls.spark.createDataFrame(
            data, schema=['strings', 'ints', 'bools', 'floats', 'nans'])
        cls.pandas_df = pandasDF({
            'strings': ['foo', 'bar'],
            'ints': [1, 2],
            'bools': [True, False],
            'floats': [1.0, 2.0],
            'nans': [np.nan, np.nan]
        })
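
Note: the hard-coded Maven coordinates above can usually come straight from the package itself. A minimal sketch of an equivalent session setup, assuming the installed PyDeequ version exports pydeequ.deequ_maven_coord and pydeequ.f2j_maven_coord (the pattern the project README shows):

    from pyspark.sql import SparkSession
    import pydeequ

    spark = (SparkSession.builder
             .config("spark.jars.packages", pydeequ.deequ_maven_coord)
             .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
             .getOrCreate())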
Example #2
    def test_profile_run(self):
        result = ColumnProfilerRunner(self.spark).onData(self.df).run()
        for col, profile in result.profiles.items():
            print(profile)
            print(f"col: {col} -> profile: {profile}")

        print("Results: ", result)
        print(result.profiles["a"].column, result.profiles["a"].completeness)
Example #3
    def test_setPredefinedTypes(self):
        result = ColumnProfilerRunner(self.spark) \
            .onData(self.df) \
            .setPredefinedTypes({
                'a': DataTypeInstances.Unknown,
                'b': DataTypeInstances.String,
                'c': DataTypeInstances.Fractional
            }) \
            .run()
        print(result)
        for col, profile in result.profiles.items():
            print("Profiles:", profile)
Example #4
    def test_profile_run(self):
        result = ColumnProfilerRunner(self.spark) \
            .onData(self.df) \
            .run()
        for col, profile in result.profiles.items():
            print(profile)

        print(result.profiles['a'].column, result.profiles['a'].completeness)
Example #5
    def test_kll_and_approxPercentiles(self):
        result = (ColumnProfilerRunner(self.spark)
                  .onData(self.df)
                  .withKLLProfiling()
                  .setKLLParameters(KLLParameters(self.spark, 2, 0.64, 2))
                  .run())
        for col, profile in result.profiles.items():
            print(f"col: {col} -> profile: {profile}")

        self.assertEqual(result.profiles["b"].kll.apply(1).lowValue, 2.0)
        self.assertEqual(result.profiles["b"].kll.apply(1).highValue, 3.0)
        self.assertEqual(result.profiles["b"].kll.apply(1).count, 2)
        self.assertEqual(result.profiles["b"].kll.argmax, 1)
        self.assertIn(1.0, result.profiles["b"].approxPercentiles)
        self.assertIn(2.0, result.profiles["b"].approxPercentiles)
        self.assertIn(3.0, result.profiles["b"].approxPercentiles)
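
The positional arguments to KLLParameters are, in order, the sketch size, the shrinking factor, and the number of buckets; with three values in 'b' and two buckets, bucket 1 covers [2.0, 3.0] and holds two values, which is exactly what the assertions check. A sketch of the construction with the arguments spelled out (assuming that parameter order):

    from pydeequ.analyzers import KLLParameters

    # sketch_size=2, shrinking_factor=0.64, number_of_buckets=2
    params = KLLParameters(spark, 2, 0.64, 2)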
Example #6
    def test_spark_session_type_exception(self):
        try:
            ColumnProfilerRunner("foo")
            raise Exception("Did not raise TypeError")
        except TypeError:
            pass
        try:
            ColumnProfilerRunBuilder(5, self.df)
            raise Exception("Did not raise TypeError")
        except TypeError:
            pass
        try:
            ColumnProfilerRunBuilder(self.spark, "fail")
            raise Exception("Did not raise TypeError")
        except TypeError:
            pass
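
The try/except/raise pattern above works, but unittest's context manager says the same thing more compactly; a behavior-equivalent sketch:

    def test_spark_session_type_exception(self):
        with self.assertRaises(TypeError):
            ColumnProfilerRunner("foo")
        with self.assertRaises(TypeError):
            ColumnProfilerRunBuilder(5, self.df)
        with self.assertRaises(TypeError):
            ColumnProfilerRunBuilder(self.spark, "fail")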
Example #7
    def test_kll_and_approxPercentiles(self):
        result = ColumnProfilerRunner(self.spark) \
            .onData(self.df) \
            .withKLLProfiling() \
            .setKLLParameters(KLLParameters(self.spark, 2, 0.64, 2)) \
            .run()
        for col, profile in result.profiles.items():
            print(profile)

        self.assertEqual(result.profiles['b'].kll.apply(1).lowValue, 2.0)
        self.assertEqual(result.profiles['b'].kll.apply(1).highValue, 3.0)
        self.assertEqual(result.profiles['b'].kll.apply(1).count, 2)
        self.assertEqual(result.profiles['b'].kll.argmax, 1)
        self.assertIn(1.0, result.profiles['b'].approxPercentiles)
        self.assertIn(2.0, result.profiles['b'].approxPercentiles)
        self.assertIn(3.0, result.profiles['b'].approxPercentiles)
Example #8
    @classmethod
    def setUpClass(cls):
        cls.spark = setup_pyspark().appName("test-analyzers-local").getOrCreate()
        cls.pydeequ_session = PyDeequSession(cls.spark)
        cls.AnalysisRunner = cls.pydeequ_session.createAnalysisRunner()
        cls.ColumnProfilerRunner = ColumnProfilerRunner(cls.spark)
        cls.ConstraintSuggestionRunner = ConstraintSuggestionRunner(cls.spark)
        cls.sc = cls.spark.sparkContext
        data = [("foo", 1, True, 1.0, float("nan")), ("bar", 2, False, 2.0, float("nan"))]
        cls.pyspark_df = cls.spark.createDataFrame(data, schema=["strings", "ints", "bools", "floats", "nans"])
        cls.pandas_df = pandasDF(
            {
                "strings": ["foo", "bar"],
                "ints": [1, 2],
                "bools": [True, False],
                "floats": [1.0, 2.0],
                "nans": [np.nan, np.nan],
            }
        )
Example #9
    def createColumnProfileRunner(self):
        return ColumnProfilerRunner(self._spark_session)
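
Judging by self._spark_session and the createAnalysisRunner call in Example #1, this factory method sits on PyDeequSession, so callers can obtain a profiler without touching the SparkSession directly. A usage sketch under that assumption:

    # Assumes PyDeequSession is importable from the package root.
    from pydeequ import PyDeequSession

    session = PyDeequSession(spark)
    runner = session.createColumnProfileRunner()  # same as ColumnProfilerRunner(spark)
    result = runner.onData(df).run()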