# Imports assumed from pydeequ's public modules (paths per the upstream test suite).
import numpy as np
from pandas import DataFrame as pandasDF
from pyspark.sql import Row, SparkSession

from pydeequ import PyDeequSession
from pydeequ.analyzers import DataTypeInstances, KLLParameters
from pydeequ.profiles import ColumnProfilerRunBuilder, ColumnProfilerRunner
from pydeequ.suggestions import ConstraintSuggestionRunner


@classmethod
def setUpClass(cls):
    deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3"  # TODO: get Maven Coord from Configs
    # This package is excluded because it causes an error in the SparkSession config
    f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all"
    cls.spark = (
        SparkSession.builder.master("local[*]")
        .config("spark.executor.memory", "2g")
        .config("spark.jars.packages", deequ_maven_coord)
        .config("spark.pyspark.python", "/usr/bin/python3")
        .config("spark.pyspark.driver.python", "/usr/bin/python3")
        .config("spark.jars.excludes", f2j_maven_coord)
        .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
        .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
        .config("spark.sql.autoBroadcastJoinThreshold", "-1")
        .appName("test-analyzers-local")
        .getOrCreate()
    )
    cls.pydeequ_session = PyDeequSession(cls.spark)
    cls.AnalysisRunner = cls.pydeequ_session.createAnalysisRunner()
    cls.ColumnProfilerRunner = ColumnProfilerRunner(cls.spark)
    cls.ConstraintSuggestionRunner = ConstraintSuggestionRunner(cls.spark)
    cls.sc = cls.spark.sparkContext
    data = [("foo", 1, True, 1.0, float("nan")), ("bar", 2, False, 2.0, float("nan"))]
    cls.pyspark_df = cls.spark.createDataFrame(data, schema=["strings", "ints", "bools", "floats", "nans"])
    cls.pandas_df = pandasDF(
        {
            "strings": ["foo", "bar"],
            "ints": [1, 2],
            "bools": [True, False],
            "floats": [1.0, 2.0],
            "nans": [np.nan, np.nan],
        }
    )
    # The profiling tests below reference self.df with columns a/b/c, which this
    # snippet never defined. Reconstructed here (an assumption) to satisfy their
    # assertions: column b must hold the values 1, 2, 3.
    cls.df = cls.sc.parallelize(
        [Row(a="foo", b=1, c=5), Row(a="bar", b=2, c=6), Row(a="baz", b=3, c=None)]
    ).toDF()
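
# A matching teardown (a sketch, not in the original) so the local SparkSession
# does not leak across test classes; spark.stop() is standard PySpark API.
@classmethod
def tearDownClass(cls):
    cls.spark.stop()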
def test_profile_run(self):
    result = ColumnProfilerRunner(self.spark).onData(self.df).run()
    for col, profile in result.profiles.items():
        print(profile)
        print(f"col: {col} -> profile: {profile}")
    print("Results: ", result)
    print(result.profiles["a"].column, result.profiles["a"].completeness)
def test_setPredefinedTypes(self):
    result = (
        ColumnProfilerRunner(self.spark)
        .onData(self.df)
        .setPredefinedTypes(
            {
                "a": DataTypeInstances.Unknown,
                "b": DataTypeInstances.String,
                "c": DataTypeInstances.Fractional,
            }
        )
        .run()
    )
    print(result)
    for col, profile in result.profiles.items():
        print("Profiles:", profile)
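    # Hedged follow-up (sketch): pydeequ column profiles expose a dataType field,
    # so the override above should surface for column 'b'. The exact attribute
    # name and value format are assumptions, hence the assertion stays commented.
    # self.assertEqual(str(result.profiles["b"].dataType), "String")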
def test_kll_and_approxPercentiles(self):
    # KLL profiling with a small sketch: size 2, shrinking factor 0.64, 2 buckets.
    result = (
        ColumnProfilerRunner(self.spark)
        .onData(self.df)
        .withKLLProfiling()
        .setKLLParameters(KLLParameters(self.spark, 2, 0.64, 2))
        .run()
    )
    for col, profile in result.profiles.items():
        print(f"col: {col} -> profile: {profile}")
    self.assertEqual(result.profiles["b"].kll.apply(1).lowValue, 2.0)
    self.assertEqual(result.profiles["b"].kll.apply(1).highValue, 3.0)
    self.assertEqual(result.profiles["b"].kll.apply(1).count, 2)
    self.assertEqual(result.profiles["b"].kll.argmax, 1)
    self.assertIn(1.0, result.profiles["b"].approxPercentiles)
    self.assertIn(2.0, result.profiles["b"].approxPercentiles)
    self.assertIn(3.0, result.profiles["b"].approxPercentiles)
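    # Why these numbers hold (descriptive note derived from the assertions above):
    # with 2 buckets over b in {1, 2, 3}, bucket 1 covers [2.0, 3.0] and holds
    # the two values 2 and 3, so apply(1) reports that range with count 2, and
    # argmax == 1 marks it as the densest bucket.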
def test_spark_session_type_exception(self):
    # Each constructor should reject arguments of the wrong type.
    with self.assertRaises(TypeError):
        ColumnProfilerRunner("foo")
    with self.assertRaises(TypeError):
        ColumnProfilerRunBuilder(5, self.df)
    with self.assertRaises(TypeError):
        ColumnProfilerRunBuilder(self.spark, "fail")
@classmethod
def setUpClass(cls):
    # Newer variant of the setup above: the Spark builder boilerplate lives in a
    # shared setup_pyspark() helper (sketched after this method).
    cls.spark = setup_pyspark().appName("test-analyzers-local").getOrCreate()
    cls.pydeequ_session = PyDeequSession(cls.spark)
    cls.AnalysisRunner = cls.pydeequ_session.createAnalysisRunner()
    cls.ColumnProfilerRunner = ColumnProfilerRunner(cls.spark)
    cls.ConstraintSuggestionRunner = ConstraintSuggestionRunner(cls.spark)
    cls.sc = cls.spark.sparkContext
    data = [("foo", 1, True, 1.0, float("nan")), ("bar", 2, False, 2.0, float("nan"))]
    cls.pyspark_df = cls.spark.createDataFrame(data, schema=["strings", "ints", "bools", "floats", "nans"])
    cls.pandas_df = pandasDF(
        {
            "strings": ["foo", "bar"],
            "ints": [1, 2],
            "bools": [True, False],
            "floats": [1.0, 2.0],
            "nans": [np.nan, np.nan],
        }
    )
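
# Sketch of the setup_pyspark() helper referenced above. It is assumed to return
# a preconfigured SparkSession.Builder (the caller chains .appName(...).getOrCreate());
# the coordinates mirror the explicit builder in the first setUpClass, not a
# confirmed pydeequ API.
def setup_pyspark():
    return (
        SparkSession.builder.master("local[*]")
        .config("spark.executor.memory", "2g")
        .config("spark.jars.packages", "com.amazon.deequ:deequ:1.0.3")
        .config("spark.jars.excludes", "net.sourceforge.f2j:arpack_combined_all")
        .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    )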
def createColumnProfileRunner(self):
    return ColumnProfilerRunner(self._spark_session)
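# Example usage (sketch): the factory above lets a PyDeequSession hand out
# runners bound to a single SparkSession; `spark` and `df` are assumed to exist.
#   session = PyDeequSession(spark)
#   profiles = session.createColumnProfileRunner().onData(df).run().profiles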