def test_observe(self):
    # SPARK-36263: covers DataFrame.observe(Observation, *Column).
    # The checks run in this order:
    #   1. observing does not change the rows that are collected,
    #   2. both observations expose their metrics through `.get` afterwards,
    #   3. Observation rejects a non-string or an empty name,
    #   4. observe rejects an empty or a non-Column expression list.
    from pyspark.sql import Observation

    columns = ["id", "val", "label"]
    rows = [
        (1, 1.0, "one"),
        (2, 2.0, "two"),
        (3, 3.0, "three"),
    ]
    df = SparkSession(self.sc).createDataFrame(rows, columns)

    unnamed_observation = Observation()
    named_observation = Observation("metric")

    # Attach the named observation first, then the unnamed one on top of it.
    ordered = df.orderBy("id")
    with_named = ordered.observe(
        named_observation,
        count(lit(1)).alias("cnt"),
        sum(col("id")).alias("sum"),
        mean(col("val")).alias("mean"),
    )
    observed = with_named.observe(unnamed_observation, count(lit(1)).alias("rows"))

    # The observed DataFrame must yield exactly the input rows.
    collected = observed.collect()
    expected_rows = [dict(zip(columns, row)) for row in rows]
    self.assertEqual(expected_rows, [row.asDict() for row in collected])

    # The metrics are read back from the observations once the action has run.
    self.assertEqual(named_observation.get, {"cnt": 3, "sum": 6, "mean": 2.0})
    self.assertEqual(unnamed_observation.get, {"rows": 3})

    # A name, when supplied, has to be a non-empty string.
    with self.assertRaisesRegex(TypeError, "name should be a string"):
        Observation(123)
    with self.assertRaisesRegex(ValueError, "name should not be empty"):
        Observation("")

    # At least one expression is mandatory.
    with self.assertRaisesRegex(AssertionError, "exprs should not be empty"):
        df.observe(Observation())

    # Every expression has to be a Column: None and plain strings are refused.
    invalid_exprs = [(None,), ("id",), (lit(1), None), (lit(1), "id")]
    for args in invalid_exprs:
        with self.subTest(args=args), self.assertRaisesRegex(
            AssertionError, "all exprs should be Column"
        ):
            df.observe(Observation(), *args)
def test_observe(self):
    # SPARK-36263: exercises DataFrame.observe(Observation, *Column) with one
    # named and one unnamed Observation, then checks the argument validation
    # of both Observation() and DataFrame.observe().
    from pyspark.sql import Observation

    session = SparkSession(self.sc)
    df = session.createDataFrame(
        [(1, 1.0, 'one'), (2, 2.0, 'two'), (3, 3.0, 'three')],
        ['id', 'val', 'label'],
    )

    unnamed_observation = Observation()
    named_observation = Observation("metric")

    # Build the observed DataFrame step by step: sort, then stack both observations.
    observed = df.orderBy('id')
    observed = observed.observe(
        named_observation,
        count(lit(1)).alias('cnt'),
        sum(col("id")).alias('sum'),
        mean(col("val")).alias('mean'),
    )
    observed = observed.observe(unnamed_observation, count(lit(1)).alias('rows'))

    # observe() must be transparent: the collected rows equal the input rows.
    expected = [
        {'id': 1, 'val': 1.0, 'label': 'one'},
        {'id': 2, 'val': 2.0, 'label': 'two'},
        {'id': 3, 'val': 3.0, 'label': 'three'},
    ]
    as_dicts = [row.asDict() for row in observed.collect()]
    self.assertEqual(expected, as_dicts)

    # Both observations now report the metrics they were asked to compute.
    self.assertEqual(named_observation.get, dict(cnt=3, sum=6, mean=2.0))
    self.assertEqual(unnamed_observation.get, dict(rows=3))

    # Observation name (if given) must be a non-empty string.
    bad_names = [
        (TypeError, 'name should be a string', 123),
        (ValueError, 'name should not be empty', ''),
    ]
    for error_type, message, bad_name in bad_names:
        with self.assertRaisesRegex(error_type, message):
            Observation(bad_name)

    # DataFrame.observe needs at least one expression ...
    with self.assertRaisesRegex(AssertionError, 'exprs should not be empty'):
        df.observe(Observation())

    # ... and each expression has to be a Column (no None, no plain strings).
    for args in ((None,), ('id',), (lit(1), None), (lit(1), 'id')):
        with self.subTest(args=args):
            with self.assertRaisesRegex(AssertionError, 'all exprs should be Column'):
                df.observe(Observation(), *args)