# Imports assumed from the surrounding modules.
import json
from unittest import mock

import pandas as pd

from great_expectations.dataset import SparkDFDataset


def test_sparkdfdataset_persist(spark_session):
    df = pd.DataFrame({"a": [1, 2, 3]})

    # persist=True should call persist() on the wrapped DataFrame exactly once.
    sdf = spark_session.createDataFrame(df)
    sdf.persist = mock.MagicMock()
    _ = SparkDFDataset(sdf, persist=True)
    sdf.persist.assert_called_once()

    # persist=False should never call persist().
    sdf = spark_session.createDataFrame(df)
    sdf.persist = mock.MagicMock()
    _ = SparkDFDataset(sdf, persist=False)
    sdf.persist.assert_not_called()

    # persist defaults to True when the argument is omitted.
    sdf = spark_session.createDataFrame(df)
    sdf.persist = mock.MagicMock()
    _ = SparkDFDataset(sdf)
    sdf.persist.assert_called_once()
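# A minimal sketch (not part of the original suite) of observing persistence
# without mocks: PySpark DataFrames expose an is_cached flag that flips once
# persist() has been called. demo_persist_is_observable is a hypothetical
# helper added here purely for illustration.
def demo_persist_is_observable(spark_session):
    sdf = spark_session.createDataFrame(pd.DataFrame({"a": [1, 2, 3]}))
    assert not sdf.is_cached  # a fresh DataFrame starts unpersisted
    sdf.persist()             # defaults to a memory-and-disk storage level
    assert sdf.is_cached      # the flag reflects the pending cache
    sdf.unpersist()           # release the cached blocks when done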
def test_dataframe(spark_session):
    # Fixture-style helper: builds a small DataFrame with a nested struct
    # column ("address"), a duplicated column ("name_duplicate"), and a
    # column whose name contains a literal dot ("non.nested").
    from pyspark.sql.types import IntegerType, StringType, StructField, StructType

    schema = StructType(
        [
            StructField("name", StringType(), True),
            StructField("age", IntegerType(), True),
            StructField(
                "address",
                StructType(
                    [
                        StructField("street", StringType(), True),
                        StructField("city", StringType(), True),
                        StructField("house_number", IntegerType(), True),
                    ]
                ),
                False,
            ),
            StructField("name_duplicate", StringType(), True),
            StructField("non.nested", StringType(), True),
        ]
    )
    rows = [
        ("Alice", 1, ("Street 1", "Alabama", 10), "Alice", "a"),
        ("Bob", 2, ("Street 2", "Brooklyn", 11), "Bob", "b"),
        ("Charlie", 3, ("Street 3", "Alabama", 12), "Charlie", "c"),
    ]
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    return SparkDFDataset(df, persist=True)
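# Illustrative only: how the helper's tricky column names are addressed in
# plain PySpark. A dotted path traverses the "address" struct, while the
# literal column name "non.nested" must be escaped with backticks.
# demo_nested_vs_dotted_columns is a hypothetical helper, and it assumes
# test_dataframe is callable directly (i.e., not wrapped as a pytest fixture).
def demo_nested_vs_dotted_columns(spark_session):
    ds = test_dataframe(spark_session)
    ds.spark_df.select("address.city").show()  # struct field access
    ds.spark_df.select("`non.nested`").show()  # literal dotted column name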
def _get_data_asset(self, batch_kwargs, expectation_suite, caching=True, **kwargs):
    """class-private implementation of get_data_asset"""
    if self.spark is None:
        logger.error("No spark session available")
        return None

    batch_kwargs.update(kwargs)
    reader_options = batch_kwargs.copy()
    if "path" in batch_kwargs:
        path = reader_options.pop("path")  # We remove this so it is not used as a reader option
        reader_options.pop("timestamp", "")  # ditto timestamp (but missing ok)
        reader_method = reader_options.pop("reader_method", None)
        if reader_method is None:
            reader_method = self._guess_reader_method_from_path(path)
            if reader_method is None:
                raise BatchKwargsError(
                    "Unable to determine reader for path: %s" % path, batch_kwargs
                )
        else:
            try:
                reader_method = ReaderMethods[reader_method]
            except KeyError:
                raise BatchKwargsError(
                    "Unknown reader method: %s" % reader_method, batch_kwargs
                )

        reader = self.spark.read
        # Every remaining key/value pair is forwarded as a Spark reader option.
        for option in reader_options.items():
            reader = reader.option(*option)

        if reader_method == ReaderMethods.CSV:
            df = reader.csv(path)
        elif reader_method == ReaderMethods.parquet:
            df = reader.parquet(path)
        else:
            raise BatchKwargsError(
                "Unsupported reader: %s" % reader_method.name, batch_kwargs
            )
    elif "query" in batch_kwargs:
        df = self.spark.sql(batch_kwargs["query"])
    elif "df" in batch_kwargs and isinstance(batch_kwargs["df"], (DataFrame, SparkDFDataset)):
        df = batch_kwargs.pop("df")  # We don't want to store the actual DataFrame in kwargs
        if isinstance(df, SparkDFDataset):
            # Grab just the spark_df reference, since we want to override everything else
            df = df.spark_df
        batch_kwargs["SparkDFRef"] = True
    else:
        raise BatchKwargsError("Unrecognized batch_kwargs for spark_source", batch_kwargs)

    return SparkDFDataset(
        df,
        expectation_suite=expectation_suite,
        data_context=self._data_context,
        batch_kwargs=batch_kwargs,
        caching=caching,
    )
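# Hedged sketch of the three batch_kwargs shapes the branches above accept.
# The path, option values, and query text are hypothetical examples; any key
# other than "path", "timestamp", and "reader_method" in the path branch is
# forwarded verbatim to spark.read.option().
def demo_batch_kwargs_shapes(spark_df):
    # spark_df: a pyspark DataFrame (or SparkDFDataset) supplied by the caller.
    path_batch_kwargs = {"path": "/tmp/example.csv", "reader_method": "CSV", "header": "true"}
    query_batch_kwargs = {"query": "SELECT * FROM some_registered_view"}
    df_batch_kwargs = {"df": spark_df}
    return path_batch_kwargs, query_batch_kwargs, df_batch_kwargs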
def test_expect_column_values_to_be_json_parseable(spark_session):
    d1 = json.dumps({"i": [1, 2, 3], "j": 35, "k": {"x": "five", "y": 5, "z": "101"}})
    d2 = json.dumps({"i": 1, "j": 2, "k": [3, 4, 5]})
    d3 = json.dumps({"i": "a", "j": "b", "k": "c"})
    d4 = json.dumps(
        {"i": [4, 5], "j": [6, 7], "k": [8, 9], "l": {4: "x", 5: "y", 6: "z"}}
    )
    inner = {
        "json_col": [d1, d2, d3, d4],
        "not_json": [4, 5, 6, 7],
        "py_dict": [
            {"a": 1, "out": 1},
            {"b": 2, "out": 4},
            {"c": 3, "out": 9},
            {"d": 4, "out": 16},
        ],
        "most": [d1, d2, d3, "d4"],
    }
    # Transpose the column-major dict values into row tuples for createDataFrame.
    data_reshaped = list(zip(*[v for _, v in inner.items()]))
    df = spark_session.createDataFrame(
        data_reshaped, ["json_col", "not_json", "py_dict", "most"]
    )
    D = SparkDFDataset(df)
    D.set_default_expectation_argument("result_format", "COMPLETE")

    T = [
        {
            "in": {"column": "json_col"},
            "out": {
                "success": True,
                "unexpected_list": [],
            },
        },
        {
            "in": {"column": "not_json"},
            "out": {
                "success": False,
                "unexpected_list": [4, 5, 6, 7],
            },
        },
        {
            "in": {"column": "py_dict"},
            "out": {
                "success": False,
                "unexpected_list": [
                    {"a": 1, "out": 1},
                    {"b": 2, "out": 4},
                    {"c": 3, "out": 9},
                    {"d": 4, "out": 16},
                ],
            },
        },
        {
            "in": {"column": "most"},
            "out": {
                "success": False,
                "unexpected_list": ["d4"],
            },
        },
        {
            # mostly=0.75 passes because 3 of the 4 values parse as JSON.
            "in": {"column": "most", "mostly": 0.75},
            "out": {
                "success": True,
                "unexpected_index_list": [3],
                "unexpected_list": ["d4"],
            },
        },
    ]

    for t in T:
        out = D.expect_column_values_to_be_json_parseable(**t["in"])
        assert t["out"]["success"] == out.success
        assert t["out"]["unexpected_list"] == out.result["unexpected_list"]
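# A rough sketch (not SparkDFDataset's actual implementation) of the per-value
# predicate the expectation above implies: a value counts as JSON-parseable
# iff json.loads() accepts it. Non-strings raise TypeError and malformed
# strings raise ValueError, so both land in the unexpected list.
def _is_json_parseable(value):
    try:
        json.loads(value)
        return True
    except (TypeError, ValueError):
        return False


assert _is_json_parseable('{"i": 1}')    # valid JSON string
assert not _is_json_parseable("d4")      # bare identifier is not JSON
assert not _is_json_parseable({"a": 1})  # a Python dict is not a JSON string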