def get_spark_runtime_validator(context, df):
    spark = get_or_create_spark_application(
        spark_config={
            "spark.sql.catalogImplementation": "hive",
            "spark.executor.memory": "450m",
            # "spark.driver.allowMultipleContexts": "true",  # This directive does not appear to have any effect.
        }
    )
    df = spark.createDataFrame(df)
    batch_request = RuntimeBatchRequest(
        datasource_name="my_spark_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="IN_MEMORY_DATA_ASSET",
        runtime_parameters={"batch_data": df},
        batch_identifiers={
            "an_example_key": "a",
            "another_example_key": "b",
        },
    )
    expectation_suite = context.create_expectation_suite(
        "my_suite", overwrite_existing=True
    )
    validator = context.get_validator(
        batch_request=batch_request, expectation_suite=expectation_suite
    )
    return validator
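The RuntimeBatchRequest in the helper above presupposes that a datasource named my_spark_datasource, with a RuntimeDataConnector called my_data_connector, has already been added to the context. A minimal sketch of that configuration, following Great Expectations' standard runtime-data-connector pattern (the exact config is an assumption; it is not shown in the snippet above):

# Hedged sketch: the datasource configuration the helper above appears to assume.
# Names mirror the RuntimeBatchRequest; the layout follows the standard
# RuntimeDataConnector pattern and is not taken from the original code.
context.add_datasource(
    name="my_spark_datasource",
    class_name="Datasource",
    execution_engine={"class_name": "SparkDFExecutionEngine"},
    data_connectors={
        "my_data_connector": {
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": ["an_example_key", "another_example_key"],
        }
    },
)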
def test_spark_expect_column_value_z_scores_to_be_less_than_impl(
    spark_session, basic_spark_df_execution_engine
):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    spark = get_or_create_spark_application(
        spark_config={
            "spark.sql.catalogImplementation": "hive",
            "spark.executor.memory": "450m",
            # "spark.driver.allowMultipleContexts": "true",  # This directive does not appear to have any effect.
        }
    )
    df = spark.createDataFrame(df)
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectationConfiguration)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="my_id", batch_data=df)
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(
        success=True,
    )
def __init__(
    self,
    *args,
    persist=True,
    spark_config=None,
    force_reuse_spark_context=False,
    **kwargs,
):
    # Creation of the Spark DataFrame is done outside this class
    self._persist = persist
    if spark_config is None:
        spark_config = {}
    spark: SparkSession = get_or_create_spark_application(
        spark_config=spark_config,
        force_reuse_spark_context=force_reuse_spark_context,
    )
    spark_config = dict(spark_config)
    spark_config.update({k: v for (k, v) in spark.sparkContext.getConf().getAll()})
    self._spark_config = spark_config
    self.spark = spark
    super().__init__(*args, **kwargs)
    self._config.update(
        {
            "persist": self._persist,
            "spark_config": spark_config,
        }
    )
def __init__(self, name="default", datasource=None, database="default"):
    super().__init__(name, datasource=datasource)
    self.database = database
    try:
        self.spark = get_or_create_spark_application()
    except Exception:
        logger.error(
            "Unable to load spark context; install optional spark dependency for support."
        )
        self.spark = None
def __init__(
    self,
    name="default",
    data_context=None,
    data_asset_type=None,
    batch_kwargs_generators=None,
    spark_config=None,
    force_reuse_spark_context=False,
    **kwargs,
) -> None:
    """Build a new SparkDFDatasource instance.

    Args:
        name: the name of this datasource
        data_context: the DataContext to which this datasource is connected
        data_asset_type: ClassConfig describing the data_asset type to be constructed by this datasource
        batch_kwargs_generators: generator configuration
        spark_config: dictionary of key-value pairs to be set on the spark session builder
        force_reuse_spark_context: whether to reuse an already-running Spark context/session instead of creating a new one
        **kwargs: Additional keyword arguments passed through to the base Datasource
    """
    configuration_with_defaults = SparkDFDatasource.build_configuration(
        data_asset_type,
        batch_kwargs_generators,
        spark_config,
        force_reuse_spark_context,
        **kwargs,
    )
    data_asset_type = configuration_with_defaults.pop("data_asset_type")
    batch_kwargs_generators = configuration_with_defaults.pop(
        "batch_kwargs_generators", None
    )
    super().__init__(
        name,
        data_context=data_context,
        data_asset_type=data_asset_type,
        batch_kwargs_generators=batch_kwargs_generators,
        **configuration_with_defaults,
    )
    if spark_config is None:
        spark_config = {}
    spark = get_or_create_spark_application(
        spark_config=spark_config,
        force_reuse_spark_context=force_reuse_spark_context,
    )
    self.spark = spark
    self._build_generators()
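Given this constructor, instantiating the (v2-API) SparkDFDatasource directly might look like the sketch below; the datasource name and the Spark settings (borrowed from the earlier snippets) are illustrative assumptions, not values from this snippet.

# Hypothetical instantiation of the SparkDFDatasource defined above; the name
# and Spark settings are illustrative, reused from earlier snippets on this page.
datasource = SparkDFDatasource(
    name="my_spark_datasource",
    spark_config={
        "spark.sql.catalogImplementation": "hive",
        "spark.executor.memory": "450m",
    },
    force_reuse_spark_context=True,
)
print(datasource.spark.version)  # the SparkSession built by get_or_create_spark_application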
def __init__(self, *args, persist=True, spark_config=None, **kwargs):
    # Creation of the Spark DataFrame is done outside this class
    self._persist = persist
    if spark_config is None:
        spark_config = {}
    spark: SparkSession = get_or_create_spark_application(spark_config=spark_config)
    self.spark = spark
    self._spark_config = spark_config
    super().__init__(*args, **kwargs)
    self._config.update(
        {
            "persist": self._persist,
            "spark_config": spark_config,
        }
    )
def __init__(
    self,
    *args,
    persist=True,
    spark_config=None,
    force_reuse_spark_context=False,
    **kwargs,
) -> None:
    # Creation of the Spark DataFrame is done outside this class
    self._persist = persist
    if spark_config is None:
        spark_config = {}
    spark: SparkSession = get_or_create_spark_application(
        spark_config=spark_config,
        force_reuse_spark_context=force_reuse_spark_context,
    )
    spark_config = dict(spark_config)
    spark_config.update({k: v for (k, v) in spark.sparkContext.getConf().getAll()})
    self._spark_config = spark_config
    self.spark = spark
    azure_options: dict = kwargs.pop("azure_options", {})
    self._azure_options = azure_options
    super().__init__(*args, **kwargs)
    self._config.update(
        {
            "persist": self._persist,
            "spark_config": spark_config,
            "azure_options": azure_options,
        }
    )
    self._data_splitter = SparkDataSplitter()
    self._data_sampler = SparkDataSampler()
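The engines above merge the caller's spark_config with whatever the resulting SparkSession actually reports via sparkContext.getConf().getAll(). That round trip can be exercised on its own; a minimal sketch, assuming only the get_or_create_spark_application helper already used throughout this page (the example config value is illustrative):

# Minimal sketch: create (or reuse) a SparkSession through the same helper and
# read the effective configuration back, as the __init__ methods above do.
from great_expectations.core.util import get_or_create_spark_application

spark = get_or_create_spark_application(
    spark_config={"spark.executor.memory": "450m"},
    force_reuse_spark_context=True,
)
effective_config = dict(spark.sparkContext.getConf().getAll())
print(effective_config.get("spark.executor.memory"))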
import os
import datetime

import pandas as pd
from ruamel import yaml

from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import (
    DataContextConfig,
    FilesystemStoreBackendDefaults,
)
from great_expectations.core.util import get_or_create_spark_application

spark = get_or_create_spark_application()

# 1. Install Great Expectations
# %pip install great-expectations
# Imports

# 2. Set up Great Expectations
# In-memory DataContext using DBFS and FilesystemStoreBackendDefaults
# CODE vvvvv vvvvv

# This root directory is for use in Databricks
root_directory = "/dbfs/great_expectations/"

# For testing purposes only, we change the root_directory to an ephemeral
# location created by our test runner
root_directory = os.path.join(os.getcwd(), "dbfs_temp_directory")
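The imports above bring in DataContextConfig and FilesystemStoreBackendDefaults, but the snippet stops before using them. The next step in Great Expectations' documented Databricks/DBFS pattern is to build an in-memory DataContext rooted at root_directory, roughly:

# Sketch of the step implied by the imports above: an in-memory DataContext
# whose stores live under root_directory (per the documented DBFS pattern).
data_context_config = DataContextConfig(
    store_backend_defaults=FilesystemStoreBackendDefaults(
        root_directory=root_directory
    ),
)
context = BaseDataContext(project_config=data_context_config)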