Example 1
def get_spark_runtime_validator(context, df):
    spark = get_or_create_spark_application(
        spark_config={
            "spark.sql.catalogImplementation": "hive",
            "spark.executor.memory": "450m",
            # "spark.driver.allowMultipleContexts": "true",  # This directive does not appear to have any effect.
        })
    df = spark.createDataFrame(df)
    batch_request = RuntimeBatchRequest(
        datasource_name="my_spark_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="IN_MEMORY_DATA_ASSET",
        runtime_parameters={"batch_data": df},
        batch_identifiers={
            "an_example_key": "a",
            "another_example_key": "b",
        },
    )

    expectation_suite = context.create_expectation_suite(
        "my_suite", overwrite_existing=True)

    validator = context.get_validator(batch_request=batch_request,
                                      expectation_suite=expectation_suite)

    return validator
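The function above returns a Great Expectations Validator bound to an in-memory Spark DataFrame. A minimal usage sketch (the context, pandas_df and column names below are hypothetical placeholders, not part of the quoted example):

# Sketch only: run a couple of standard expectations through the returned
# validator and inspect the aggregate result.
validator = get_spark_runtime_validator(context, pandas_df)  # hypothetical inputs
validator.expect_column_values_to_not_be_null("some_column")  # hypothetical column
validator.expect_column_values_to_be_between("some_column", min_value=0, max_value=100)
results = validator.validate()
print(results.success)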
Example 2

def test_spark_expect_column_value_z_scores_to_be_less_than_impl(
    spark_session, basic_spark_df_execution_engine
):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    spark = get_or_create_spark_application(
        spark_config={
            "spark.sql.catalogImplementation": "hive",
            "spark.executor.memory": "450m",
            # "spark.driver.allowMultipleContexts": "true",  # This directive does not appear to have any effect.
        }
    )
    df = spark.createDataFrame(df)

    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectationConfiguration)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="my_id", batch_data=df)
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(
        success=True,
    )
Example 3
    def __init__(
        self,
        *args,
        persist=True,
        spark_config=None,
        force_reuse_spark_context=False,
        **kwargs,
    ):
        # Creation of the Spark DataFrame is done outside this class
        self._persist = persist

        if spark_config is None:
            spark_config = {}

        spark: SparkSession = get_or_create_spark_application(
            spark_config=spark_config,
            force_reuse_spark_context=force_reuse_spark_context,
        )

        spark_config = dict(spark_config)
        spark_config.update(
            {k: v
             for (k, v) in spark.sparkContext.getConf().getAll()})

        self._spark_config = spark_config
        self.spark = spark

        super().__init__(*args, **kwargs)

        self._config.update({
            "persist": self._persist,
            "spark_config": spark_config,
        })
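This initializer matches the signature of Great Expectations' SparkDFExecutionEngine. A minimal construction sketch, assuming that class and purely illustrative config values:

# Sketch only: build the engine with an explicit spark_config and reuse any
# SparkContext that is already running.
from great_expectations.execution_engine import SparkDFExecutionEngine

engine = SparkDFExecutionEngine(
    persist=True,
    spark_config={"spark.executor.memory": "450m"},
    force_reuse_spark_context=True,
)
print(engine.spark.version)  # the resulting SparkSession is exposed as .spark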
Example 4

    def __init__(self, name="default", datasource=None, database="default"):
        super().__init__(name, datasource=datasource)
        self.database = database
        try:
            self.spark = get_or_create_spark_application()
        except Exception:
            logger.error(
                "Unable to load spark context; install optional spark dependency for support."
            )
            self.spark = None
Example 5
    def __init__(
        self,
        name="default",
        data_context=None,
        data_asset_type=None,
        batch_kwargs_generators=None,
        spark_config=None,
        force_reuse_spark_context=False,
        **kwargs,
    ) -> None:
        """Build a new SparkDFDatasource instance.

        Args:
            name: the name of this datasource
            data_context: the DataContext to which this datasource is connected
            data_asset_type: ClassConfig describing the data_asset type to be constructed by this datasource
            batch_kwargs_generators: generator configuration
            spark_config: dictionary of key-value pairs to be set on the spark session builder
            force_reuse_spark_context: if True, reuse an existing SparkSession/SparkContext instead of creating a new one
            **kwargs: additional keyword arguments passed through to the base Datasource
        """
        configuration_with_defaults = SparkDFDatasource.build_configuration(
            data_asset_type,
            batch_kwargs_generators,
            spark_config,
            force_reuse_spark_context,
            **kwargs,
        )
        data_asset_type = configuration_with_defaults.pop("data_asset_type")
        batch_kwargs_generators = configuration_with_defaults.pop(
            "batch_kwargs_generators", None)
        super().__init__(
            name,
            data_context=data_context,
            data_asset_type=data_asset_type,
            batch_kwargs_generators=batch_kwargs_generators,
            **configuration_with_defaults,
        )

        if spark_config is None:
            spark_config = {}
        spark = get_or_create_spark_application(
            spark_config=spark_config,
            force_reuse_spark_context=force_reuse_spark_context,
        )
        self.spark = spark

        self._build_generators()
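A minimal instantiation sketch for the v2-style SparkDFDatasource shown above (the datasource name and config values are illustrative assumptions):

# Sketch only: build the datasource directly with a custom spark_config.
from great_expectations.datasource import SparkDFDatasource

datasource = SparkDFDatasource(
    name="my_spark_datasource",
    spark_config={"spark.executor.memory": "450m"},
    force_reuse_spark_context=True,
)
print(datasource.spark.sparkContext.getConf().get("spark.executor.memory"))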
Example 6

    def __init__(self, *args, persist=True, spark_config=None, **kwargs):
        # Creation of the Spark DataFrame is done outside this class
        self._persist = persist

        if spark_config is None:
            spark_config = {}

        spark: SparkSession = get_or_create_spark_application(
            spark_config=spark_config)
        self.spark = spark
        self._spark_config = spark_config

        super().__init__(*args, **kwargs)

        self._config.update({
            "persist": self._persist,
            "spark_config": spark_config,
        })
Example 7
    def __init__(
        self,
        *args,
        persist=True,
        spark_config=None,
        force_reuse_spark_context=False,
        **kwargs,
    ) -> None:
        # Creation of the Spark DataFrame is done outside this class
        self._persist = persist

        if spark_config is None:
            spark_config = {}

        spark: SparkSession = get_or_create_spark_application(
            spark_config=spark_config,
            force_reuse_spark_context=force_reuse_spark_context,
        )

        spark_config = dict(spark_config)
        spark_config.update({k: v for (k, v) in spark.sparkContext.getConf().getAll()})

        self._spark_config = spark_config
        self.spark = spark

        azure_options: dict = kwargs.pop("azure_options", {})
        self._azure_options = azure_options

        super().__init__(*args, **kwargs)

        self._config.update(
            {
                "persist": self._persist,
                "spark_config": spark_config,
                "azure_options": azure_options,
            }
        )

        self._data_splitter = SparkDataSplitter()
        self._data_sampler = SparkDataSampler()
Example 8

import os
import datetime

import pandas as pd
from ruamel import yaml

from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import (
    DataContextConfig,
    FilesystemStoreBackendDefaults,
)

from great_expectations.core.util import get_or_create_spark_application

spark = get_or_create_spark_application()

# 1. Install Great Expectations
# %pip install great-expectations
# Imports

# 2. Set up Great Expectations
# In-memory DataContext using DBFS and FilesystemStoreBackendDefaults

# CODE vvvvv vvvvv
# This root directory is for use in Databricks
root_directory = "/dbfs/great_expectations/"

# For testing purposes only, we change the root_directory to an ephemeral location created by our test runner
root_directory = os.path.join(os.getcwd(), "dbfs_temp_directory")
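The listing stops here; the step described by the comments above would typically continue by building the in-memory DataContext from the imported classes (a sketch, not part of the quoted example):

# Sketch only: in-memory DataContext backed by filesystem stores rooted at
# root_directory, as described by the step-2 comment above.
data_context_config = DataContextConfig(
    store_backend_defaults=FilesystemStoreBackendDefaults(
        root_directory=root_directory
    ),
)
context = BaseDataContext(project_config=data_context_config)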