Example #1
    def _ge_context(self) -> Iterator[GEContext]:
        with self.base_engine.connect() as conn:
            data_context = BaseDataContext(project_config=DataContextConfig(
                # The datasource will be added via add_datasource().
                datasources={},
                store_backend_defaults=InMemoryStoreBackendDefaults(),
                anonymous_usage_statistics={
                    "enabled": False,
                    # "data_context_id": <not set>,
                },
            ))

            datasource_name = f"{self._datasource_name_base}-{uuid.uuid4()}"
            datasource_config = DatasourceConfig(
                class_name="SqlAlchemyDatasource",
                credentials={
                    # This isn't actually used since we pass the connection directly,
                    # but GE parses it to change some of its behavior so it's useful
                    # to emulate that here.
                    "url": conn.engine.url,
                },
            )
            with _inject_connection_into_datasource(conn):
                # Using the add_datasource method ensures that the datasource is added to
                # GE-internal cache, which avoids problems when calling GE methods later on.
                assert data_context.add_datasource(
                    datasource_name,
                    initialize=True,
                    **dict(datasourceConfigSchema.dump(datasource_config)),
                )
            assert data_context.get_datasource(datasource_name)

            yield GEContext(data_context, datasource_name)
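
A hedged usage sketch: the Iterator[GEContext] return type suggests the full source wraps this method with contextlib.contextmanager, and GEContext is assumed here to expose the two yielded values as data_context and datasource_name (both names are assumptions, not shown in the snippet):

# Hypothetical consumption of the generator above.
with profiler._ge_context() as ge_context:
    datasource = ge_context.data_context.get_datasource(ge_context.datasource_name)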
Example #2
    def __init__(self, conn, report):
        self.conn = conn
        self.report = report

        data_context_config = DataContextConfig(
            datasources={
                self.datasource_name: DatasourceConfig(
                    class_name="SqlAlchemyDatasource",
                    credentials={
                        # This isn't actually used since we pass the connection directly,
                        # but GE parses it to change some of its behavior so it's useful
                        # to emulate that here.
                        "url": self.conn.engine.url,
                    },
                )
            },
            store_backend_defaults=InMemoryStoreBackendDefaults(),
            anonymous_usage_statistics={
                "enabled": False,
                # "data_context_id": <not set>,
            },
        )

        with _properly_init_datasource(self.conn):
            self.data_context = BaseDataContext(project_config=data_context_config)
Example #3
def test_data_context_concurrency_property():
    data_context = BaseDataContext(
        project_config=DataContextConfig(
            concurrency=ConcurrencyConfig(enabled=True),
            store_backend_defaults=InMemoryStoreBackendDefaults(),
        )
    )
    assert data_context.concurrency.enabled
Example #4
    def validate_with_great_expectations(
        self,
        dataframe: TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
        expectation_suite: TypeVar("ge.core.ExpectationSuite"),  # noqa: F821
        ge_validate_kwargs: Optional[dict],
    ):
        # NOTE: InMemoryStoreBackendDefaults SHOULD NOT BE USED in normal settings. You
        # may experience data loss as it persists nothing. It is used here for testing.
        # Please refer to docs to learn how to instantiate your DataContext.
        store_backend_defaults = InMemoryStoreBackendDefaults()
        data_context_config = DataContextConfig(
            store_backend_defaults=store_backend_defaults,
            checkpoint_store_name=store_backend_defaults.checkpoint_store_name,
        )
        context = BaseDataContext(project_config=data_context_config)

        datasource = {
            "name": "my_spark_dataframe",
            "class_name": "Datasource",
            "execution_engine": {
                "class_name": "SparkDFExecutionEngine",
                "force_reuse_spark_context": True,
            },
            "data_connectors": {
                "default_runtime_data_connector_name": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["batch_id"],
                }
            },
        }
        context.add_datasource(**datasource)

        # Here is a RuntimeBatchRequest using a dataframe
        batch_request = RuntimeBatchRequest(
            datasource_name="my_spark_dataframe",
            data_connector_name="default_runtime_data_connector_name",
            data_asset_name="<YOUR_MEANINGFUL_NAME>",  # This can be anything that identifies this data_asset for you
            batch_identifiers={"batch_id": "default_identifier"},
            runtime_parameters={"batch_data":
                                dataframe},  # Your dataframe goes here
        )
        context.save_expectation_suite(expectation_suite)
        validator = context.get_validator(
            batch_request=batch_request,
            expectation_suite_name=expectation_suite.expectation_suite_name,
        )
        # ge_validate_kwargs is Optional, so guard against None before unpacking.
        report = validator.validate(**(ge_validate_kwargs or {}))

        return report
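
A hypothetical call sketch; `engine`, `df`, and `suite` are illustrative names and not part of the original snippet:

# Assumes `engine` is an instance of the class defining the method above,
# `df` is a pyspark.sql.DataFrame, and `suite` is a ge.core.ExpectationSuite.
report = engine.validate_with_great_expectations(
    dataframe=df,
    expectation_suite=suite,
    ge_validate_kwargs={},
)
assert report.success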
Example #5
def build_in_memory_runtime_context():
    data_context_config: DataContextConfig = DataContextConfig(
        datasources={
            "pandas_datasource": {
                "execution_engine": {
                    "class_name": "PandasExecutionEngine",
                    "module_name": "great_expectations.execution_engine",
                },
                "class_name": "Datasource",
                "module_name": "great_expectations.datasource",
                "data_connectors": {
                    "runtime_data_connector": {
                        "class_name": "RuntimeDataConnector",
                        "batch_identifiers": [
                            "id_key_0",
                            "id_key_1",
                        ],
                    }
                },
            },
            "spark_datasource": {
                "execution_engine": {
                    "class_name": "SparkDFExecutionEngine",
                    "module_name": "great_expectations.execution_engine",
                },
                "class_name": "Datasource",
                "module_name": "great_expectations.datasource",
                "data_connectors": {
                    "runtime_data_connector": {
                        "class_name": "RuntimeDataConnector",
                        "batch_identifiers": [
                            "id_key_0",
                            "id_key_1",
                        ],
                    }
                },
            },
        },
        expectations_store_name="expectations_store",
        validations_store_name="validations_store",
        evaluation_parameter_store_name="evaluation_parameter_store",
        checkpoint_store_name="checkpoint_store",
        store_backend_defaults=InMemoryStoreBackendDefaults(),
    )

    context: BaseDataContext = BaseDataContext(project_config=data_context_config)

    return context
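
A minimal sketch of exercising the returned context against an in-memory pandas DataFrame; the suite and asset names below are illustrative, while the datasource, connector, and batch-identifier names match the config above:

import pandas as pd

from great_expectations.core.batch import RuntimeBatchRequest

context = build_in_memory_runtime_context()
context.create_expectation_suite("demo_suite")  # illustrative suite name
batch_request = RuntimeBatchRequest(
    datasource_name="pandas_datasource",
    data_connector_name="runtime_data_connector",
    data_asset_name="demo_asset",  # anything that identifies this data for you
    runtime_parameters={"batch_data": pd.DataFrame({"a": [1, 2, 3]})},
    batch_identifiers={"id_key_0": "0", "id_key_1": "1"},
)
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="demo_suite",
)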
Example #6
from typing import List

from ruamel import yaml

import great_expectations as ge
from great_expectations.core.batch import Batch, BatchRequest, RuntimeBatchRequest
from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import (
    DataContextConfig,
    InMemoryStoreBackendDefaults,
)

# NOTE: InMemoryStoreBackendDefaults SHOULD NOT BE USED in normal settings. You
# may experience data loss as it persists nothing. It is used here for testing.
# Please refer to docs to learn how to instantiate your DataContext.
store_backend_defaults = InMemoryStoreBackendDefaults()
data_context_config = DataContextConfig(
    store_backend_defaults=store_backend_defaults,
    checkpoint_store_name=store_backend_defaults.checkpoint_store_name,
)
context = BaseDataContext(project_config=data_context_config)

datasource_config = {
    "name": "my_gcs_datasource",
    "class_name": "Datasource",
    "execution_engine": {
        "class_name": "SparkDFExecutionEngine"
    },
    "data_connectors": {
        "default_runtime_data_connector_name": {
            "class_name": "RuntimeDataConnector",
Example #7
def _create_context(
    backend_api: str,
    datasource_name: str,
    data_connector_name: str,
    asset_names: List[str],
    html_dir: Optional[str] = None,
) -> DataContext:

    data_docs_sites = (
        {
            "local_site": {
                "class_name": "SiteBuilder",
                "show_how_to_buttons": False,
                "store_backend": {
                    "class_name": "TupleFilesystemStoreBackend",
                    "base_directory": html_dir,
                },
            }
        }
        if html_dir
        else None
    )
    bigquery_project = os.environ.get("GE_TEST_GCP_PROJECT")
    if not bigquery_project:
        raise ValueError(
            "Environment Variable GE_TEST_GCP_PROJECT is required to run BigQuery performance tests"
        )
    bigquery_dataset = os.environ.get("GE_TEST_BIGQUERY_PERFORMANCE_DATASET")
    if not bigquery_dataset:
        raise ValueError(
            "Environment Variable GE_TEST_BIGQUERY_PERFORMANCE_DATASET is required to run BigQuery performance tests"
        )

    data_context_config = DataContextConfig(
        store_backend_defaults=InMemoryStoreBackendDefaults(),
        data_docs_sites=data_docs_sites,
        anonymous_usage_statistics={"enabled": False},
        concurrency=concurrency_config(),
    )

    context = BaseDataContext(project_config=data_context_config)

    if backend_api == "V3":
        datasource_config = {
            "name": datasource_name,
            "class_name": "Datasource",
            "execution_engine": {
                "class_name": "SqlAlchemyExecutionEngine",
                "connection_string": f"bigquery://{bigquery_project}/{bigquery_dataset}",
            },
            "data_connectors": {
                data_connector_name: {
                    "class_name": "ConfiguredAssetSqlDataConnector",
                    "name": "whole_table",
                    "assets": {asset_name: {} for asset_name in asset_names},
                },
            },
        }
    elif backend_api == "V2":
        datasource_config = {
            "name": datasource_name,
            "credentials": {
                "url": f"bigquery://{bigquery_project}/{bigquery_dataset}",
            },
            "class_name": "SqlAlchemyDatasource",
            "module_name": "great_expectations.datasource",
            "batch_kwargs_generators": {},
            "data_asset_type": {
                "module_name": "great_expectations.dataset",
                "class_name": "SqlAlchemyDataset",
            },
        }
    else:
        raise ValueError(f"Unsupported backend_api {backend_api}")

    context.add_datasource(**datasource_config)
    return context
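
A hypothetical invocation; the argument values are illustrative, and both GE_TEST_GCP_PROJECT and GE_TEST_BIGQUERY_PERFORMANCE_DATASET must be set in the environment:

context = _create_context(
    backend_api="V3",
    datasource_name="bigquery_datasource",
    data_connector_name="default_data_connector",
    asset_names=["example_table"],  # illustrative table name
)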