def test_golden_path_sql_datasource_configuration(
        sa, empty_data_context, test_connectable_postgresql_db):
    """Exercise the SQL golden path driven by ``test_yaml_config``.

    A ``SimpleSqlalchemyDatasource`` is described in YAML and passed to
    ``context.test_yaml_config`` under the name ``my_datasource``.  The test
    then requests a batch for the ``test_df`` asset, checks that an unknown
    asset name raises ``KeyError``, and finally obtains a validator and runs
    one expectation whose result is printed (not asserted).
    """
    # Work from inside the fixture project so that ge.get_context() below
    # is called with that directory as the current working directory.
    os.chdir(empty_data_context.root_directory)

    # Everything below this line (except for asserts) is what we expect users to run as part of the golden path.
    import great_expectations as ge

    context = ge.get_context()

    datasource_name = "my_datasource"
    connector_name = "whole_table_with_limits"

    postgres_host = os.getenv("GE_TEST_LOCAL_DB_HOSTNAME", "localhost")
    datasource_yaml = f"""
class_name: SimpleSqlalchemyDatasource
credentials:
    drivername: postgresql
    username: postgres
    password: ""
    host: {postgres_host}
    port: 5432
    database: test_ci

introspection:
    whole_table_with_limits:
        sampling_method: _sample_using_limit
        sampling_kwargs:
            n: 10
"""
    # noinspection PyUnusedLocal
    yaml_report = context.test_yaml_config(
        name=datasource_name,
        yaml_config=datasource_yaml,
        return_mode="report_object",
    )
    print(json.dumps(yaml_report, indent=2))
    print(context.datasources)

    # Arguments are deliberately positional, exactly as in the user-facing
    # golden path.  The batch itself is not inspected any further here.
    limited_batch = context.get_batch(
        datasource_name,
        connector_name,
        "test_df",
    )
    # assert len(limited_batch.data.fetchall()) == 10

    # Asking for an asset that was not introspected must fail with KeyError.
    with pytest.raises(KeyError):
        limited_batch = context.get_batch(
            datasource_name,
            connector_name,
            "DOES_NOT_EXIST",
        )

    suite = ExpectationSuite("my_expectation_suite")
    validator = context.get_validator(
        datasource_name=datasource_name,
        data_connector_name=connector_name,
        data_asset_name="test_df",
        expectation_suite=suite,
    )
    validation_result = validator.expect_table_columns_to_match_set(
        column_set=[])
    print(validation_result)
コード例 #2
0
ファイル: run.py プロジェクト: rpatil524/great_expectations
def test_ge():
    """Configure a SQL datasource from ``DB_URL`` and fetch a validator.

    The connection string is read from the ``DB_URL`` environment variable.
    The datasource configuration is checked with ``test_yaml_config``, added
    to the context, and then queried through a ``RuntimeBatchRequest`` to
    build a validator against the ``test_suite`` expectation suite.
    """
    connection_string = os.environ.get("DB_URL")

    context = ge.get_context()

    engine_settings = {
        "class_name": "SqlAlchemyExecutionEngine",
        "connection_string": connection_string,
        "create_temp_table": False,
    }
    connector_settings = {
        "default_runtime_data_connector_name": {
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": ["default_identifier_name"],
        },
        "default_inferred_data_connector_name": {
            "class_name": "InferredAssetSqlDataConnector",
            "include_schema_name": True,
        },
    }
    datasource_config = {
        "name": "my_datasource",
        "class_name": "Datasource",
        "execution_engine": engine_settings,
        "data_connectors": connector_settings,
    }

    # Please note this override is only to provide good UX for docs and tests.
    # In normal usage you'd set your path directly in the config above.
    # NOTE(review): the value written here is the one already stored above.
    datasource_config["execution_engine"][
        "connection_string"] = connection_string

    serialized_config = yaml.dump(datasource_config)
    context.test_yaml_config(serialized_config)

    context.add_datasource(**datasource_config)

    # First test for RuntimeBatchRequest using a query
    runtime_request = RuntimeBatchRequest(
        datasource_name="my_datasource",
        data_connector_name="default_runtime_data_connector_name",
        # this can be anything that identifies this data
        data_asset_name="default_name",
        runtime_parameters={"query": "SELECT TOP 10 * from dbo.taxi_data"},
        batch_identifiers={"default_identifier_name": "default_identifier"},
        batch_spec_passthrough={"create_temp_table": False},
    )

    suite_name = "test_suite"
    context.create_expectation_suite(
        expectation_suite_name=suite_name,
        overwrite_existing=True,
    )
    validator = context.get_validator(
        batch_request=runtime_request,
        expectation_suite_name=suite_name,
    )
    print(validator.head())

    # NOTE: The following code is only for testing and can be ignored by users.

    assert isinstance(validator, ge.validator.validator.Validator)
コード例 #3
0
    def build_context(self):
        """
            Purpose:
                Build a data context from an in-code configuration and
                register a runtime pandas datasource on it.
            Returns:
                Nothing; the resulting context is stored on ``self.context``
                and ``example_datasource`` is added to that context.
        """

        def runtime_connectors():
            # Return a fresh dict on every call so the two configurations
            # below never share (and cannot mutate) the same objects.
            return {
                "default_runtime_data_connector_name": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_identifier_name"],
                },
            }

        # NOTE(review): this context is replaced further down before it is
        # read; the call is kept so the method behaves exactly as before.
        self.context = ge.get_context()

        # Datasource that is added to the context once it has been built.
        pandas_engine = {
            "module_name": "great_expectations.execution_engine",
            "class_name": "PandasExecutionEngine",
        }
        datasource_config = {
            "name": "example_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": pandas_engine,
            "data_connectors": runtime_connectors(),
        }

        # Project configuration: one "pandas" datasource plus filesystem
        # stores rooted at ./great_expectations under the working directory.
        builtin_datasource = DatasourceConfig(
            class_name="Datasource",
            execution_engine={"class_name": "PandasExecutionEngine"},
            data_connectors=runtime_connectors(),
        )
        store_root = os.path.join(os.getcwd(), 'great_expectations')
        data_context_config = DataContextConfig(
            datasources={"pandas": builtin_datasource},
            store_backend_defaults=FilesystemStoreBackendDefaults(
                root_directory=store_root),
        )

        # Build the context from the configuration and add the datasource.
        self.context = BaseDataContext(project_config=data_context_config)
        # self.context.test_yaml_config(yaml.dump(datasource_config))
        self.context.add_datasource(**datasource_config)
import os
from typing import List

from ruamel import yaml

import great_expectations as ge
from great_expectations.core.batch import Batch, BatchRequest

CREDENTIAL = os.getenv("AZURE_ACCESS_KEY", "")

context = ge.get_context()

datasource_yaml = rf"""
name: my_azure_datasource
class_name: Datasource
execution_engine:
    class_name: SparkDFExecutionEngine
    azure_options:
        account_url: <YOUR_ACCOUNT_URL> # or `conn_str`
        credential: <YOUR_CREDENTIAL>   # if using a protected container
data_connectors:
    default_runtime_data_connector_name:
        class_name: RuntimeDataConnector
        batch_identifiers:
            - default_identifier_name
    default_inferred_data_connector_name:
        class_name: InferredAssetAzureDataConnector
        azure_options:
            account_url: <YOUR_ACCOUNT_URL> # or `conn_str`
            credential: <YOUR_CREDENTIAL>   # if using a protected container
        container: <YOUR_AZURE_CONTAINER_HERE>
def test_golden_path_configured_asset_pandas_datasource_configuration(
        empty_data_context, test_df, tmp_path_factory):
    """
    Golden-path test for ConfiguredAssetFilesystemDataConnector with
    PandasExecutionEngine, configured through test_yaml_config.

    Files written from ``test_df`` are laid out under a temporary directory,
    a datasource with four explicitly configured assets (A-D) is registered
    via ``test_yaml_config``, and the test then checks a hash-sampled batch
    of asset ``A``, the ``ValueError`` raised for an unknown asset, and one
    expectation run through a validator on asset ``C``.
    """
    tmp_root = str(
        tmp_path_factory.mktemp(
            "test_golden_path_pandas_datasource_configuration"))

    # (asset directory, file-name suffixes, file extension)
    asset_layout = (
        ("A", ("1", "2", "3"), "csv"),
        ("B", ("1", "2", "3"), "txt"),
        ("C", ("2017", "2018", "2019"), "csv"),
        ("D", ("aaa", "bbb", "ccc", "ddd", "eee"), "csv"),
    )
    relative_paths = [
        f"test_dir_foxtrot/{asset}/{asset}-{suffix}.{extension}"
        for asset, suffixes, extension in asset_layout
        for suffix in suffixes
    ]

    def csv_payload():
        # Every generated file receives the same CSV rendering of test_df.
        return test_df.to_csv(header=True, index=False)

    create_files_in_directory(
        directory=tmp_root,
        file_name_list=relative_paths,
        file_content_fn=csv_payload,
    )

    os.chdir(empty_data_context.root_directory)
    import great_expectations as ge

    context = ge.get_context()

    datasource_yaml = f"""
class_name: Datasource

execution_engine:
    class_name: PandasExecutionEngine

data_connectors:
    my_filesystem_data_connector:
        class_name: ConfiguredAssetFilesystemDataConnector
        base_directory: {tmp_root}
        # glob_directive: "*"

        default_regex:
            pattern: (.+)\\.csv
            group_names:
                - alphanumeric

        assets:
            A:
                base_directory: {tmp_root}/test_dir_foxtrot/A
                pattern: (.+)-(\\d+)\\.csv
                group_names:
                    - letter
                    - number
            B:
                base_directory: {tmp_root}/test_dir_foxtrot/B
                pattern: (.+)-(\\d+)\\.csv
                group_names:
                    - letter
                    - number
            C:
                base_directory: {tmp_root}/test_dir_foxtrot/C
                pattern: (.+)-(\\d+)\\.csv
                group_names:
                    - letter
                    - year
            D:
                base_directory: {tmp_root}/test_dir_foxtrot/D
                pattern: (.+)-(\\d+)\\.csv
                group_names:
                    - letter
                    - checksum
"""

    datasource_name = "my_directory_datasource"
    connector_name = "my_filesystem_data_connector"

    def hash_sampling():
        # Fresh dict per call so the two requests below share no state.
        return {
            "sampling_method": "_sample_using_hash",
            "sampling_kwargs": {
                "column_name": "date",
                "hash_function_name": "md5",
                "hash_value": "f",
            },
        }

    # noinspection PyUnusedLocal
    yaml_report = context.test_yaml_config(
        name=datasource_name,
        yaml_config=datasource_yaml,
        return_mode="report_object",
    )
    # print(json.dumps(yaml_report, indent=2))
    # print(context.datasources)

    sampled_batch = context.get_batch(
        datasource_name=datasource_name,
        data_connector_name=connector_name,
        data_asset_name="A",
        batch_identifiers={"number": "2"},
        batch_spec_passthrough=hash_sampling(),
    )
    assert sampled_batch.batch_definition["data_asset_name"] == "A"

    sampled_batch.head()

    loaded = sampled_batch.data.dataframe
    assert loaded.shape == (10, 10)

    def parse_date(row):
        # The CSV round trip yields text; convert back to datetime.date.
        return datetime.datetime.strptime(row["date"], "%Y-%m-%d").date()

    loaded["date"] = loaded.apply(parse_date, axis=1)

    # The sampled rows are expected to equal the test_df rows for these two
    # dates (the "timestamp" column is excluded from the comparison).
    date_mask = ((test_df["date"] == datetime.date(2020, 1, 15))
                 | (test_df["date"] == datetime.date(2020, 1, 29)))
    expected_rows = test_df[date_mask].drop("timestamp", axis=1)
    actual_rows = loaded.drop("timestamp", axis=1)
    assert expected_rows.equals(actual_rows)

    # An asset that is not configured must be rejected with ValueError.
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        sampled_batch = context.get_batch(
            datasource_name=datasource_name,
            data_connector_name=connector_name,
            data_asset_name="DOES_NOT_EXIST",
        )

    year_filter = {"batch_filter_parameters": {"year": "2019"}}
    validator = context.get_validator(
        datasource_name=datasource_name,
        data_connector_name=connector_name,
        data_asset_name="C",
        data_connector_query=year_filter,
        create_expectation_suite_with_name="my_expectations",
        batch_spec_passthrough=hash_sampling(),
    )
    validation_result = validator.expect_column_values_to_be_between(
        column="d",
        min_value=1,
        max_value=31,
    )
    assert validation_result.success
コード例 #6
0
    taxi_test_data: TaxiTestData = TaxiTestData(
        test_df, test_column_name="pickup_datetime")
    taxi_splitting_test_cases: TaxiSplittingTestCases = TaxiSplittingTestCases(
        taxi_test_data)

    test_cases: List[
        TaxiSplittingTestCase] = taxi_splitting_test_cases.test_cases()

    for test_case in test_cases:

        print("Testing splitter method:", test_case.splitter_method_name)

        # 1. Setup

        context: DataContext = ge.get_context()

        datasource_name: str = "test_datasource"
        data_connector_name: str = "test_data_connector"
        data_asset_name: str = table_name  # Read from generated table name
        column_name: str = taxi_splitting_test_cases.test_column_name

        # 2. Set splitter in DataConnector config
        data_connector_config: dict = {
            "class_name": "ConfiguredAssetSqlDataConnector",
            "assets": {
                data_asset_name: {
                    "splitter_method": test_case.splitter_method_name,
                    "splitter_kwargs": test_case.splitter_kwargs,
                }
            },
コード例 #7
0
tests and will be updated.  These statements can be ignored by users.

Comments with the tags `<snippet>` and `</snippet>` are used to ensure that if this script is updated
the snippets that are specified for use in documentation are maintained.  These comments can be ignored by users.

--documentation--
    https://docs.greatexpectations.io/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant
"""
import great_expectations as ge
from great_expectations.checkpoint import SimpleCheckpoint
from great_expectations.core.batch import BatchRequest
from great_expectations.core.yaml_handler import YAMLHandler

yaml = YAMLHandler()

context: ge.DataContext = ge.get_context()

# Configure your datasource (if you aren't using one that already exists)

# <snippet>
datasource_config = {
    "name": "taxi_multi_batch_datasource",
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": {
        "module_name": "great_expectations.execution_engine",
        "class_name": "PandasExecutionEngine",
    },
    "data_connectors": {
        "inferred_data_connector_all_years": {
            "class_name": "InferredAssetFilesystemDataConnector",
コード例 #8
0
def test_golden_path_inferred_asset_pandas_datasource_configuration(
        mock_emit, empty_data_context_stats_enabled, test_df,
        tmp_path_factory):
    """
    Golden-path test for InferredAssetFilesystemDataConnector with
    PandasExecutionEngine, configured through test_yaml_config.

    Besides the batch and validator checks, the test asserts that exactly
    one call (the ``data_context.test_yaml_config`` event) is recorded on
    ``mock_emit`` after the mock has been reset, and that no further call
    has been recorded by the end of the test.
    """
    tmp_root = str(
        tmp_path_factory.mktemp(
            "test_golden_path_pandas_datasource_configuration"))

    # Three numbered CSV files in each of the sub-directories A, B, C and D.
    relative_paths = [
        f"test_dir_charlie/{asset}/{asset}-{index}.csv"
        for asset in ("A", "B", "C", "D")
        for index in (1, 2, 3)
    ]

    def csv_payload():
        # Every generated file receives the same CSV rendering of test_df.
        return test_df.to_csv(header=True, index=False)

    create_files_in_directory(
        directory=tmp_root,
        file_name_list=relative_paths,
        file_content_fn=csv_payload,
    )

    os.chdir(empty_data_context_stats_enabled.root_directory)
    import great_expectations as ge

    context = ge.get_context()
    mock_emit.reset_mock()  # Remove data_context.__init__ call

    datasource_yaml = f"""
class_name: Datasource

execution_engine:
    class_name: PandasExecutionEngine

data_connectors:
    my_filesystem_data_connector:
        class_name: InferredAssetFilesystemDataConnector
        base_directory: {tmp_root}/test_dir_charlie
        glob_directive: "*/*.csv"

        default_regex:
            pattern: (.+)/(.+)-(\\d+)\\.csv
            group_names:
                - subdirectory
                - data_asset_name
                - number
"""

    datasource_name = "my_directory_datasource"
    connector_name = "my_filesystem_data_connector"

    def hash_sampling():
        # Fresh dict per call so the two requests below share no state.
        return {
            "sampling_method": "_sample_using_hash",
            "sampling_kwargs": {
                "column_name": "date",
                "hash_function_name": "md5",
                "hash_value": "f",
            },
        }

    # noinspection PyUnusedLocal
    yaml_report = context.test_yaml_config(
        name=datasource_name,
        yaml_config=datasource_yaml,
        return_mode="report_object",
    )
    # print(json.dumps(yaml_report, indent=2))
    # print(context.datasources)
    assert mock_emit.call_count == 1

    # Substitute anonymized names since they change for each run: read them
    # back from the recorded payload and reuse them in the expected event.
    recorded_payload = mock_emit.call_args_list[0][0][0]["event_payload"]
    datasource_alias = recorded_payload["anonymized_name"]
    engine_alias = recorded_payload["anonymized_execution_engine"][
        "anonymized_name"]
    connector_alias = recorded_payload["anonymized_data_connectors"][0][
        "anonymized_name"]

    expected_event = {
        "event": "data_context.test_yaml_config",
        "event_payload": {
            "anonymized_name": datasource_alias,
            "parent_class": "Datasource",
            "anonymized_execution_engine": {
                "anonymized_name": engine_alias,
                "parent_class": "PandasExecutionEngine",
            },
            "anonymized_data_connectors": [{
                "anonymized_name": connector_alias,
                "parent_class": "InferredAssetFilesystemDataConnector",
            }],
        },
        "success": True,
    }
    assert mock_emit.call_args_list == [mock.call(expected_event)]

    sampled_batch = context.get_batch(
        datasource_name=datasource_name,
        data_connector_name=connector_name,
        data_asset_name="A",
        batch_identifiers={"number": "2"},
        batch_spec_passthrough=hash_sampling(),
    )
    assert sampled_batch.batch_definition["data_asset_name"] == "A"

    loaded = sampled_batch.data.dataframe
    assert loaded.shape == (10, 10)

    def parse_date(row):
        # The CSV round trip yields text; convert back to datetime.date.
        return datetime.datetime.strptime(row["date"], "%Y-%m-%d").date()

    loaded["date"] = loaded.apply(parse_date, axis=1)

    # The sampled rows are expected to equal the test_df rows for these two
    # dates (the "timestamp" column is excluded from the comparison).
    date_mask = ((test_df["date"] == datetime.date(2020, 1, 15))
                 | (test_df["date"] == datetime.date(2020, 1, 29)))
    expected_rows = test_df[date_mask].drop("timestamp", axis=1)
    actual_rows = loaded.drop("timestamp", axis=1)
    assert expected_rows.equals(actual_rows)

    # An asset name that is unknown to the connector must raise ValueError.
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        sampled_batch = context.get_batch(
            datasource_name=datasource_name,
            data_connector_name=connector_name,
            data_asset_name="DOES_NOT_EXIST",
        )

    number_filter = {"batch_filter_parameters": {"number": "3"}}
    validator = context.get_validator(
        datasource_name=datasource_name,
        data_connector_name=connector_name,
        data_asset_name="D",
        data_connector_query=number_filter,
        expectation_suite=ExpectationSuite("my_expectation_suite"),
        batch_spec_passthrough=hash_sampling(),
    )
    validation_result = validator.expect_column_values_to_be_between(
        column="d",
        min_value=1,
        max_value=31,
    )
    assert validation_result.success

    # TODO: <Alex>ALEX</Alex>
    # validation_result = validator.expect_table_columns_to_match_ordered_list(ordered_list=["x", "y", "z"])
    # assert validation_result.success

    # No other usage stats calls detected
    assert mock_emit.call_count == 1
コード例 #9
0
def test_golden_path_sql_datasource_configuration(
        mock_emit, empty_data_context_stats_enabled, sa,
        test_connectable_postgresql_db):
    """Exercise the SQL golden path and the usage-statistics events it emits.

    A ``SimpleSqlalchemyDatasource`` is described in YAML and passed to
    ``context.test_yaml_config``.  The test asserts that ``mock_emit`` has
    recorded exactly two calls at that point (``data_context.__init__`` and
    ``data_context.test_yaml_config``), then requests a batch, checks that an
    unknown asset raises ``KeyError``, and runs one expectation whose result
    is printed (not asserted).
    """
    # Work from inside the fixture project so that ge.get_context() below
    # is called with that directory as the current working directory.
    os.chdir(empty_data_context_stats_enabled.root_directory)

    # Everything below this line (except for asserts) is what we expect users to run as part of the golden path.
    import great_expectations as ge

    context = ge.get_context()

    datasource_name = "my_datasource"
    connector_name = "whole_table_with_limits"

    postgres_host = os.getenv("GE_TEST_LOCAL_DB_HOSTNAME", "localhost")
    datasource_yaml = f"""
class_name: SimpleSqlalchemyDatasource
credentials:
    drivername: postgresql
    username: postgres
    password: ""
    host: {postgres_host}
    port: 5432
    database: test_ci

introspection:
    whole_table_with_limits:
        sampling_method: _sample_using_limit
        sampling_kwargs:
            n: 10
"""
    # noinspection PyUnusedLocal
    yaml_report = context.test_yaml_config(
        name=datasource_name,
        yaml_config=datasource_yaml,
        return_mode="report_object",
    )
    assert mock_emit.call_count == 2

    # Substitute anonymized names since they change for each run: read them
    # back from the second recorded payload and reuse them below.
    recorded_payload = mock_emit.call_args_list[1][0][0]["event_payload"]
    datasource_alias = recorded_payload["anonymized_name"]
    connector_alias = recorded_payload["anonymized_data_connectors"][0][
        "anonymized_name"]

    init_event = {
        "event_payload": {},
        "event": "data_context.__init__",
        "success": True,
    }
    yaml_config_event = {
        "event": "data_context.test_yaml_config",
        "event_payload": {
            "anonymized_name": datasource_alias,
            "parent_class": "SimpleSqlalchemyDatasource",
            "anonymized_execution_engine": {
                "parent_class": "SqlAlchemyExecutionEngine"
            },
            "anonymized_data_connectors": [{
                "anonymized_name": connector_alias,
                "parent_class": "InferredAssetSqlDataConnector",
            }],
        },
        "success": True,
    }
    expected_calls = [mock.call(init_event), mock.call(yaml_config_event)]
    assert mock_emit.call_args_list == expected_calls

    print(json.dumps(yaml_report, indent=2))
    print(context.datasources)

    # Arguments are deliberately positional, exactly as in the user-facing
    # golden path.  The batch itself is not inspected any further here.
    limited_batch = context.get_batch(
        datasource_name,
        connector_name,
        "test_df",
    )
    # assert len(limited_batch.data.fetchall()) == 10

    # Asking for an asset that was not introspected must fail with KeyError.
    with pytest.raises(KeyError):
        limited_batch = context.get_batch(
            datasource_name,
            connector_name,
            "DOES_NOT_EXIST",
        )

    suite = ExpectationSuite("my_expectation_suite")
    validator = context.get_validator(
        datasource_name=datasource_name,
        data_connector_name=connector_name,
        data_asset_name="test_df",
        expectation_suite=suite,
    )
    validation_result = validator.expect_table_columns_to_match_set(
        column_set=[])
    print(validation_result)