Code example #1
    def __init__(self, conn, report):
        self.conn = conn
        self.report = report

        data_context_config = DataContextConfig(
            datasources={
                self.datasource_name: DatasourceConfig(
                    class_name="SqlAlchemyDatasource",
                    credentials={
                        # This isn't actually used since we pass the connection directly,
                        # but GE parses it to change some of its behavior so it's useful
                        # to emulate that here.
                        "url": self.conn.engine.url,
                    },
                )
            },
            store_backend_defaults=InMemoryStoreBackendDefaults(),
            anonymous_usage_statistics={
                "enabled": False,
                # "data_context_id": <not set>,
            },
        )

        with _properly_init_datasource(self.conn):
            self.data_context = BaseDataContext(
                project_config=data_context_config)
Code example #2
    def _ge_context(self) -> Iterator[GEContext]:
        with self.base_engine.connect() as conn:
            data_context = BaseDataContext(project_config=DataContextConfig(
                # The datasource will be added via add_datasource().
                datasources={},
                store_backend_defaults=InMemoryStoreBackendDefaults(),
                anonymous_usage_statistics={
                    "enabled": False,
                    # "data_context_id": <not set>,
                },
            ))

            datasource_name = f"{self._datasource_name_base}-{uuid.uuid4()}"
            datasource_config = DatasourceConfig(
                class_name="SqlAlchemyDatasource",
                credentials={
                    # This isn't actually used since we pass the connection directly,
                    # but GE parses it to change some of its behavior so it's useful
                    # to emulate that here.
                    "url": conn.engine.url,
                },
            )
            with _inject_connection_into_datasource(conn):
                # Using the add_datasource method ensures that the datasource is added to
                # GE-internal cache, which avoids problems when calling GE methods later on.
                assert data_context.add_datasource(
                    datasource_name,
                    initialize=True,
                    **dict(datasourceConfigSchema.dump(datasource_config)),
                )
            assert data_context.get_datasource(datasource_name)

            yield GEContext(data_context, datasource_name)
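A brief usage sketch may help here. It assumes that _ge_context is wrapped with @contextlib.contextmanager (the decorator is not visible in this excerpt) and that GEContext simply bundles the data_context and the generated datasource_name; since the datasource is tied to the connection opened inside _ge_context, all GE calls have to happen inside the with block. The call below would live in another method of the same class.

    # Usage sketch (assumptions: _ge_context is decorated with
    # @contextlib.contextmanager, and GEContext exposes .data_context and
    # .datasource_name as yielded above).
    with self._ge_context() as ge_context:
        # The temporary datasource only exists while the connection opened
        # inside _ge_context is alive, so use it within this block.
        datasource = ge_context.data_context.get_datasource(ge_context.datasource_name)
        assert datasource is not None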
Code example #3
def test_in_memory_data_context_configuration(
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    project_config_dict: dict = titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled.get_config(
        mode=ConfigOutputModes.DICT)
    project_config_dict["plugins_directory"] = None
    project_config_dict["validation_operators"] = {
        "action_list_operator": {
            "class_name":
            "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction"
                    },
                },
                {
                    "name": "store_evaluation_params",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction"
                    },
                },
                {
                    "name": "update_data_docs",
                    "action": {
                        "class_name": "UpdateDataDocsAction"
                    },
                },
            ],
        }
    }

    # Roundtrip through schema validation to remove any illegal fields and/or restore any missing fields.
    project_config_dict = dataContextConfigSchema.dump(project_config_dict)
    project_config_dict = dataContextConfigSchema.load(project_config_dict)

    project_config: DataContextConfig = DataContextConfig(
        **project_config_dict)
    data_context = BaseDataContext(
        project_config=project_config,
        context_root_dir=titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled.root_directory,
    )

    my_validator: Validator = data_context.get_validator(
        datasource_name="my_datasource",
        data_connector_name="my_basic_data_connector",
        data_asset_name="Titanic_1912",
        create_expectation_suite_with_name="my_test_titanic_expectation_suite",
    )

    assert my_validator.expect_table_row_count_to_equal(1313)["success"]
    assert my_validator.expect_table_column_count_to_equal(7)["success"]
Code example #4
def test__normalize_absolute_or_relative_path(tmp_path_factory, basic_data_context_config):
    config_path = str(tmp_path_factory.mktemp('test__normalize_absolute_or_relative_path__dir'))
    context = BaseDataContext(
        basic_data_context_config,
        config_path,
    )

    assert str(os.path.join("test__normalize_absolute_or_relative_path__dir0", "yikes")) in context._normalize_absolute_or_relative_path("yikes")

    assert "test__normalize_absolute_or_relative_path__dir" not in context._normalize_absolute_or_relative_path("/yikes")
    assert "/yikes" == context._normalize_absolute_or_relative_path("/yikes")
Code example #5
def test_in_memory_data_context_configuration(
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store,
):
    project_config_dict: dict = titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store.get_config(
        mode="dict")
    project_config_dict["plugins_directory"] = None
    project_config_dict["validation_operators"] = {
        "action_list_operator": {
            "class_name":
            "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction"
                    },
                },
                {
                    "name": "store_evaluation_params",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction"
                    },
                },
                {
                    "name": "update_data_docs",
                    "action": {
                        "class_name": "UpdateDataDocsAction"
                    },
                },
            ],
        }
    }
    project_config: DataContextConfig = DataContextConfig(
        **project_config_dict)
    data_context = BaseDataContext(
        project_config=project_config,
        context_root_dir=titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store.root_directory,
    )

    my_validator: Validator = data_context.get_validator(
        datasource_name="my_datasource",
        data_connector_name="my_basic_data_connector",
        data_asset_name="Titanic_1912",
        create_expectation_suite_with_name="my_test_titanic_expectation_suite",
    )

    assert my_validator.expect_table_row_count_to_equal(1313)["success"]
    assert my_validator.expect_table_column_count_to_equal(7)["success"]
Code example #6
    def build_context(self):
        """
            Purpose:
                Create a DataContext and Datasource and add them to the object.
            Returns:
                Saves the DataContext and Datasource to self.
        """
        self.context = ge.get_context()

        # Create the datasource configuration
        datasource_config = {
            "name": "example_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "default_runtime_data_connector_name": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_identifier_name"],
                },
            },
        }

        # Create the data context configuration
        data_context_config = DataContextConfig(
            datasources={
                "pandas": DatasourceConfig(
                    class_name="Datasource",
                    execution_engine={
                        "class_name": "PandasExecutionEngine"
                    },
                    data_connectors={
                        "default_runtime_data_connector_name": {
                            "class_name": "RuntimeDataConnector",
                            "batch_identifiers": ["default_identifier_name"],
                        }
                    },
                )
            },
            store_backend_defaults=FilesystemStoreBackendDefaults(root_directory=os.path.join(os.getcwd(), 'great_expectations')),
        )

        # Build the context and add the data source
        self.context = BaseDataContext(project_config=data_context_config)
        # self.context.test_yaml_config(yaml.dump(datasource_config))
        self.context.add_datasource(**datasource_config)
Code example #7
def test_usage_statistics_handler_build_envelope(
        in_memory_data_context_config_usage_stats_enabled,
        sample_partial_message):
    """This test is for a happy path only but will fail if there is an exception thrown in build_envelope"""

    context: BaseDataContext = BaseDataContext(
        in_memory_data_context_config_usage_stats_enabled)

    usage_statistics_handler = UsageStatisticsHandler(
        data_context=context,
        data_context_id=in_memory_data_context_config_usage_stats_enabled.
        anonymous_usage_statistics.data_context_id,
        usage_statistics_url=in_memory_data_context_config_usage_stats_enabled.
        anonymous_usage_statistics.usage_statistics_url,
    )

    assert (usage_statistics_handler._data_context_id ==
            "00000000-0000-0000-0000-000000000001")

    envelope = usage_statistics_handler.build_envelope(sample_partial_message)
    required_keys = [
        "event",
        "event_payload",
        "version",
        "ge_version",
        "data_context_id",
        "data_context_instance_id",
        "event_time",
    ]
    assert all(key in envelope for key in required_keys)

    assert envelope["version"] == "1.0.0"
    assert envelope["data_context_id"] == "00000000-0000-0000-0000-000000000001"
Code example #8
def test_opt_out_env_var_overrides_home_folder(
        in_memory_data_context_config_usage_stats_enabled, tmp_path_factory,
        monkeypatch):
    monkeypatch.delenv("GE_USAGE_STATS",
                       raising=False)  # Undo the project-wide test default
    home_config_dir = tmp_path_factory.mktemp("home_dir")
    home_config_dir = str(home_config_dir)
    etc_config_dir = tmp_path_factory.mktemp("etc")
    etc_config_dir = str(etc_config_dir)
    config_dirs = [home_config_dir, etc_config_dir]
    config_dirs = [
        os.path.join(config_dir, "great_expectations.conf")
        for config_dir in config_dirs
    ]

    enabled_config = configparser.ConfigParser()
    enabled_config["anonymous_usage_statistics"] = {"enabled": "True"}

    with open(os.path.join(home_config_dir, "great_expectations.conf"),
              "w") as configfile:
        enabled_config.write(configfile)

    monkeypatch.setenv("GE_USAGE_STATS", "False")

    with mock.patch(
            "great_expectations.data_context.AbstractDataContext.GLOBAL_CONFIG_PATHS",
            config_dirs,
    ):
        assert (in_memory_data_context_config_usage_stats_enabled.
                anonymous_usage_statistics.enabled is True)
        context = BaseDataContext(
            in_memory_data_context_config_usage_stats_enabled)
        project_config = context._project_config
        assert project_config.anonymous_usage_statistics.enabled is False
Code example #9
def test_get_site_names_with_three_sites(tmpdir, basic_data_context_config):
    basic_data_context_config.data_docs_sites = {}
    for i in range(3):
        basic_data_context_config.data_docs_sites[f"site-{i}"] = {
            "class_name": "SiteBuilder",
            "store_backend": {
                "class_name": "TupleFilesystemStoreBackend",
                "base_directory": f"uncommitted/data_docs/site-{i}/",
            },
            "site_index_builder": {
                "class_name": "DefaultSiteIndexBuilder"
            },
        }
    context = BaseDataContext(basic_data_context_config,
                              context_root_dir=tmpdir)
    assert context.get_site_names() == ["site-0", "site-1", "site-2"]
Code example #10
def test_usage_statistics_handler_validate_message_success(
        caplog, in_memory_data_context_config_usage_stats_enabled,
        sample_partial_message):

    # caplog default is WARNING and above, we want to see DEBUG level messages for this test
    caplog.set_level(
        level=logging.DEBUG,
        logger="great_expectations.core.usage_statistics.usage_statistics",
    )

    context: BaseDataContext = BaseDataContext(
        in_memory_data_context_config_usage_stats_enabled)

    usage_statistics_handler = UsageStatisticsHandler(
        data_context=context,
        data_context_id=in_memory_data_context_config_usage_stats_enabled.
        anonymous_usage_statistics.data_context_id,
        usage_statistics_url=in_memory_data_context_config_usage_stats_enabled.
        anonymous_usage_statistics.usage_statistics_url,
    )

    assert (usage_statistics_handler._data_context_id ==
            "00000000-0000-0000-0000-000000000001")

    envelope = usage_statistics_handler.build_envelope(sample_partial_message)
    validated_message = usage_statistics_handler.validate_message(
        envelope, anonymized_usage_statistics_record_schema)

    assert validated_message
    assert not usage_stats_invalid_messages_exist(caplog.messages)
Code example #11
def test_ConfigOnlyDataContext_v013__initialization(
    tmp_path_factory, basic_data_context_v013_config
):
    config_path = str(
        tmp_path_factory.mktemp("test_ConfigOnlyDataContext__initialization__dir")
    )
    context = BaseDataContext(
        basic_data_context_v013_config,
        config_path,
    )

    assert len(context.plugins_directory.split("/")[-3:]) == 3
    assert "" in context.plugins_directory.split("/")[-3:]

    pattern = re.compile(r"test_ConfigOnlyDataContext__initialization__dir\d*")
    assert (
        len(
            list(
                filter(
                    lambda element: element,
                    sorted(
                        pattern.match(element) is not None
                        for element in context.plugins_directory.split("/")[-3:]
                    ),
                )
            )
        )
        == 1
    )
Code example #12
def test_opt_out_etc(in_memory_data_context_config_usage_stats_enabled,
                     tmp_path_factory):
    home_config_dir = tmp_path_factory.mktemp("home_dir")
    home_config_dir = str(home_config_dir)
    etc_config_dir = tmp_path_factory.mktemp("etc")
    etc_config_dir = str(etc_config_dir)
    config_dirs = [home_config_dir, etc_config_dir]
    config_dirs = [
        os.path.join(config_dir, "great_expectations.conf")
        for config_dir in config_dirs
    ]

    enabled_config = configparser.ConfigParser()
    enabled_config["anonymous_usage_statistics"] = {"enabled": True}

    disabled_config = configparser.ConfigParser()
    disabled_config["anonymous_usage_statistics"] = {"enabled": False}

    with open(os.path.join(etc_config_dir, "great_expectations.conf"),
              'w') as configfile:
        disabled_config.write(configfile)

    with mock.patch(
            "great_expectations.data_context.BaseDataContext.GLOBAL_CONFIG_PATHS",
            config_dirs):
        assert in_memory_data_context_config_usage_stats_enabled.anonymous_usage_statistics.enabled is True
        context = BaseDataContext(
            in_memory_data_context_config_usage_stats_enabled)
        project_config = context._project_config
        assert project_config.anonymous_usage_statistics.enabled is False
Code example #13
def test_opt_out_environment_variable(in_memory_data_context_config_usage_stats_enabled, monkeypatch):
    """Set the env variable GE_USAGE_STATS value to any of the following: FALSE, False, false, 0"""
    monkeypatch.setenv("GE_USAGE_STATS", "False")
    assert in_memory_data_context_config_usage_stats_enabled.anonymous_usage_statistics.enabled is True
    context = BaseDataContext(in_memory_data_context_config_usage_stats_enabled)
    project_config = context._project_config
    assert project_config.anonymous_usage_statistics.enabled is False
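Outside of a pytest/monkeypatch setting, the same opt-out can be expressed by setting the environment variable before the context is created. A minimal sketch, where project_config is a placeholder for any DataContextConfig built elsewhere:

import os
from great_expectations.data_context import BaseDataContext

os.environ["GE_USAGE_STATS"] = "False"  # any of FALSE / False / false / 0, per the docstring above
context = BaseDataContext(project_config)  # project_config: a DataContextConfig built elsewhere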
Code example #14
def test_opt_out_etc(in_memory_data_context_config_usage_stats_enabled,
                     tmp_path_factory, monkeypatch):
    monkeypatch.delenv("GE_USAGE_STATS",
                       raising=False)  # Undo the project-wide test default
    home_config_dir = tmp_path_factory.mktemp("home_dir")
    home_config_dir = str(home_config_dir)
    etc_config_dir = tmp_path_factory.mktemp("etc")
    etc_config_dir = str(etc_config_dir)
    config_dirs = [home_config_dir, etc_config_dir]
    config_dirs = [
        os.path.join(config_dir, "great_expectations.conf")
        for config_dir in config_dirs
    ]

    for false_string in ["False", "false", "f", "FALSE"]:
        disabled_config = configparser.ConfigParser()
        disabled_config["anonymous_usage_statistics"] = {
            "enabled": false_string
        }

        with open(os.path.join(etc_config_dir, "great_expectations.conf"),
                  "w") as configfile:
            disabled_config.write(configfile)

        with mock.patch(
                "great_expectations.data_context.BaseDataContext.GLOBAL_CONFIG_PATHS",
                config_dirs,
        ):
            assert (in_memory_data_context_config_usage_stats_enabled.
                    anonymous_usage_statistics.enabled is True)
            context = BaseDataContext(
                deepcopy(in_memory_data_context_config_usage_stats_enabled))
            project_config = context._project_config
            assert project_config.anonymous_usage_statistics.enabled is False
Code example #15
def _add_checkpoint(
    context: BaseDataContext,
    backend_api: str,
    datasource_name: str,
    data_connector_name: str,
    checkpoint_name: str,
    suite_and_asset_names=None,
) -> SimpleCheckpoint:
    # Avoid a mutable default argument; fall back to an empty list.
    suite_and_asset_names = suite_and_asset_names or []
    if backend_api == "V3":
        validations = [
            {
                "expectation_suite_name": suite_and_asset_name,
                "batch_request": {
                    "datasource_name": datasource_name,
                    "data_connector_name": data_connector_name,
                    "data_asset_name": suite_and_asset_name,
                    "batch_spec_passthrough": {"create_temp_table": False},
                },
            }
            for suite_and_asset_name in suite_and_asset_names
        ]
        return context.add_checkpoint(
            name=checkpoint_name,
            class_name="SimpleCheckpoint",
            validations=validations,
            run_name_template="my_run_name",
        )
    elif backend_api == "V2":
        batches = [
            {
                "expectation_suite_names": [suite_and_asset_name],
                "batch_kwargs": {
                    "datasource": datasource_name,
                    "data_asset_name": suite_and_asset_name,
                    "table": suite_and_asset_name,
                    "batch_spec_passthrough": {"create_temp_table": False},
                },
            }
            for suite_and_asset_name in suite_and_asset_names
        ]
        return context.add_checkpoint(
            name=checkpoint_name,
            class_name="LegacyCheckpoint",
            batches=batches,
        )
    else:
        raise ValueError(f"Unsupported backend_api {backend_api}")
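A hypothetical call of the helper above, for orientation only: context is assumed to be a BaseDataContext with a SQL datasource already registered, and the datasource, connector, checkpoint, and suite/asset names below are placeholders rather than values from the original test suite.

checkpoint = _add_checkpoint(
    context=context,
    backend_api="V3",
    datasource_name="my_sql_datasource",
    data_connector_name="my_data_connector",
    checkpoint_name="my_checkpoint",
    suite_and_asset_names=["table_a", "table_b"],
)
# Checkpoints registered via add_checkpoint can then be executed by name.
results = context.run_checkpoint(checkpoint_name="my_checkpoint")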
Code example #16
def context_with_two_sites(titanic_data_context_stats_enabled_config_version_3):
    context = titanic_data_context_stats_enabled_config_version_3
    config = context.get_config_with_variables_substituted()
    config.data_docs_sites["team_site"] = {
        "class_name": "SiteBuilder",
        "store_backend": {
            "class_name": "TupleFilesystemStoreBackend",
            "base_directory": "uncommitted/data_docs/team_site/",
        },
        "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"},
    }
    temp_context = BaseDataContext(config, context_root_dir=context.root_directory)
    new_context = DataContext(context.root_directory)
    new_context.set_config(temp_context.get_config_with_variables_substituted())
    new_context._save_project_config()
    assert new_context.get_site_names() == ["local_site", "team_site"]
    return new_context
Code example #17
def test_data_context_concurrency_property():
    data_context = BaseDataContext(
        project_config=DataContextConfig(
            concurrency=ConcurrencyConfig(enabled=True),
            store_backend_defaults=InMemoryStoreBackendDefaults(),
        )
    )
    assert data_context.concurrency.enabled
Code example #18
def test_consistent_name_anonymization(in_memory_data_context_config_usage_stats_enabled, monkeypatch):
    monkeypatch.delenv("GE_USAGE_STATS", raising=False)  # Undo the project-wide test default
    context = BaseDataContext(in_memory_data_context_config_usage_stats_enabled)
    assert context.data_context_id == "00000000-0000-0000-0000-000000000001"
    payload = run_validation_operator_usage_statistics(
        context, "action_list_operator",
        assets_to_validate=[({"__fake_batch_kwargs": "mydatasource"}, "__fake_expectation_suite_name")], run_id="foo")
    # For a *specific* data_context_id, all names will be consistently anonymized
    assert payload["anonymized_operator_name"] == 'e079c942d946b823312054118b3b6ef4'
Code example #19
def test_inaccessible_active_bucket_warning_messages(caplog):
    """
    What does this test do and why?

    Trying to create a data context with unreachable ACTIVE stores should show a warning message once per store
    e.g. Invalid store configuration: Please check the configuration of your TupleS3StoreBackend named expectations_S3_store
    Active stores are those named in:
    "expectations_store_name", "validations_store_name", "evaluation_parameter_store_name"
    """

    bucket = "leakybucket"
    expectations_store_prefix = "expectations_store_prefix"
    validations_store_prefix = "validations_store_prefix"
    data_docs_store_prefix = "data_docs_store_prefix"

    # Create a bucket in Moto's mock AWS environment
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket)

    # Create a DataContext
    # Add inactive stores
    inactive_bucket = "inactive_leakybucket"
    stores = {
        "expectations_S3_store": {
            "class_name": "ExpectationsStore",
            "store_backend": {
                "class_name": "TupleS3StoreBackend",
                "bucket": inactive_bucket,
                "prefix": expectations_store_prefix,
            },
        },
        "validations_S3_store": {
            "class_name": "ValidationsStore",
            "store_backend": {
                "class_name": "TupleS3StoreBackend",
                "bucket": inactive_bucket,
                "prefix": validations_store_prefix,
            },
        },
        "evaluation_parameter_store": {
            "class_name": "EvaluationParameterStore"
        },
    }
    in_code_data_context_project_config = build_in_code_data_context_project_config(
        bucket="leakybucket",
        expectations_store_prefix=expectations_store_prefix,
        validations_store_prefix=validations_store_prefix,
        data_docs_store_prefix=data_docs_store_prefix,
        stores=stores,
    )
    _ = BaseDataContext(project_config=in_code_data_context_project_config)
    assert (caplog.messages.count(
        "Invalid store configuration: Please check the configuration of your TupleS3StoreBackend named expectations_S3_store"
    ) == 1)
    assert (caplog.messages.count(
        "Invalid store configuration: Please check the configuration of your TupleS3StoreBackend named validations_S3_store"
    ) == 1)
Code example #20
def test_ConfigOnlyDataContext__initialization(tmp_path_factory, basic_data_context_config):
    config_path = str(tmp_path_factory.mktemp('test_ConfigOnlyDataContext__initialization__dir'))
    context = BaseDataContext(
        basic_data_context_config,
        config_path,
    )

    assert context.root_directory.split("/")[-1] == "test_ConfigOnlyDataContext__initialization__dir0"
    assert context.plugins_directory.split("/")[-3:] == ["test_ConfigOnlyDataContext__initialization__dir0", "plugins", ""]
Code example #21
def test_dataset_from_pandas_source(tmpdir):
    data_file = tmpdir + '/data.json'
    json_data = [
        {"name": "my name", "birthdate": "2020-10-01", "address": "1234 Main st", "size": 12},
        {"name": "your name", "birthdate": "2020-06-01", "address": "1313 Mockingbird Ln",
         "size": 12}
    ]
    with open(data_file, mode='w') as out:
        json.dump(json_data, out)

    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name

    ctx = BaseDataContext(project_config=project_config)
    pd_dataset = PandasDataset(pandas.read_json(data_file),
                               **{'batch_kwargs': {'path': 'gcs://my_bucket/path/to/my/data'},
                                  'data_context': ctx})
    action = OpenLineageValidationAction(ctx,
                                         openlineage_host='http://localhost:5000',
                                         openlineage_namespace='test_ns',
                                         job_name='test_job')

    datasets = action._fetch_datasets_from_pandas_source(pd_dataset,
                                                         validation_result_suite=result_suite)
    assert len(datasets) == 1
    input_ds = datasets[0]
    assert input_ds.name == '/path/to/my/data'
    assert input_ds.namespace == "gcs://my_bucket"

    assert "dataSource" in input_ds.facets
    assert input_ds.facets["dataSource"].name == "gcs://my_bucket"
    assert input_ds.facets["dataSource"].uri == 'gcs://my_bucket'

    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 4
    assert all(f in input_ds.facets['schema'].fields
               for f in [SchemaField('name', 'object'),
                         SchemaField('birthdate', 'object'),
                         SchemaField('address', 'object'),
                         SchemaField('size', 'int64')])

    assert len(input_ds.inputFacets) == 3
    assert all(k in input_ds.inputFacets for k in
               ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics'])
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assert len(input_ds.inputFacets['greatExpectations_assertions'].assertions) == 2
    assert all(a in input_ds.inputFacets['greatExpectations_assertions'].assertions
               for a in [GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
                         GreatExpectationsAssertion('expect_column_sum_to_be_between', True,
                                                    'size')])
Code example #22
def test__normalize_absolute_or_relative_path(tmp_path_factory,
                                              basic_data_context_v013_config):
    config_path = str(
        tmp_path_factory.mktemp(
            "test__normalize_absolute_or_relative_path__dir"))
    context = BaseDataContext(
        basic_data_context_v013_config,
        config_path,
    )

    pattern_string = os.path.join(
        "^.*test__normalize_absolute_or_relative_path__dir\\d*", "yikes$")
    pattern = re.compile(pattern_string)
    assert (pattern.match(
        context._normalize_absolute_or_relative_path("yikes")) is not None)

    assert ("test__normalize_absolute_or_relative_path__dir"
            not in context._normalize_absolute_or_relative_path("/yikes"))
    assert "/yikes" == context._normalize_absolute_or_relative_path("/yikes")
Code example #23
def test_load_config_variables_file(basic_data_context_config,
                                    tmp_path_factory):
    # Setup:
    base_path = str(tmp_path_factory.mktemp('test_load_config_variables_file'))
    safe_mmkdir(os.path.join(base_path, "uncommitted"))
    with open(os.path.join(base_path, "uncommitted", "dev_variables.yml"),
              "w") as outfile:
        yaml.dump({'env': 'dev'}, outfile)
    with open(os.path.join(base_path, "uncommitted", "prod_variables.yml"),
              "w") as outfile:
        yaml.dump({'env': 'prod'}, outfile)
    basic_data_context_config[
        "config_variables_file_path"] = "uncommitted/${TEST_CONFIG_FILE_ENV}_variables.yml"

    try:
        # We should be able to load different files based on an environment variable
        os.environ["TEST_CONFIG_FILE_ENV"] = "dev"
        context = BaseDataContext(basic_data_context_config,
                                  context_root_dir=base_path)
        config_vars = context._load_config_variables_file()
        assert config_vars['env'] == 'dev'
        os.environ["TEST_CONFIG_FILE_ENV"] = "prod"
        context = BaseDataContext(basic_data_context_config,
                                  context_root_dir=base_path)
        config_vars = context._load_config_variables_file()
        assert config_vars['env'] == 'prod'
    finally:
        # Make sure we unset the environment variable we're using
        del os.environ["TEST_CONFIG_FILE_ENV"]
Code example #24
def test_load_config_variables_file(basic_data_context_v013_config,
                                    tmp_path_factory, monkeypatch):
    # Setup:
    base_path = str(tmp_path_factory.mktemp("test_load_config_variables_file"))
    os.makedirs(os.path.join(base_path, "uncommitted"), exist_ok=True)
    with open(os.path.join(base_path, "uncommitted", "dev_variables.yml"),
              "w") as outfile:
        yaml.dump({"env": "dev"}, outfile)
    with open(os.path.join(base_path, "uncommitted", "prod_variables.yml"),
              "w") as outfile:
        yaml.dump({"env": "prod"}, outfile)
    basic_data_context_v013_config[
        "config_variables_file_path"] = "uncommitted/${TEST_CONFIG_FILE_ENV}_variables.yml"

    try:
        # We should be able to load different files based on an environment variable
        monkeypatch.setenv("TEST_CONFIG_FILE_ENV", "dev")
        context = BaseDataContext(basic_data_context_v013_config,
                                  context_root_dir=base_path)
        config_vars = context._load_config_variables_file()
        assert config_vars["env"] == "dev"
        monkeypatch.setenv("TEST_CONFIG_FILE_ENV", "prod")
        context = BaseDataContext(basic_data_context_v013_config,
                                  context_root_dir=base_path)
        config_vars = context._load_config_variables_file()
        assert config_vars["env"] == "prod"
    finally:
        # Make sure we unset the environment variable we're using
        monkeypatch.delenv("TEST_CONFIG_FILE_ENV")
Code example #25
def test_DataContext_construct_data_context_id_uses_id_stored_in_DataContextConfig_if_no_configured_expectations_store(
    monkeypatch,
):
    """
    What does this test and why?

    A DataContext should have an id. This ID should come from either:
    1. configured expectations store store_backend_id
    2. great_expectations.yml
    3. new generated id from DataContextConfig
    This test verifies that DataContext._construct_data_context_id
    uses the data_context_id from DataContextConfig when there is no configured expectations store
    when instantiating the DataContext,
    and also that this data_context_id is used to configure the expectations_store.store_backend_id
    """
    monkeypatch.delenv(
        "GE_USAGE_STATS", raising=False
    )  # Undo the project-wide test default

    bucket = "leakybucket"
    expectations_store_prefix = "expectations_store_prefix"
    validations_store_prefix = "validations_store_prefix"
    data_docs_store_prefix = "data_docs_store_prefix"
    manually_created_uuid = "00000000-0000-0000-0000-000000000eee"

    # Create a bucket in Moto's mock AWS environment
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket)

    # Create a DataContext (note NO existing expectations store already set up)
    in_code_data_context_project_config = build_in_code_data_context_project_config(
        bucket="leakybucket",
        expectations_store_prefix=expectations_store_prefix,
        validations_store_prefix=validations_store_prefix,
        data_docs_store_prefix=data_docs_store_prefix,
    )
    # Manually set the data_context_id in the project_config
    in_code_data_context_project_config.anonymous_usage_statistics.data_context_id = (
        manually_created_uuid
    )
    in_code_data_context = BaseDataContext(
        project_config=in_code_data_context_project_config
    )

    # Make sure the manually set data_context_id is propagated to all the appropriate places
    assert (
        manually_created_uuid
        == in_code_data_context.data_context_id
        == in_code_data_context.stores[
            in_code_data_context.expectations_store_name
        ].store_backend_id
    )
Code example #26
def build_in_memory_runtime_context():
    data_context_config: DataContextConfig = DataContextConfig(
        datasources={
            "pandas_datasource": {
                "execution_engine": {
                    "class_name": "PandasExecutionEngine",
                    "module_name": "great_expectations.execution_engine",
                },
                "class_name": "Datasource",
                "module_name": "great_expectations.datasource",
                "data_connectors": {
                    "runtime_data_connector": {
                        "class_name": "RuntimeDataConnector",
                        "batch_identifiers": [
                            "id_key_0",
                            "id_key_1",
                        ],
                    }
                },
            },
            "spark_datasource": {
                "execution_engine": {
                    "class_name": "SparkDFExecutionEngine",
                    "module_name": "great_expectations.execution_engine",
                },
                "class_name": "Datasource",
                "module_name": "great_expectations.datasource",
                "data_connectors": {
                    "runtime_data_connector": {
                        "class_name": "RuntimeDataConnector",
                        "batch_identifiers": [
                            "id_key_0",
                            "id_key_1",
                        ],
                    }
                },
            },
        },
        expectations_store_name="expectations_store",
        validations_store_name="validations_store",
        evaluation_parameter_store_name="evaluation_parameter_store",
        checkpoint_store_name="checkpoint_store",
        store_backend_defaults=InMemoryStoreBackendDefaults(),
    )

    context: BaseDataContext = BaseDataContext(
        project_config=data_context_config)

    return context
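As an illustration of how the in-memory context above can be exercised (not part of the original snippet), a pandas DataFrame can be validated through the "pandas_datasource"/"runtime_data_connector" pair by means of a RuntimeBatchRequest; the asset name, suite name, and batch identifier values are placeholders.

import pandas as pd

from great_expectations.core.batch import RuntimeBatchRequest

context = build_in_memory_runtime_context()
context.create_expectation_suite("my_suite")

batch_request = RuntimeBatchRequest(
    datasource_name="pandas_datasource",
    data_connector_name="runtime_data_connector",
    data_asset_name="my_asset",
    runtime_parameters={"batch_data": pd.DataFrame({"col": [1, 2, 3]})},
    batch_identifiers={"id_key_0": "a", "id_key_1": "b"},
)
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="my_suite",
)
validator.expect_column_values_to_not_be_null("col")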
Code example #27
def test_consistent_name_anonymization(
        in_memory_data_context_config_usage_stats_enabled):
    context = BaseDataContext(
        in_memory_data_context_config_usage_stats_enabled)
    assert context.data_context_id == "6a52bdfa-e182-455b-a825-e69f076e67d6"
    payload = run_validation_operator_usage_statistics(
        context,
        "action_list_operator",
        assets_to_validate=[({
            "__fake_batch_kwargs": "mydatasource"
        }, "__fake_expectation_suite_name")],
        run_id="foo")
    # For a *specific* data_context_id, all names will be consistently anonymized
    assert payload["anonymized_operator_name"] == '5bb011891aa7d41401e57759d5f5cb01'
Code example #28
def test_dataset_from_sql_source(test_db_file, tmpdir):
    connection_url = f'sqlite:///{test_db_file}'
    engine = create_engine(connection_url)

    ds = SqlAlchemyDataset(table_name=TABLE_NAME, engine=engine)

    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name

    ctx = BaseDataContext(project_config=project_config)
    action = OpenLineageValidationAction(ctx,
                                         openlineage_host='http://localhost:5000',
                                         openlineage_namespace='test_ns',
                                         job_name='test_job')
    datasets = action._fetch_datasets_from_sql_source(ds, result_suite)
    assert datasets is not None
    assert len(datasets) == 1
    input_ds = datasets[0]
    assert input_ds.name == TABLE_NAME
    assert input_ds.namespace == "sqlite"

    assert "dataSource" in input_ds.facets
    assert input_ds.facets["dataSource"].name == "sqlite"
    assert input_ds.facets["dataSource"].uri == "sqlite:/" + test_db_file

    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 4
    assert all(f in input_ds.facets['schema'].fields
               for f in [SchemaField('name', 'TEXT'),
                         SchemaField('birthdate', 'TEXT'),
                         SchemaField('address', 'TEXT'),
                         SchemaField('size', 'INTEGER')])

    assert len(input_ds.inputFacets) == 3
    assert all(k in input_ds.inputFacets for k in
               ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics'])
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assert len(input_ds.inputFacets['greatExpectations_assertions'].assertions) == 2
    assert all(a in input_ds.inputFacets['greatExpectations_assertions'].assertions
               for a in [GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
                         GreatExpectationsAssertion('expect_column_sum_to_be_between', True,
                                                    'size')])
Code example #29
    def __init__(self,
                 *,
                 gcp_project,
                 expectation_suite_name,
                 gcs_bucket,
                 gcs_expectations_prefix,
                 gcs_validations_prefix,
                 gcs_datadocs_prefix,
                 query=None,
                 table=None,
                 bq_dataset_name,
                 email_to,
                 datadocs_domain='none',
                 send_alert_email=True,
                 datadocs_link_in_email=False,
                 fail_task_on_validation_failure=True,
                 bigquery_conn_id='bigquery_default',
                 **kwargs):

        self.query = query
        self.table = table
        self.bigquery_conn_id = bigquery_conn_id
        self.bq_dataset_name = bq_dataset_name
        self.email_to = email_to
        self.gcp_project = gcp_project
        self.gcs_bucket = gcs_bucket
        self.gcs_expectations_prefix = gcs_expectations_prefix
        self.gcs_validations_prefix = gcs_validations_prefix
        self.gcs_datadocs_prefix = gcs_datadocs_prefix
        self.datadocs_domain = datadocs_domain
        self.send_alert_email = send_alert_email
        self.datadocs_link_in_email = datadocs_link_in_email
        self.fail_task_on_validation_failure = fail_task_on_validation_failure

        # Create a data context and batch_kwargs that will then be handed off to the base operator to do the
        # data validation against Expectations.
        data_context_config = self.create_data_context_config()
        data_context = BaseDataContext(project_config=data_context_config)
        batch_kwargs = self.get_batch_kwargs()
        # Call the parent constructor but override the default alerting behavior in the parent by hard coding
        # fail_task_on_validation_failure=False.  This is done because we want to alert a little differently
        # than the parent class by sending an email to the user and then throwing an Airflow exception whenever
        # data doesn't match Expectations.
        super().__init__(data_context=data_context,
                         batch_kwargs=batch_kwargs,
                         expectation_suite_name=expectation_suite_name,
                         fail_task_on_validation_failure=False,
                         **kwargs)
Code example #30
    def validate_with_great_expectations(
        self,
        dataframe: TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
        expectation_suite: TypeVar("ge.core.ExpectationSuite"),  # noqa: F821
        ge_validate_kwargs: Optional[dict],
    ):
        # NOTE: InMemoryStoreBackendDefaults SHOULD NOT BE USED in normal settings. You
        # may experience data loss as it persists nothing. It is used here for testing.
        # Please refer to docs to learn how to instantiate your DataContext.
        store_backend_defaults = InMemoryStoreBackendDefaults()
        data_context_config = DataContextConfig(
            store_backend_defaults=store_backend_defaults,
            checkpoint_store_name=store_backend_defaults.checkpoint_store_name,
        )
        context = BaseDataContext(project_config=data_context_config)

        datasource = {
            "name": "my_spark_dataframe",
            "class_name": "Datasource",
            "execution_engine": {
                "class_name": "SparkDFExecutionEngine",
                "force_reuse_spark_context": True,
            },
            "data_connectors": {
                "default_runtime_data_connector_name": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["batch_id"],
                }
            },
        }
        context.add_datasource(**datasource)

        # Here is a RuntimeBatchRequest using a dataframe
        batch_request = RuntimeBatchRequest(
            datasource_name="my_spark_dataframe",
            data_connector_name="default_runtime_data_connector_name",
            data_asset_name="<YOUR_MEANINGFUL_NAME>",  # This can be anything that identifies this data_asset for you
            batch_identifiers={"batch_id": "default_identifier"},
            runtime_parameters={"batch_data": dataframe},  # Your dataframe goes here
        )
        context.save_expectation_suite(expectation_suite)
        validator = context.get_validator(
            batch_request=batch_request,
            expectation_suite_name=expectation_suite.expectation_suite_name,
        )
        report = validator.validate(**ge_validate_kwargs)

        return report
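Finally, a hypothetical call site for the method above, to show the expected inputs. The Spark session, the instance name obj, and the expectation suite contents are illustrative assumptions, not part of the original code.

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.core.expectation_suite import ExpectationSuite
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])

suite = ExpectationSuite(expectation_suite_name="my_suite")
suite.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "id"},
    )
)

# obj is an instance of the class that defines validate_with_great_expectations.
report = obj.validate_with_great_expectations(
    dataframe=df,
    expectation_suite=suite,
    ge_validate_kwargs={},
)
assert report.success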