def test_opt_out_env_var_overrides_yml(tmp_path_factory, monkeypatch):
    monkeypatch.delenv(
        "GE_USAGE_STATS", raising=False
    )  # Undo the project-wide test default
    project_path = str(tmp_path_factory.mktemp("data_context"))
    context_path = os.path.join(project_path, "great_expectations")
    os.makedirs(context_path, exist_ok=True)
    fixture_dir = file_relative_path(__file__, "../../test_fixtures")

    shutil.copy(
        os.path.join(
            fixture_dir, "great_expectations_basic_with_usage_stats_enabled.yml"
        ),
        str(os.path.join(context_path, "great_expectations.yml")),
    )

    assert (
        DataContext(
            context_root_dir=context_path
        )._project_config.anonymous_usage_statistics.enabled
        is True
    )

    monkeypatch.setenv("GE_USAGE_STATS", "False")
    context = DataContext(context_root_dir=context_path)
    project_config = context._project_config
    assert project_config.anonymous_usage_statistics.enabled is False
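The same opt-out works outside the test suite: setting GE_USAGE_STATS before the context is constructed disables anonymous usage statistics even when great_expectations.yml enables them. A minimal sketch, assuming an already-initialized project in ./great_expectations (in practice the variable would be exported in the shell rather than set in-process):

import os

from great_expectations.data_context import DataContext

os.environ["GE_USAGE_STATS"] = "False"  # opt out of anonymous usage statistics
context = DataContext(context_root_dir="great_expectations")
assert context._project_config.anonymous_usage_statistics.enabled is False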
def test_data_context_do_all_uncommitted_dirs_exist(tmp_path_factory):
    expected = """\
uncommitted/
    config_variables.yml
    data_docs/
    samples/
    validations/
"""
    project_path = str(tmp_path_factory.mktemp('stuff'))
    ge_dir = os.path.join(project_path, "great_expectations")
    uncommitted_dir = os.path.join(ge_dir, "uncommitted")
    DataContext.create(project_path)
    fixture = gen_directory_tree_str(uncommitted_dir)
    print(fixture)
    assert fixture == expected

    # Test that all exist
    assert DataContext.all_uncommitted_directories_exist(ge_dir)

    # remove a few
    shutil.rmtree(os.path.join(uncommitted_dir, "data_docs"))
    shutil.rmtree(os.path.join(uncommitted_dir, "validations"))

    # Test that not all exist
    assert not DataContext.all_uncommitted_directories_exist(ge_dir)
Example #3
def profile(datasource_name, data_assets, profile_all_data_assets, directory):
    """
    Profile datasources from the specified context.

    If the optional data_assets and profile_all_data_assets arguments are not specified, the profiler will check
    if the number of data assets in the datasource exceeds the internally defined limit. If it does, it will
    prompt the user to either specify the list of data assets to profile or to profile all.
    If the limit is not exceeded, the profiler will profile all data assets in the datasource.

    :param datasource_name: name of the datasource to profile
    :param data_assets: if this comma-separated list of data asset names is provided, only the specified data assets will be profiled
    :param profile_all_data_assets: if provided, all data assets will be profiled
    :param directory: path to the project directory in which to look for the great_expectations context configuration
    :return: None
    """

    try:
        context = DataContext(directory)
    except ConfigNotFoundError:
        cli_message("Error: no great_expectations context configuration found in the specified directory.")
        return

    if datasource_name is None:
        datasources = [datasource["name"] for datasource in context.list_datasources()]
        if len(datasources) > 1:
            cli_message("Error: please specify the datasource to profile. Available datasources: " + ", ".join(datasources))
            return
        else:
            profile_datasource(context, datasources[0], data_assets=data_assets, profile_all_data_assets=profile_all_data_assets)
    else:
        profile_datasource(context, datasource_name, data_assets=data_assets, profile_all_data_assets=profile_all_data_assets)
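Because the click decorators are not reproduced in this listing, the command body above can be exercised as a plain function. A hypothetical call, with made-up datasource and asset names, that profiles only the two listed assets:

# Hypothetical direct invocation; in the installed CLI these values arrive as
# command-line arguments and options rather than Python keyword arguments.
profile(
    datasource_name="my_datasource",
    data_assets="users,orders",         # comma-separated list, per the docstring
    profile_all_data_assets=False,
    directory="great_expectations",     # directory holding great_expectations.yml
)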
def test_opt_out_etc_overrides_yml(tmp_path_factory, monkeypatch):
    monkeypatch.delenv("GE_USAGE_STATS", raising=False)  # Undo the project-wide test default
    home_config_dir = tmp_path_factory.mktemp("home_dir")
    home_config_dir = str(home_config_dir)
    etc_config_dir = tmp_path_factory.mktemp("etc")
    etc_config_dir = str(etc_config_dir)
    config_dirs = [home_config_dir, etc_config_dir]
    config_dirs = [
        os.path.join(config_dir, "great_expectations.conf") for config_dir in config_dirs
    ]

    disabled_config = configparser.ConfigParser()
    disabled_config["anonymous_usage_statistics"] = {"enabled": "False"}

    with open(os.path.join(etc_config_dir, "great_expectations.conf"), 'w') as configfile:
        disabled_config.write(configfile)

    project_path = str(tmp_path_factory.mktemp("data_context"))
    context_path = os.path.join(project_path, "great_expectations")
    os.makedirs(context_path, exist_ok=True)
    fixture_dir = file_relative_path(__file__, "../../test_fixtures")

    shutil.copy(
        os.path.join(fixture_dir, "great_expectations_basic_with_usage_stats_enabled.yml"),
        str(os.path.join(context_path, "great_expectations.yml")),
    )

    assert DataContext(context_root_dir=context_path)._project_config.anonymous_usage_statistics.enabled is True

    with mock.patch("great_expectations.data_context.BaseDataContext.GLOBAL_CONFIG_PATHS", config_dirs):
        context = DataContext(context_root_dir=context_path)
        project_config = context._project_config
        assert project_config.anonymous_usage_statistics.enabled is False
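For reference, the anonymous_usage_statistics section written above round-trips through configparser as a disabled flag. A small self-contained sketch, independent of the temporary directories used by the test:

import configparser
import io

# Write the same section the test writes, but to an in-memory buffer.
disabled = configparser.ConfigParser()
disabled["anonymous_usage_statistics"] = {"enabled": "False"}
buffer = io.StringIO()
disabled.write(buffer)

# Read it back: the string "False" parses to the boolean False.
check = configparser.ConfigParser()
check.read_string(buffer.getvalue())
assert check["anonymous_usage_statistics"].getboolean("enabled") is False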
Example #5
def profile(datasource_name, data_assets, profile_all_data_assets, directory):
    """Profile datasources from the specified context.


    DATASOURCE_NAME: the datasource to profile, or leave blank to profile all datasources."""

    # None means "no cap on the number of data assets to profile". This variant
    # exposes no explicit limit, so the cap stays off whether or not the
    # profile_all_data_assets flag was passed; defining it here keeps it bound below.
    max_data_assets = None

    try:
        context = DataContext(directory)
    except ConfigNotFoundError:
        cli_message(
            "Error: no great_expectations context configuration found in the specified directory."
        )
        return

    if datasource_name is None:
        datasources = [
            datasource["name"] for datasource in context.list_datasources()
        ]
        for datasource_name in datasources:
            profile_datasource(context,
                               datasource_name,
                               max_data_assets=max_data_assets)
    else:
        profile_datasource(context, datasource_name, data_assets=data_assets)
Example #6
def test_run_profiler_on_data_emits_proper_usage_stats(
    mock_emit: mock.MagicMock,
    mock_profiler_run: mock.MagicMock,
    empty_data_context_stats_enabled: DataContext,
    populated_profiler_store: ProfilerStore,
    profiler_name: str,
):
    with mock.patch(
            "great_expectations.data_context.DataContext.profiler_store"
    ) as mock_profiler_store:
        mock_profiler_store.__get__ = mock.Mock(
            return_value=populated_profiler_store)
        empty_data_context_stats_enabled.run_profiler_on_data(
            name=profiler_name,
            batch_request={
                "datasource_name": "my_datasource",
                "data_connector_name": "my_data_connector",
                "data_asset_name": "my_data_asset",
            },
        )

    assert mock_emit.call_count == 1
    assert mock_emit.call_args_list == [
        mock.call({
            "event_payload": {},
            "event": "data_context.run_profiler_on_data",
            "success": True,
        })
    ]
def test_data_context_create_does_nothing_if_all_uncommitted_dirs_exist(
        tmp_path_factory):
    expected = """\
great_expectations/
    .gitignore
    great_expectations.yml
    datasources/
    expectations/
    notebooks/
        create_expectations.ipynb
        integrate_validation_into_pipeline.ipynb
    plugins/
    uncommitted/
        config_variables.yml
        data_docs/
        samples/
        validations/
"""
    project_path = str(tmp_path_factory.mktemp('stuff'))
    ge_dir = os.path.join(project_path, "great_expectations")

    DataContext.create(project_path)
    fixture = gen_directory_tree_str(ge_dir)
    assert fixture == expected

    # re-run create to simulate onboarding
    DataContext.create(project_path)

    obs = gen_directory_tree_str(ge_dir)
    assert obs == expected
Example #8
def profile(datasource_name, data_assets, profile_all_data_assets, directory, view, batch_kwargs):
    """
    Profile datasources from the specified context.

    If the optional data_assets and profile_all_data_assets arguments are not specified, the profiler will check
    if the number of data assets in the datasource exceeds the internally defined limit. If it does, it will
    prompt the user to either specify the list of data assets to profile or to profile all.
    If the limit is not exceeded, the profiler will profile all data assets in the datasource.

    :param datasource_name: name of the datasource to profile
    :param data_assets: if this comma-separated list of data asset names is provided, only the specified data assets will be profiled
    :param profile_all_data_assets: if provided, all data assets will be profiled
    :param directory: path to the project directory in which to look for the great_expectations context configuration
    :param view: Open the docs in a browser
    :param batch_kwargs: Additional keyword arguments to be provided to get_batch when loading the data asset.
    :return: None
    """

    try:
        context = DataContext(directory)
    except ge_exceptions.ConfigNotFoundError as err:
        cli_message("<red>{}</red>".format(err.message))
        return
    except ge_exceptions.ZeroDotSevenConfigVersionError as err:
        _offer_to_install_new_template(err, context.root_directory)
        return

    if batch_kwargs is not None:
        batch_kwargs = json.loads(batch_kwargs)

    if datasource_name is None:
        datasources = [datasource["name"] for datasource in context.list_datasources()]
        if not datasources:
            cli_message(NO_DATASOURCES_FOUND)
            sys.exit(-1)
        elif len(datasources) > 1:
            cli_message(
                "<red>Error: please specify the datasource to profile. "\
                "Available datasources: " + ", ".join(datasources) + "</red>"
            )
            sys.exit(-1)
        else:
            profile_datasource(
                context,
                datasources[0],
                data_assets=data_assets,
                profile_all_data_assets=profile_all_data_assets,
                open_docs=view,
                additional_batch_kwargs=batch_kwargs
            )
    else:
        profile_datasource(
            context,
            datasource_name,
            data_assets=data_assets,
            profile_all_data_assets=profile_all_data_assets,
            open_docs=view,
            additional_batch_kwargs=batch_kwargs
        )
Example #9
def validate(data_asset,
             expectation_suite=None,
             data_asset_name=None,
             data_context=None,
             data_asset_type=None,
             *args,
             **kwargs):
    """Validate the provided data asset using the provided expectation suite"""
    if expectation_suite is None and data_context is None:
        raise ValueError(
            "Either an expectation suite or a DataContext is required for validation."
        )

    if expectation_suite is None:
        logger.info("Using expectation suite from DataContext.")
        # Allow data_context to be a string, and try loading it from path in that case
        if isinstance(data_context, string_types):
            data_context = DataContext(data_context)
        expectation_suite = data_context.get_expectation_suite(data_asset_name)
    else:
        if "data_asset_name" in expectation_suite:
            logger.info("Using expectation suite with name %s" %
                        expectation_suite["data_asset_name"])
        else:
            logger.info("Using expectation suite with no data_asset_name")

    # If the object is already a Dataset type, then this is purely a convenience method
    # and no conversion is needed
    if isinstance(data_asset, dataset.Dataset) and data_asset_type is None:
        return data_asset.validate(expectation_suite=expectation_suite,
                                   data_context=data_context,
                                   *args,
                                   **kwargs)
    elif data_asset_type is None:
        # Guess the GE data_asset_type based on the type of the data_asset
        if isinstance(data_asset, pd.DataFrame):
            data_asset_type = dataset.PandasDataset
        # Add other data_asset_type conditions here as needed

    # Otherwise, we will convert for the user to a subclass of the
    # existing class to enable new expectations, but only for datasets
    if not isinstance(data_asset, (dataset.Dataset, pd.DataFrame)):
        raise ValueError(
            "The validate util method only supports dataset validations, including custom subclasses. For other data asset types, use the object's own validate method."
        )

    if not issubclass(type(data_asset), data_asset_type):
        if isinstance(data_asset, (pd.DataFrame)) and issubclass(
                data_asset_type, dataset.PandasDataset):
            pass  # This is a special type of allowed coercion
        else:
            raise ValueError(
                "The validate util method only supports validation for subtypes of the provided data_asset_type."
            )

    data_asset_ = _convert_to_dataset_class(data_asset, data_asset_type,
                                            expectation_suite)
    return data_asset_.validate(*args, data_context=data_context, **kwargs)
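A minimal usage sketch for the helper above: validating a plain pandas DataFrame against a hand-written suite. The dict-shaped suite (an "expectations" list of expectation_type/kwargs pairs) is an assumption about this older API, and the column and asset names are made up for illustration:

import pandas as pd

df = pd.DataFrame({"account_id": [1, 2, 3]})

# Hypothetical suite; the exact dict shape accepted here is an assumption.
suite = {
    "data_asset_name": "accounts",
    "expectation_suite_name": "default",
    "expectations": [
        {
            "expectation_type": "expect_column_to_exist",
            "kwargs": {"column": "account_id"},
        }
    ],
}

# The DataFrame is coerced to a PandasDataset internally before validation runs.
results = validate(df, expectation_suite=suite)
print(results)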
def empty_context(tmp_path_factory):
    project_path = str(tmp_path_factory.mktemp('data_context'))
    DataContext.create(project_path)
    ge_dir = os.path.join(project_path, "great_expectations")
    assert os.path.isdir(ge_dir)
    assert os.path.isfile(os.path.join(ge_dir, DataContext.GE_YML))
    context = DataContext(ge_dir)
    assert isinstance(context, DataContext)
    return context
Example #11
def init(target_directory, view):
    """
    Create a new project and help with onboarding.

    This guided input walks the user through setting up a new project and also
    onboards a new developer in an existing project.

    It scaffolds directories, sets up notebooks, creates a project file, and
    appends to a `.gitignore` file.
    """
    target_directory = os.path.abspath(target_directory)
    ge_dir = _get_full_path_to_ge_dir(target_directory)
    ge_yml = os.path.join(ge_dir, DataContext.GE_YML)

    cli_message(GREETING)

    # TODO this should be a property
    if os.path.isfile(ge_yml):
        if DataContext.all_uncommitted_directories_exist(ge_dir) and \
                DataContext.config_variables_yml_exist(ge_dir):
            # Ensure the context can be instantiated
            try:
                _ = DataContext(ge_dir)
                cli_message(PROJECT_IS_COMPLETE)
            except ge_exceptions.DataContextError as e:
                cli_message("<red>{}</red>".format(e))
                exit(5)
        else:
            _complete_onboarding(target_directory)

        try:
            # if expectations exist, offer to build docs
            context = DataContext(ge_dir)
            if context.list_expectation_suite_keys():
                if click.confirm(BUILD_DOCS_PROMPT, default=True):
                    context.build_data_docs()
                    context.open_data_docs()
        except ge_exceptions.DataContextError as e:
            cli_message("<red>{}</red>".format(e))
    else:
        if not click.confirm(LETS_BEGIN_PROMPT, default=True):
            cli_message(RUN_INIT_AGAIN)
            exit(0)

        context, data_source_name, data_source_type = _create_new_project(
            target_directory)
        if not data_source_name:  # no datasource was created
            return

        profile_datasource(context,
                           data_source_name,
                           open_docs=view,
                           additional_batch_kwargs={"limit": 1000})
        cli_message(
            """\n<cyan>Great Expectations is now set up in your project!</cyan>"""
        )
Example #12
def test_list_profilers_raises_configuration_error(
        empty_data_context: DataContext):
    with mock.patch(
            "great_expectations.data_context.DataContext.profiler_store",
    ) as mock_profiler_store:
        mock_profiler_store.__get__ = mock.Mock(return_value=None)
        with pytest.raises(ge_exceptions.StoreConfigurationError) as e:
            empty_data_context.list_profilers()

    assert "not a configured store" in str(e.value)
def test_existing_local_data_docs_urls_returns_url_on_project_with_no_datasources_and_a_site_configured(tmp_path_factory):
    """
    This test ensures that a url will be returned for a default site even if a
    datasource is not configured, and docs are not built.
    """
    empty_directory = str(tmp_path_factory.mktemp("another_empty_project"))
    DataContext.create(empty_directory)
    context = DataContext(os.path.join(empty_directory, DataContext.GE_DIR))

    obs = context.get_docs_sites_urls()
    assert len(obs) == 1
    assert obs[0].endswith("great_expectations/uncommitted/data_docs/local_site/index.html")
Example #14
def list_datasources(directory):
    """List known datasources."""
    try:
        context = DataContext(directory)
        datasources = context.list_datasources()
        # TODO Pretty up this console output
        cli_message(str([d for d in datasources]))
    except ge_exceptions.ConfigNotFoundError as err:
        cli_message("<red>{}</red>".format(err.message))
        return
    except ge_exceptions.ZeroDotSevenConfigVersionError as err:
        _offer_to_install_new_template(err, context.root_directory)
Example #15
def profile(datasource_name, max_data_assets, profile_all_data_assets,
            target_directory):
    """Profile a great expectations object.

    datasource_name: A datasource within this GE context to profile.
    """

    if profile_all_data_assets:
        max_data_assets = None

    # FIXME: By default, this should iterate over all datasources
    context = DataContext(target_directory)
    context.profile_datasource(datasource_name,
                               max_data_assets=max_data_assets)
def test_scaffold_directories_and_notebooks(tmp_path_factory):
    empty_directory = str(
        tmp_path_factory.mktemp("test_scaffold_directories_and_notebooks"))
    DataContext.scaffold_directories(empty_directory)
    DataContext.scaffold_notebooks(empty_directory)

    assert set(os.listdir(empty_directory)) == {
        'plugins', 'expectations', '.gitignore', 'uncommitted', 'notebooks'
    }
    assert set(os.listdir(os.path.join(
        empty_directory, "uncommitted"))) == {'data_docs', 'validations'}
    for subdir in DataContext.NOTEBOOK_SUBDIRECTORIES:
        subdir_path = os.path.join(empty_directory, "notebooks", subdir)
        assert set(os.listdir(subdir_path)) == {"validation_playground.ipynb"}
def test_data_context_create_raises_warning_and_leaves_existing_yml_untouched(
        tmp_path_factory):
    project_path = str(tmp_path_factory.mktemp('data_context'))
    DataContext.create(project_path)
    ge_yml = os.path.join(project_path,
                          "great_expectations/great_expectations.yml")
    with open(ge_yml, "a") as ff:
        ff.write("# LOOK I WAS MODIFIED")

    with pytest.warns(UserWarning):
        DataContext.create(project_path)

    with open(ge_yml, "r") as ff:
        obs = ff.read()
    assert "# LOOK I WAS MODIFIED" in obs
Example #18
def test_add_profiler_with_invalid_config_raises_error(
    empty_data_context: DataContext,
    profiler_config_with_placeholder_args: RuleBasedProfilerConfig,
):
    args = profiler_config_with_placeholder_args.to_json_dict()
    for attr in ("class_name", "module_name"):
        args.pop(attr, None)

    # Setting invalid configuration to check that it is caught by DataContext wrapper method
    args["config_version"] = -1

    with pytest.raises(ValidationError) as e:
        empty_data_context.add_profiler(**args)

    assert "config_version" in str(e.value)
Example #19
def init(target_directory):
    """Initialize a new Great Expectations project.

    This guided input walks the user through setting up a project.

    It scaffolds directories, sets up notebooks, creates a project file, and
    appends to a `.gitignore` file.
    """
    try:
        context = DataContext.create(target_directory)
    except DataContextError as err:
        logger.critical(err.message)
        sys.exit(-1)

    base_dir = context.root_directory

    six.print_(
        colored(figlet_format("Great Expectations", font="big"), color="cyan"))

    cli_message(greeting_1)

    if not click.confirm(msg_prompt_lets_begin, default=True):
        cli_message(
            "OK - run great_expectations init again when ready. Exiting...")
        exit(0)

    scaffold_directories_and_notebooks(base_dir)
    cli_message("\nDone.", )

    add_datasource(context)
def test_data_context_is_project_initialized_returns_false_when_uncommitted_dir_is_missing(
        empty_context):
    ge_dir = empty_context.root_directory
    # mangle project
    shutil.rmtree(os.path.join(ge_dir, empty_context.GE_UNCOMMITTED_DIR))

    assert DataContext.is_project_initialized(ge_dir) is False
def test_data_context_is_project_initialized_returns_false_when_config_yml_is_missing(
        empty_context):
    ge_dir = empty_context.root_directory
    # mangle project
    safe_remove(os.path.join(ge_dir, empty_context.GE_YML))

    assert DataContext.is_project_initialized(ge_dir) is False
def test_data_context_does_project_have_a_datasource_in_config_file_returns_false_when_the_project_has_an_invalid_config_file(
    empty_context, ):
    ge_dir = empty_context.root_directory
    with open(os.path.join(ge_dir, DataContext.GE_YML), "w") as yml:
        yml.write("this file: is not a valid ge config")
    assert DataContext.does_project_have_a_datasource_in_config_file(
        ge_dir) is False
def test_data_context_does_project_have_a_datasource_in_config_file_returns_true_when_it_has_a_datasource_configured_in_yml_file_on_disk(
    empty_context, ):
    ge_dir = empty_context.root_directory
    empty_context.add_datasource("arthur",
                                 **{"class_name": "PandasDatasource"})
    assert DataContext.does_project_have_a_datasource_in_config_file(
        ge_dir) is True
def test_DataContext_raises_error_on_old_config_version():
    local_dir = file_relative_path(
        __file__, os.path.join(BASE_DIR, "old_config_version"))
    with pytest.raises(ge_exceptions.InvalidDataContextConfigError) as exc:
        DataContext(local_dir)

    assert "Error while processing DataContextConfig" in exc.value.message
def test_data_context_does_ge_yml_exist_returns_false_when_it_does_not_exist(
    empty_context,
):
    ge_dir = empty_context.root_directory
    # mangle project
    safe_remove(os.path.join(ge_dir, empty_context.GE_YML))
    assert DataContext.does_config_exist_on_disk(ge_dir) is False
def test_DataContext_raises_error_on_invalid_top_level_type():
    local_dir = file_relative_path(
        __file__, os.path.join(BASE_DIR, "invalid_top_level_value_type"))
    with pytest.raises(ge_exceptions.InvalidDataContextConfigError) as exc:
        DataContext(local_dir)

    assert "data_docs_sites" in exc.value.messages
def test_data_context_create_makes_uncommitted_dirs_when_all_are_missing(
        tmp_path_factory):
    project_path = str(tmp_path_factory.mktemp('data_context'))
    DataContext.create(project_path)

    # mangle the existing setup
    ge_dir = os.path.join(project_path, "great_expectations")
    uncommitted_dir = os.path.join(ge_dir, "uncommitted")
    shutil.rmtree(uncommitted_dir)

    # re-run create to simulate onboarding
    DataContext.create(project_path)
    obs = gen_directory_tree_str(ge_dir)

    assert os.path.isdir(uncommitted_dir), "No uncommitted directory created"
    assert obs == """\
def test_scaffold_directories_and_notebooks(tmp_path_factory):
    empty_directory = str(
        tmp_path_factory.mktemp("test_scaffold_directories_and_notebooks"))
    DataContext.scaffold_directories(empty_directory)
    DataContext.scaffold_notebooks(empty_directory)

    assert set(os.listdir(empty_directory)) == {
        'datasources', 'plugins', 'expectations', '.gitignore', 'uncommitted',
        'notebooks'
    }
    assert set(os.listdir(os.path.join(empty_directory, "uncommitted"))) == {
        'samples', 'data_docs', 'validations'
    }
    assert set(os.listdir(os.path.join(empty_directory, "notebooks"))) == {
        "create_expectations.ipynb", "integrate_validation_into_pipeline.ipynb"
    }
def test_data_context_ge_cloud_mode_makes_successful_request_to_cloud_api(
    mock_request,
    ge_cloud_runtime_base_url,
    ge_cloud_runtime_organization_id,
    ge_cloud_runtime_access_token,
):
    # Ensure that the request goes through
    mock_request.return_value.status_code = 200
    try:
        DataContext(
            ge_cloud_mode=True,
            ge_cloud_base_url=ge_cloud_runtime_base_url,
            ge_cloud_organization_id=ge_cloud_runtime_organization_id,
            ge_cloud_access_token=ge_cloud_runtime_access_token,
        )
    except Exception:  # Not concerned with constructor output (only evaluating interaction with requests during __init__)
        pass

    called_with_url = f"{ge_cloud_runtime_base_url}/organizations/{ge_cloud_runtime_organization_id}/data-context-configuration"
    called_with_header = {
        "headers": {
            "Content-Type": "application/vnd.api+json",
            "Authorization": f"Bearer {ge_cloud_runtime_access_token}",
        }
    }

    # Only ever called once with the endpoint URL and auth token as args
    mock_request.assert_called_once()
    assert mock_request.call_args[0][0] == called_with_url
    assert mock_request.call_args[1] == called_with_header
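For orientation, the request asserted above can be reproduced by hand. A minimal sketch using the same endpoint path and headers; the base URL, organization id, and access token are placeholders, and the GET verb is an assumption (the test only inspects the URL and headers handed to the mocked call):

import requests

# Placeholder credentials; real values come from a GE Cloud account.
base_url = "https://example.invalid"
organization_id = "11111111-1111-1111-1111-111111111111"
access_token = "example-token"

response = requests.get(
    f"{base_url}/organizations/{organization_id}/data-context-configuration",
    headers={
        "Content-Type": "application/vnd.api+json",
        "Authorization": f"Bearer {access_token}",
    },
)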
def test_datasource_initialization_error_thrown_in_cloud_mode(
    ge_cloud_data_context_config: DataContextConfig,
    ge_cloud_runtime_base_url,
    ge_cloud_runtime_organization_id,
    ge_cloud_runtime_access_token,
):
    # normally the DataContext swallows exceptions when there is an error raised from get_datasource
    # (which is used during initialization). In cloud mode, we want a DatasourceInitializationError to
    # propagate.

    # normally in cloud mode configuration is retrieved from an endpoint; we're providing it here in-line
    with mock.patch(
        "great_expectations.data_context.DataContext._retrieve_data_context_config_from_ge_cloud",
        return_value=ge_cloud_data_context_config,
    ):
        # DataContext._init_datasources calls get_datasource, which may generate a DatasourceInitializationError
        # that normally gets swallowed.
        with mock.patch(
            "great_expectations.data_context.DataContext.get_datasource"
        ) as get_datasource:
            get_datasource.side_effect = DatasourceInitializationError(
                "mock_datasource", "mock_message"
            )
            with pytest.raises(DatasourceInitializationError):
                DataContext(
                    ge_cloud_mode=True,
                    ge_cloud_base_url=ge_cloud_runtime_base_url,
                    ge_cloud_organization_id=ge_cloud_runtime_organization_id,
                    ge_cloud_access_token=ge_cloud_runtime_access_token,
                )