Code example #1
"""
This script saves validation results to your results store and then updates Data Docs.

This makes viewing validation results easy for you and your team.

Usage:
- Run this file: `python {0}`.
- This can be run manually or via a scheduler such as cron.
- If your pipeline runner supports python snippets you can paste this into your
pipeline.
"""
import sys

from great_expectations import DataContext

# tap configuration
context = DataContext("{1}")
suite = context.get_expectation_suite("{2}")
# You can modify your BatchKwargs to select different data
batch_kwargs = {3}

# tap validation process
batch = context.get_batch(batch_kwargs, suite)
results = context.run_validation_operator("action_list_operator", [batch])

if not results["success"]:
    print("Validation Failed!")
    sys.exit(1)

print("Validation Succeeded!")
sys.exit(0)
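
Because the rendered tap exits with a non-zero status when validation fails, any scheduler or pipeline runner can branch on its return code. Below is a minimal wrapper sketch; the file name tap_titanic.py is illustrative and not part of the generated script.

import subprocess
import sys

# Run the rendered tap as a child process and inspect its exit status.
completed = subprocess.run([sys.executable, "tap_titanic.py"])
if completed.returncode != 0:
    # The tap exits 1 when validation fails, so the pipeline can stop here.
    raise RuntimeError("Great Expectations validation failed; halting the pipeline.")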
Code example #2
def test_cli_init_on_new_project_extra_whitespace_in_url(
    mock_webbrowser, caplog, tmp_path_factory, titanic_sqlite_db_file
):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    ge_dir = os.path.join(project_dir, "great_expectations")

    database_path = os.path.join(project_dir, "titanic.db")
    shutil.copy(titanic_sqlite_db_file, database_path)
    engine = create_engine("sqlite:///{}".format(database_path))
    engine_url_with_added_whitespace = "    " + str(engine.url) + "  "

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="\n\n2\n6\ntitanic\n{}\n\n\n1\nwarning\n\n\n\n".format(
            engine_url_with_added_whitespace
        ),
        catch_exceptions=False,
    )
    stdout = result.output
    assert len(stdout) < 6000, "CLI output is unreasonably long."

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Which database backend are you using" in stdout
    assert "Give your new Datasource a short name" in stdout
    assert "What is the url/connection string for the sqlalchemy connection" in stdout
    assert "Attempting to connect to your database." in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "Which table would you like to use?" in stdout
    assert "Name the new Expectation Suite [main.titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "Great Expectations is now set up" in stdout

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [
        {
            "class_name": "SqlAlchemyDatasource",
            "name": "titanic",
            "module_name": "great_expectations.datasource",
            "credentials": {"url": str(engine.url)},
            "data_asset_type": {
                "class_name": "SqlAlchemyDataset",
                "module_name": "great_expectations.dataset",
            },
        }
    ]

    first_suite = context.list_expectation_suites()[0]
    suite = context.get_expectation_suite(first_suite.expectation_suite_name)
    assert len(suite.expectations) == 14

    assert os.path.isdir(ge_dir)
    config_path = os.path.join(project_dir, "great_expectations/great_expectations.yml")
    assert os.path.isfile(config_path)

    config = yaml.load(open(config_path, "r"))
    data_source_class = config["datasources"]["titanic"]["data_asset_type"][
        "class_name"
    ]
    assert data_source_class == "SqlAlchemyDataset"

    assert_no_logging_messages_or_tracebacks(caplog, result)

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )
Code example #3
def test_cli_init_on_new_project(
    mock_webbrowser, caplog, tmp_path_factory, titanic_sqlite_db_file
):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    ge_dir = os.path.join(project_dir, "great_expectations")

    database_path = os.path.join(project_dir, "titanic.db")
    shutil.copy(titanic_sqlite_db_file, database_path)
    engine = create_engine("sqlite:///{}".format(database_path))

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="\n\n2\n6\ntitanic\n{}\n\n\n1\nwarning\n\n\n\n".format(engine.url),
        catch_exceptions=False,
    )
    stdout = result.output
    assert len(stdout) < 6000, "CLI output is unreasonably long."

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Which database backend are you using" in stdout
    assert "Give your new Datasource a short name" in stdout
    assert "What is the url/connection string for the sqlalchemy connection" in stdout
    assert "Attempting to connect to your database." in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "Which table would you like to use?" in stdout
    assert "Name the new Expectation Suite [main.titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "Great Expectations is now set up" in stdout

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources()[0]["class_name"] == "SqlAlchemyDatasource"
    assert context.list_datasources()[0]["name"] == "titanic"

    first_suite = context.list_expectation_suites()[0]
    suite = context.get_expectation_suite(first_suite.expectation_suite_name)
    assert len(suite.expectations) == 14

    assert os.path.isdir(ge_dir)
    config_path = os.path.join(project_dir, "great_expectations/great_expectations.yml")
    assert os.path.isfile(config_path)

    config = yaml.load(open(config_path, "r"))
    data_source_class = config["datasources"]["titanic"]["data_asset_type"][
        "class_name"
    ]
    assert data_source_class == "SqlAlchemyDataset"

    obs_tree = gen_directory_tree_str(ge_dir)

    # Instead of monkey patching guids, just regex out the guids
    guid_safe_obs_tree = re.sub(
        r"[a-z0-9]{32}(?=\.(json|html))", "foobarbazguid", obs_tree
    )
    # print(guid_safe_obs_tree)
    assert (
        guid_safe_obs_tree
        == """\
great_expectations/
    .gitignore
    great_expectations.yml
    checkpoints/
    expectations/
        warning.json
    notebooks/
        pandas/
            validation_playground.ipynb
        spark/
            validation_playground.ipynb
        sql/
            validation_playground.ipynb
    plugins/
        custom_data_docs/
            renderers/
            styles/
                data_docs_custom_styles.css
            views/
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                index.html
                expectations/
                    warning.html
                static/
                    fonts/
                        HKGrotesk/
                            HKGrotesk-Bold.otf
                            HKGrotesk-BoldItalic.otf
                            HKGrotesk-Italic.otf
                            HKGrotesk-Light.otf
                            HKGrotesk-LightItalic.otf
                            HKGrotesk-Medium.otf
                            HKGrotesk-MediumItalic.otf
                            HKGrotesk-Regular.otf
                            HKGrotesk-SemiBold.otf
                            HKGrotesk-SemiBoldItalic.otf
                    images/
                        favicon.ico
                        glossary_scroller.gif
                        iterative-dev-loop.png
                        logo-long-vector.svg
                        logo-long.png
                        short-logo-vector.svg
                        short-logo.png
                        validation_failed_unexpected_values.gif
                    styles/
                        data_docs_custom_styles_template.css
                        data_docs_default_styles.css
                validations/
                    warning/
                        20190926T134241.000000Z/
                            20190926T134241.000000Z/
                                foobarbazguid.html
        validations/
            warning/
                20190926T134241.000000Z/
                    20190926T134241.000000Z/
                        foobarbazguid.json
"""
    )

    assert_no_logging_messages_or_tracebacks(caplog, result)

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )
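
Once init has finished, the generated suite can also be re-validated programmatically with the same v2-style API shown in code example #1. A short sketch, reusing the context and suite objects from the test above; the batch_kwargs are illustrative and assume the sqlite table is named titanic.

# Select the batch to validate from the datasource that init configured.
batch_kwargs = {"datasource": "titanic", "table": "titanic"}
batch = context.get_batch(batch_kwargs, suite.expectation_suite_name)
results = context.run_validation_operator("action_list_operator", [batch])
print(results["success"])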
Code example #4
def test_checkpoint_new_happy_path_generates_checkpoint_yml_with_comments(
        mock_emit, caplog, titanic_data_context_stats_enabled,
        titanic_expectation_suite):
    context = titanic_data_context_stats_enabled
    root_dir = context.root_directory
    assert context.list_checkpoints() == []
    context.save_expectation_suite(titanic_expectation_suite)
    assert context.list_expectation_suite_names() == ["Titanic.warning"]
    mock_emit.reset_mock()

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        f"checkpoint new passengers Titanic.warning -d {root_dir}",
        input="1\n1\n",
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert result.exit_code == 0
    assert "A checkpoint named `passengers` was added to your project" in stdout

    assert mock_emit.call_count == 2
    assert mock_emit.call_args_list == [
        mock.call({
            "event_payload": {},
            "event": "data_context.__init__",
            "success": True
        }),
        mock.call({
            "event": "cli.checkpoint.new",
            "event_payload": {},
            "success": True
        }),
    ]
    expected_checkpoint = os.path.join(root_dir, context.CHECKPOINTS_DIR,
                                       "passengers.yml")
    assert os.path.isfile(expected_checkpoint)

    # Instantiate a fresh context for additional assertions
    context = DataContext(root_dir)
    assert context.list_checkpoints() == ["passengers"]

    with open(expected_checkpoint, "r") as f:
        obs_file = f.read()

    # This is snapshot-ish to prove that comments remain in place
    assert ("""\
# This checkpoint was created by the command `great_expectations checkpoint new`.
#
# A checkpoint is a list of one or more batches paired with one or more
# Expectation Suites and a configurable Validation Operator.
#
# It can be run with the `great_expectations checkpoint run` command.
# You can edit this file to add batches of data and expectation suites.
#
# For more details please see
# https://docs.greatexpectations.io/en/latest/how_to_guides/validation/how_to_add_validations_data_or_suites_to_a_checkpoint.html
validation_operator_name: action_list_operator
# Batches are a list of batch_kwargs paired with a list of one or more suite
# names. A checkpoint can have one or more batches. This makes deploying
# Great Expectations in your pipelines easy!
batches:
  - batch_kwargs:""" in obs_file)

    assert "/data/Titanic.csv" in obs_file

    assert ("""datasource: mydatasource
      data_asset_name: Titanic
    expectation_suite_names: # one or more suites may validate against a single batch
      - Titanic.warning
""" in obs_file)

    assert_no_logging_messages_or_tracebacks(caplog, result)
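
The comments in the generated checkpoint point at the `great_expectations checkpoint run` command. A hedged sketch of exercising it with the same CliRunner pattern used above; the exact stdout wording varies between releases, so only the exit code is checked.

runner = CliRunner(mix_stderr=False)
run_result = runner.invoke(
    cli,
    f"checkpoint run passengers -d {root_dir}",
    catch_exceptions=False,
)
# Exit code 1 means the checkpoint ran but the validation itself failed.
assert run_result.exit_code in (0, 1)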
Code example #5
File: init.py  Project: thejasraju/great_expectations
def init(ctx, view, usage_stats):
    """
    Initialize a new Great Expectations project.

    This guided input walks the user through setting up a new project and also
    onboards a new developer in an existing project.

    It scaffolds directories, sets up notebooks, creates a project file, and
    appends to a `.gitignore` file.
    """
    display_not_implemented_message_and_exit()
    directory = toolkit.parse_cli_config_file_location(
        config_file_location=ctx.obj.config_file_location).get("directory")
    target_directory = os.path.abspath(directory)
    ge_dir = _get_full_path_to_ge_dir(target_directory)
    cli_message(GREETING)

    if DataContext.does_config_exist_on_disk(ge_dir):
        try:
            if DataContext.is_project_initialized(ge_dir):
                # Ensure the context can be instantiated
                cli_message(PROJECT_IS_COMPLETE)
        except (DataContextError, DatasourceInitializationError) as e:
            cli_message("<red>{}</red>".format(e.message))
            sys.exit(1)

        try:
            context = DataContext.create(target_directory,
                                         usage_statistics_enabled=usage_stats)
            cli_message(ONBOARDING_COMPLETE)
            # TODO if this is correct, ensure this is covered by a test
            # cli_message(SETUP_SUCCESS)
            # exit(0)
        except DataContextError as e:
            cli_message("<red>{}</red>".format(e.message))
            # TODO ensure this is covered by a test
            exit(5)
    else:
        if not click.confirm(LETS_BEGIN_PROMPT, default=True):
            cli_message(RUN_INIT_AGAIN)
            # TODO ensure this is covered by a test
            exit(0)

        try:
            context = DataContext.create(target_directory,
                                         usage_statistics_enabled=usage_stats)
            toolkit.send_usage_message(data_context=context,
                                       event="cli.init.create",
                                       success=True)
        except DataContextError as e:
            # TODO ensure this is covered by a test
            cli_message("<red>{}</red>".format(e))

    try:
        # if expectations exist, offer to build docs
        context = DataContext(ge_dir)
        if context.list_expectation_suites():
            if click.confirm(BUILD_DOCS_PROMPT, default=True):
                build_docs(context, view=view)

        else:
            datasources = context.list_datasources()
            if len(datasources) == 0:
                cli_message(SECTION_SEPARATOR)
                if not click.confirm(
                        "Would you like to configure a Datasource?",
                        default=True):
                    cli_message("Okay, bye!")
                    sys.exit(1)
                datasource_name, data_source_type = add_datasource_impl(
                    context, choose_one_data_asset=False)
                if not datasource_name:  # no datasource was created
                    sys.exit(1)

            datasources = context.list_datasources()
            if len(datasources) == 1:
                datasource_name = datasources[0]["name"]

                cli_message(SECTION_SEPARATOR)
                if not click.confirm(
                        "Would you like to profile new Expectations for a single data asset within your new Datasource?",
                        default=True,
                ):
                    cli_message(
                        "Okay, exiting now. To learn more about Profilers, run great_expectations profile --help or visit docs.greatexpectations.io!"
                    )
                    sys.exit(1)

                (
                    success,
                    suite_name,
                    profiling_results,
                ) = toolkit.create_expectation_suite(
                    context,
                    datasource_name=datasource_name,
                    additional_batch_kwargs={"limit": 1000},
                    flag_build_docs=False,
                    open_docs=False,
                )

                cli_message(SECTION_SEPARATOR)
                if not click.confirm("Would you like to build Data Docs?",
                                     default=True):
                    cli_message(
                        "Okay, exiting now. To learn more about Data Docs, run great_expectations docs --help or visit docs.greatexpectations.io!"
                    )
                    sys.exit(1)

                build_docs(context, view=False)

                if not click.confirm(
                        "\nWould you like to view your new Expectations in Data Docs? This will open a new browser window.",
                        default=True,
                ):
                    cli_message(
                        "Okay, exiting now. You can view the site that has been created in a browser, or visit docs.greatexpectations.io for more information!"
                    )
                    sys.exit(1)
                toolkit.attempt_to_open_validation_results_in_data_docs(
                    context, profiling_results)

                cli_message(SECTION_SEPARATOR)
                cli_message(SETUP_SUCCESS)
                sys.exit(0)
    except (
            DataContextError,
            ge_exceptions.ProfilerError,
            OSError,
            SQLAlchemyError,
    ) as e:
        cli_message("<red>{}</red>".format(e))
        sys.exit(1)
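
For the scaffolding step without the interactive flow, the same DataContext.create call used inside init above can be invoked directly. A minimal sketch; the target path is illustrative.

from great_expectations import DataContext

# Scaffolds great_expectations/ (config file, notebooks, plugins, uncommitted/)
# under the target directory, mirroring what `great_expectations init` creates.
context = DataContext.create("/tmp/my_project", usage_statistics_enabled=False)
print(context.root_directory)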
Code example #6
def test_cli_datasource_profile_with_additional_batch_kwargs(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )

    not_so_empty_data_context = empty_data_context

    project_root_dir = not_so_empty_data_context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "-d",
            project_root_dir,
            "--additional-batch-kwargs",
            '{"reader_options": {"sep": ",", "parse_dates": [0]}}',
            "--no-view",
        ],
        input="Y\n",
        catch_exceptions=False,
    )
    stdout = result.output
    assert result.exit_code == 0

    assert (
        "Profiling 'my_datasource' will create expectations and documentation."
        in stdout
    )
    assert "Would you like to profile 'my_datasource'" in stdout
    assert (
        "Great Expectations is building Data Docs from the data you just profiled!"
        in stdout
    )

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    expected_suite_name = "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    assert suites[0].expectation_suite_name == expected_suite_name

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert validation.meta["expectation_suite_name"] == expected_suite_name
    assert validation.success is False
    assert len(validation.results) == 9

    batch_id = validation_keys[0].batch_identifier
    evr = context.get_validation_result(
        expectation_suite_name=expected_suite_name, batch_identifier=batch_id
    )
    reader_options = evr.meta["batch_kwargs"]["reader_options"]
    assert reader_options["parse_dates"] == [0]
    assert reader_options["sep"] == ","

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)
Code example #7
def test_init_on_existing_project_with_no_datasources_should_continue_init_flow_and_add_one(
    mock_webbrowser,
    capsys,
    caplog,
    initialized_project,
):
    project_dir = initialized_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    # mangle the project to remove all traces of a suite and validations
    _remove_all_datasources(ge_dir)
    os.remove(os.path.join(ge_dir, "expectations", "Titanic", "warning.json"))
    uncommitted_dir = os.path.join(ge_dir, "uncommitted")
    validations_dir = os.path.join(ge_dir, uncommitted_dir, "validations")
    shutil.rmtree(validations_dir)
    os.mkdir(validations_dir)
    shutil.rmtree(os.path.join(uncommitted_dir, "data_docs", "local_site"))
    context = DataContext(ge_dir)
    assert not context.list_expectation_suites()

    data_folder_path = os.path.join(project_dir, "data")
    csv_path = os.path.join(project_dir, "data", "Titanic.csv")
    runner = CliRunner(mix_stderr=False)
    with pytest.warns(
            UserWarning,
            match="Warning. An existing `great_expectations.yml` was found"):
        result = runner.invoke(
            cli,
            ["init", "-d", project_dir],
            input="\n1\n1\n{}\n\n\n\n2\n{}\nmy_suite\n\n\n\n\n".format(
                data_folder_path, csv_path),
            catch_exceptions=False,
        )
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/my_suite/"
        .format(project_dir) in mock_webbrowser.call_args[0][0])
    stdout = result.stdout

    assert result.exit_code == 0

    assert "Error: invalid input" not in stdout
    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert (
        "Enter the path of a data file (relative or absolute, s3a:// and gs:// paths are ok too)"
        in stdout)
    assert "Name the new Expectation Suite [Titanic.warning]:" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        in stdout)
    assert "Great Expectations is now set up." in stdout

    config = _load_config_file(os.path.join(ge_dir, DataContext.GE_YML))
    assert "data__dir" in config["datasources"].keys()

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources()[0]["name"] == "data__dir"
    assert context.list_datasources()[0]["class_name"] == "PandasDatasource"
    assert context.list_expectation_suites(
    )[0].expectation_suite_name == "my_suite"
    assert len(context.list_expectation_suites()) == 1

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #8
def test_suite_demo_answer_suite_name_prompts_with_name_of_existing_suite(
        mock_webbrowser, mock_subprocess, caplog, data_context,
        filesystem_csv_2):
    """
    We call the "suite demo" command without the suite name argument

    The command should:

    - prompt us to enter the name of the expectation suite that will be
    created. We answer the prompt with the name of an existing expectation suite.
    - display an error message and let us retry until we answer
    with a name that is not "taken".
    - create an example suite
    - NOT open jupyter
    - open DataDocs to the new example suite page
    """
    not_so_empty_data_context = data_context
    root_dir = not_so_empty_data_context.root_directory
    os.mkdir(os.path.join(root_dir, "uncommitted"))

    runner = CliRunner(mix_stderr=False)
    csv_path = os.path.join(filesystem_csv_2, "f1.csv")

    existing_suite_name = "my_dag_node.default"
    context = DataContext(root_dir)
    assert context.list_expectation_suite_names() == [existing_suite_name]

    result = runner.invoke(
        cli,
        ["suite", "demo", "-d", root_dir],
        input=f"{csv_path}\n{existing_suite_name}\nmy_new_suite\n\n",
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert result.exit_code == 0
    assert (
        f"An expectation suite named `{existing_suite_name}` already exists."
        in stdout)
    assert (
        f"If you intend to edit the suite please use `great_expectations suite edit {existing_suite_name}`"
        in stdout)
    assert "Enter the path" in stdout
    assert "Name the new expectation suite [f1.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        in stdout)
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "The following Data Docs sites were built" in stdout
    assert "A new Expectation suite 'my_new_suite' was added to your project" in stdout
    assert "open a notebook for you now" not in stdout

    expected_suite_path = os.path.join(root_dir, "expectations",
                                       "my_new_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_subprocess.call_count == 0
    assert mock_webbrowser.call_count == 1

    foo = os.path.join(
        root_dir, "uncommitted/data_docs/local_site/validations/my_new_suite/")
    assert f"file://{foo}" in mock_webbrowser.call_args[0][0]

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #9
def test_suite_new_empty_with_no_jupyter(mock_webbrowser, mock_subprocess,
                                         caplog, data_context,
                                         filesystem_csv_2):
    """
    Running "suite new --no-jupyter" should:
    - make an empty suite
    - NOT open jupyter
    - NOT open data docs
    """
    os.mkdir(os.path.join(data_context.root_directory, "uncommitted"))
    root_dir = data_context.root_directory
    runner = CliRunner(mix_stderr=False)
    csv = os.path.join(filesystem_csv_2, "f1.csv")
    # TODO this test must be updated to remove the --empty flag in the next major release
    result = runner.invoke(
        cli,
        [
            "suite", "new", "-d", root_dir, "--empty", "--suite", "foo",
            "--no-jupyter"
        ],
        input=f"{csv}\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Enter the path" in stdout
    assert "Name the new expectation suite" not in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        not in stdout)
    assert "Generating example Expectation Suite..." not in stdout
    assert "The following Data Docs sites were built" not in stdout
    assert "A new Expectation suite 'foo' was added to your project" in stdout
    assert "open a notebook for you now" not in stdout

    expected_suite_path = os.path.join(root_dir, "expectations", "foo.json")
    assert os.path.isfile(expected_suite_path)

    expected_notebook = os.path.join(root_dir, "uncommitted", "edit_foo.ipynb")
    assert os.path.isfile(expected_notebook)

    context = DataContext(root_dir)
    assert "foo" in context.list_expectation_suite_names()
    suite = context.get_expectation_suite("foo")
    assert suite.expectations == []
    citations = suite.get_citations()
    citations[0].pop("citation_date")
    assert citations[0] == {
        "batch_kwargs": {
            "datasource": "mydatasource",
            "path": csv,
            "reader_method": "read_csv",
        },
        "batch_markers": None,
        "batch_parameters": None,
        "comment": "New suite added via CLI",
    }

    assert mock_subprocess.call_count == 0
    assert mock_webbrowser.call_count == 0

    assert_no_logging_messages_or_tracebacks(caplog, result)
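
With --no-jupyter the edit notebook is written to uncommitted/ but never opened, which is what mock_subprocess verifies above. To open it later by hand, a sketch of the call the CLI would otherwise make, assuming jupyter is on the PATH and reusing expected_notebook from the test:

import subprocess

# Launch the generated edit notebook in Jupyter.
subprocess.call(["jupyter", "notebook", expected_notebook])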
Code example #10
def test_notebook_execution_with_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """
    To set this test up we:

    - create a suite using profiling
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private cli method

    We then:
    - execute that notebook (note that this will raise various errors, such as
    CellExecutionError, if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern":
                            "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name":
                            "Asset",
                            "base_directory":
                            f"{root_dir}/../data/titanic",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name":
                            "Asset",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "batch_identifiers":
                    ["pipeline_stage_name", "airflow_run_id"],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]

    assert context.get_validation_result(
        expectation_suite_name="warning") == {}

    # Create notebook
    # do not want to actually send usage_message, since the function call is not the result of actual usage
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=None,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.
        UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir,
                                           "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced=
        "context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning"))
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }

    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    suite["meta"].pop("citations", None)
    assert suite.expectations == [
        ExpectationConfiguration(
            **{
                "expectation_type":
                "expect_table_columns_to_match_ordered_list",
                "kwargs": {
                    "column_list": [
                        "Unnamed: 0",
                        "Name",
                        "PClass",
                        "Age",
                        "Sex",
                        "Survived",
                        "SexCode",
                    ]
                },
                "meta": {},
            }),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_table_row_count_to_be_between",
                "kwargs": {
                    "max_value": 1313,
                    "min_value": 1313
                },
                "meta": {},
            }),
    ]

    columns_with_expectations: Set[str]
    expectations_from_suite: Set[str]
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite=suite)

    expected_expectations: Set[str] = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations
Code example #11
def test_cli_init_on_existing_project_with_no_uncommitted_dirs_answering_yes_to_fixing_them(
    mock_webbrowser,
    caplog,
    monkeypatch,
    tmp_path_factory,
):
    """
    This test walks through the onboarding experience.

    The user just checked an existing project out of source control and does
    not yet have an uncommitted directory.
    """
    root_dir = tmp_path_factory.mktemp("hiya")
    root_dir = str(root_dir)
    os.makedirs(os.path.join(root_dir, "data"))
    data_folder_path = os.path.join(root_dir, "data")
    data_path = os.path.join(root_dir, "data", "Titanic.csv")
    fixture_path = file_relative_path(
        __file__, os.path.join("..", "test_sets", "Titanic.csv"))
    shutil.copy(fixture_path, data_path)

    # Create a new project from scratch that we will use for the test in the next step

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(root_dir)
    result = runner.invoke(
        cli,
        ["--v3-api", "init"],
        input=f"\n\n1\n1\n{data_folder_path}\n\n\n\n2\n{data_path}\n\n\n\n",
        catch_exceptions=False,
    )
    stdout = result.output
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/Titanic/warning/"
        .format(root_dir) in mock_webbrowser.call_args[0][0])

    assert "Great Expectations is now set up." in stdout

    context = DataContext(os.path.join(root_dir, DataContext.GE_DIR))
    uncommitted_dir = os.path.join(context.root_directory, "uncommitted")
    shutil.rmtree(uncommitted_dir)
    assert not os.path.isdir(uncommitted_dir)

    # Test the second invocation of init
    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    with pytest.warns(
            UserWarning,
            match="Warning. An existing `great_expectations.yml` was found"):
        result = runner.invoke(
            cli,
            ["--v3-api", "init"],
            input="Y\nn\n",
            catch_exceptions=False,
        )
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Great Expectations added some missing files required to run." in stdout
    assert "You may see new files in" in stdout

    assert "OK. You must run" not in stdout
    assert "great_expectations init" not in stdout
    assert "to fix the missing files!" not in stdout
    assert "Would you like to build & view this project's Data Docs!?" in stdout

    assert os.path.isdir(uncommitted_dir)
    config_var_path = os.path.join(uncommitted_dir, "config_variables.yml")
    assert os.path.isfile(config_var_path)
    with open(config_var_path) as f:
        assert f.read() == CONFIG_VARIABLES_TEMPLATE

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #12
def upgrade_project(context_root_dir,
                    ge_config_version,
                    from_cli_upgrade_command=False):
    continuation_message = (
        "\nOk, exiting now. To upgrade at a later time, use the following command: "
        "<cyan>great_expectations project upgrade</cyan>\n\nTo learn more about the upgrade "
        "process, visit "
        "<cyan>https://docs.greatexpectations.io/en/latest/how_to_guides/migrating_versions.html"
        "</cyan>.\n")
    if from_cli_upgrade_command:
        message = (
            f"<red>\nYour project appears to have an out-of-date config version ({ge_config_version}) - "
            f"the version "
            f"number must be at least {MINIMUM_SUPPORTED_CONFIG_VERSION}.</red>"
        )
    else:
        message = (
            f"<red>\nYour project appears to have an out-of-date config version ({ge_config_version}) - "
            f"the version "
            f"number must be at least {MINIMUM_SUPPORTED_CONFIG_VERSION}.\nIn order to proceed, "
            f"your project must be upgraded.</red>")

    cli_message(message)
    upgrade_prompt = (
        "\nWould you like to run the Upgrade Helper to bring your project up-to-date?"
    )
    confirm_proceed_or_exit(confirm_prompt=upgrade_prompt,
                            continuation_message=continuation_message)
    cli_message(SECTION_SEPARATOR)

    # use loop in case multiple upgrades need to take place
    while ge_config_version < MINIMUM_SUPPORTED_CONFIG_VERSION:
        upgrade_helper_class = GE_UPGRADE_HELPER_VERSION_MAP.get(
            int(ge_config_version))
        if not upgrade_helper_class:
            break
        target_ge_config_version = int(ge_config_version) + 1
        # set version temporarily to MINIMUM_SUPPORTED_CONFIG_VERSION to get functional DataContext
        DataContext.set_ge_config_version(
            config_version=MINIMUM_SUPPORTED_CONFIG_VERSION,
            context_root_dir=context_root_dir,
        )
        upgrade_helper = upgrade_helper_class(
            context_root_dir=context_root_dir)
        upgrade_overview, confirmation_required = upgrade_helper.get_upgrade_overview(
        )

        if confirmation_required:
            upgrade_confirmed = confirm_proceed_or_exit(
                confirm_prompt=upgrade_overview,
                continuation_message=continuation_message,
                exit_on_no=False,
            )
        else:
            upgrade_confirmed = True

        if upgrade_confirmed:
            cli_message("\nUpgrading project...")
            cli_message(SECTION_SEPARATOR)
            # run upgrade and get report of what was done, if version number should be incremented
            upgrade_report, increment_version = upgrade_helper.upgrade_project(
            )
            # display report to user
            cli_message(upgrade_report)
            # set config version to target version
            if increment_version:
                DataContext.set_ge_config_version(
                    target_ge_config_version,
                    context_root_dir,
                    validate_config_version=False,
                )
                ge_config_version += 1
            else:
                # restore version number to current number
                DataContext.set_ge_config_version(
                    ge_config_version,
                    context_root_dir,
                    validate_config_version=False)
                break
        else:
            # restore version number to current number
            DataContext.set_ge_config_version(ge_config_version,
                                              context_root_dir,
                                              validate_config_version=False)
            cli_message(continuation_message)
            sys.exit(0)

    cli_message(SECTION_SEPARATOR)
    upgrade_success_message = "<green>Upgrade complete. Exiting...</green>\n"
    upgrade_incomplete_message = f"""\
<red>The Upgrade Helper was unable to perform a complete project upgrade. Next steps:</red>

    - Please perform any manual steps outlined in the Upgrade Overview and/or Upgrade Report above
    - When complete, increment the config_version key in your <cyan>great_expectations.yml</cyan> to <cyan>{
    ge_config_version + 1}</cyan>\n
To learn more about the upgrade process, visit \
<cyan>https://docs.greatexpectations.io/en/latest/how_to_guides/migrating_versions.html</cyan>
"""

    if ge_config_version < MINIMUM_SUPPORTED_CONFIG_VERSION:
        cli_message(upgrade_incomplete_message)
    else:
        cli_message(upgrade_success_message)
    sys.exit(0)
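
The continuation message above directs users to `great_expectations project upgrade`, which drives upgrade_project from the CLI. A sketch of invoking it with the CliRunner pattern used in the test examples, assuming the same -d directory flag as the other commands here; answering "n" declines the Upgrade Helper and exits cleanly.

runner = CliRunner(mix_stderr=False)
result = runner.invoke(
    cli,
    ["project", "upgrade", "-d", "/path/to/project"],
    input="n\n",
    catch_exceptions=False,
)
print(result.output)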
Code example #13
def titanic_spark_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled(
    tmp_path_factory,
    monkeypatch,
    spark_session,
):
    # Re-enable GE_USAGE_STATS
    monkeypatch.delenv("GE_USAGE_STATS")

    project_path: str = str(tmp_path_factory.mktemp("titanic_data_context"))
    context_path: str = os.path.join(project_path, "great_expectations")
    os.makedirs(os.path.join(context_path, "expectations"), exist_ok=True)
    data_path: str = os.path.join(context_path, "..", "data", "titanic")
    os.makedirs(os.path.join(data_path), exist_ok=True)
    shutil.copy(
        file_relative_path(
            __file__,
            os.path.join(
                "..",
                "test_fixtures",
                "great_expectations_v013_no_datasource_stats_enabled.yml",
            ),
        ),
        str(os.path.join(context_path, "great_expectations.yml")),
    )
    shutil.copy(
        file_relative_path(__file__,
                           os.path.join("..", "test_sets", "Titanic.csv")),
        str(
            os.path.join(context_path, "..", "data", "titanic",
                         "Titanic_19120414_1313.csv")),
    )
    shutil.copy(
        file_relative_path(__file__,
                           os.path.join("..", "test_sets", "Titanic.csv")),
        str(
            os.path.join(context_path, "..", "data", "titanic",
                         "Titanic_19120414_1313")),
    )
    shutil.copy(
        file_relative_path(__file__,
                           os.path.join("..", "test_sets", "Titanic.csv")),
        str(
            os.path.join(context_path, "..", "data", "titanic",
                         "Titanic_1911.csv")),
    )
    shutil.copy(
        file_relative_path(__file__,
                           os.path.join("..", "test_sets", "Titanic.csv")),
        str(
            os.path.join(context_path, "..", "data", "titanic",
                         "Titanic_1912.csv")),
    )

    context: DataContext = DataContext(context_root_dir=context_path)
    assert context.root_directory == context_path

    datasource_config: str = f"""
        class_name: Datasource

        execution_engine:
            class_name: SparkDFExecutionEngine

        data_connectors:
            my_basic_data_connector:
                class_name: InferredAssetFilesystemDataConnector
                base_directory: {data_path}
                default_regex:
                    pattern: (.*)\\.csv
                    group_names:
                        - data_asset_name

            my_special_data_connector:
                class_name: ConfiguredAssetFilesystemDataConnector
                base_directory: {data_path}
                glob_directive: "*.csv"

                default_regex:
                    pattern: (.+)\\.csv
                    group_names:
                        - name
                assets:
                    users:
                        base_directory: {data_path}
                        pattern: (.+)_(\\d+)_(\\d+)\\.csv
                        group_names:
                            - name
                            - timestamp
                            - size

            my_other_data_connector:
                class_name: ConfiguredAssetFilesystemDataConnector
                base_directory: {data_path}
                glob_directive: "*.csv"

                default_regex:
                    pattern: (.+)\\.csv
                    group_names:
                        - name
                assets:
                    users: {{}}

            my_runtime_data_connector:
                module_name: great_expectations.datasource.data_connector
                class_name: RuntimeDataConnector
                batch_identifiers:
                    - pipeline_stage_name
                    - airflow_run_id
    """

    # noinspection PyUnusedLocal
    context.test_yaml_config(name="my_datasource",
                             yaml_config=datasource_config,
                             pretty_print=False)
    # noinspection PyProtectedMember
    context._save_project_config()
    return context
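
Once this fixture has registered my_datasource on the returned context, batches can be requested from any of its data connectors with the v3-style BatchRequest API. A minimal sketch, assuming the 0.13-era import path; the asset name matches one of the CSV copies created above.

from great_expectations.core.batch import BatchRequest

batch_request = BatchRequest(
    datasource_name="my_datasource",
    data_connector_name="my_basic_data_connector",
    data_asset_name="Titanic_1911",
)
# The inferred connector should resolve exactly one batch for this asset.
batch_list = context.get_batch_list(batch_request=batch_request)
assert len(batch_list) == 1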
Code example #14
def test_cli_datasource_profile_with_data_asset_and_additional_batch_kwargs_with_limit(
        empty_data_context, titanic_sqlite_db, caplog):
    """
    User can pass additional batch kwargs (e.g., limit) to a sql backend.
    Here we are verifying that passing "limit" affects the query correctly -
    the row count in the batch that the profiler uses to profile the data asset
    must match the limit passed by the user.
    """
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource_and_credentials_to_context(
        context, datasource_name, titanic_sqlite_db)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "-d",
            project_root_dir,
            "--data-assets",
            "main.titanic",
            "--additional-batch-kwargs",
            '{"limit": 97}',
            "--no-view",
        ],
        input="Y\n",
        catch_exceptions=False,
    )

    stdout = result.stdout
    assert result.exit_code == 0
    assert "Profiling '{}'".format(datasource_name) in stdout
    assert "The following Data Docs sites will be built:\n" in stdout
    assert "local_site:" in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (suites[0].expectation_suite_name ==
            "wow_a_datasource.default.main.titanic.BasicDatasetProfiler")

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert (validation.meta["expectation_suite_name"] ==
            "wow_a_datasource.default.main.titanic.BasicDatasetProfiler")
    assert validation.success is False

    row_count_validation_results = [
        validation_result for validation_result in validation.results
        if validation_result.expectation_config.expectation_type ==
        "expect_table_row_count_to_be_between"
    ]
    assert len(row_count_validation_results) == 1
    assert row_count_validation_results[0].result["observed_value"] == 97

    assert "Preparing column 1 of 7" in caplog.messages[0]
    assert len(caplog.messages) == 10
    assert_no_tracebacks(result)
Code example #15
def test_cli_datasource_list(caplog, empty_data_context, filesystem_csv_2):
    """Test an empty project and after adding a single datasource."""
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )

    stdout = result.stdout.strip()
    assert "No Datasources found" in stdout
    assert context.list_datasources() == []

    base_directory = str(filesystem_csv_2)

    context.add_datasource(
        "wow_a_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": base_directory,
            }
        },
    )

    datasources = context.list_datasources()

    assert datasources == [
        {
            "name": "wow_a_datasource",
            "class_name": "PandasDatasource",
            "data_asset_type": {
                "class_name": "PandasDataset",
                "module_name": "great_expectations.dataset",
            },
            "batch_kwargs_generators": {
                "subdir_reader": {
                    "base_directory": base_directory,
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                }
            },
            "module_name": "great_expectations.datasource",
        }
    ]

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )
    expected_output = """
1 Datasource found:

 - name: wow_a_datasource
   module_name: great_expectations.datasource
   class_name: PandasDatasource
   batch_kwargs_generators:
     subdir_reader:
       class_name: SubdirReaderBatchKwargsGenerator
       base_directory: {}
   data_asset_type:
     module_name: great_expectations.dataset
     class_name: PandasDataset""".format(
        base_directory
    ).strip()
    stdout = result.stdout.strip()
    assert stdout == expected_output
    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #16
def test_suite_demo_one_datasource_without_generator_without_suite_name_argument(
        mock_webbrowser, mock_subprocess, caplog, empty_data_context,
        filesystem_csv_2):
    """
    We call the "suite demo" command without the suite name argument

    The command should:

    - NOT prompt us to choose a datasource (because there is only one)
    - prompt us only to enter the path (the datasource has no generator
     configured, so there is no generator-provided list of available data
     assets to choose from).
    - We enter the path of the file we want the command to use as the batch to
    create the expectation suite.
    - prompt us to enter the name of the expectation suite that will be
    created
    - open Data Docs
    - NOT open jupyter
    """
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
    )

    context = empty_data_context
    root_dir = context.root_directory
    context = DataContext(root_dir)
    runner = CliRunner(mix_stderr=False)
    csv_path = os.path.join(filesystem_csv_2, "f1.csv")
    result = runner.invoke(
        cli,
        ["suite", "demo", "-d", root_dir],
        input=f"{csv_path}\nmy_new_suite\n\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Enter the path" in stdout
    assert "Name the new expectation suite [f1.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        in stdout)
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "The following Data Docs sites were built" in stdout
    assert "A new Expectation suite 'my_new_suite' was added to your project" in stdout

    obs_urls = context.get_docs_sites_urls()

    assert len(obs_urls) == 1
    assert ("great_expectations/uncommitted/data_docs/local_site/index.html"
            in obs_urls[0]["site_url"])

    expected_index_path = os.path.join(root_dir, "uncommitted", "data_docs",
                                       "local_site", "index.html")
    assert os.path.isfile(expected_index_path)

    expected_suite_path = os.path.join(root_dir, "expectations",
                                       "my_new_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 1
    assert mock_subprocess.call_count == 0

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #17
def test_cli_datasource_profile_with_skip_prompt_flag(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )

    not_so_empty_data_context = empty_data_context

    project_root_dir = not_so_empty_data_context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["datasource", "profile", "-d", project_root_dir, "--no-view", "-y"],
        input="Y\n",
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    stdout = result.stdout

    assert (
        "Profiling 'my_datasource' will create expectations and documentation."
        in stdout
    )
    assert "Would you like to profile 'my_datasource'" not in stdout
    assert (
        "Great Expectations is building Data Docs from the data you just profiled!"
        in stdout
    )

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert (
        validation.meta["expectation_suite_name"]
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )
    assert validation.success is False
    assert len(validation.results) == 8

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)
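
The profiled suite name asserted above follows the pattern <datasource>.<generator>.<data asset>.<profiler class>. A minimal sketch (the project path is a placeholder) of loading that suite after profiling:

from great_expectations import DataContext

# Fetch the suite the BasicDatasetProfiler generated for data asset "f1".
context = DataContext("great_expectations")
suite = context.get_expectation_suite(
    "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
)
print(len(suite.expectations))
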
Code example #18
def test_suite_demo_multiple_datasources_with_generator_without_suite_name_argument(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    site_builder_data_context_with_html_store_titanic_random,
):
    """
    We call the "suite demo" command without the suite name argument

    - The data context has two datasources - we choose one of them.
    - It has a generator configured. We choose to use the generator and select a
    generator asset from the list.
    - The command should prompt us to enter the name of the expectation suite
    that will be created.
    - open Data Docs
    - NOT open jupyter
    """
    root_dir = site_builder_data_context_with_html_store_titanic_random.root_directory
    os.chdir(root_dir)
    context = DataContext(root_dir)
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "demo", "-d", root_dir],
        input="1\n1\n1\nmy_new_suite\n\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert ("""Select a datasource
    1. random
    2. titanic""" in stdout)
    assert ("""Which data would you like to use?
    1. f1 (file)
    2. f2 (file)""" in stdout)
    assert "Name the new expectation suite [f1.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        in stdout)
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "The following Data Docs sites were built" in stdout
    assert "A new Expectation suite 'my_new_suite' was added to your project" in stdout

    obs_urls = context.get_docs_sites_urls()

    assert len(obs_urls) == 1
    assert ("great_expectations/uncommitted/data_docs/local_site/index.html"
            in obs_urls[0]["site_url"])

    expected_index_path = os.path.join(root_dir, "uncommitted", "data_docs",
                                       "local_site", "index.html")
    assert os.path.isfile(expected_index_path)

    expected_suite_path = os.path.join(root_dir, "expectations",
                                       "my_new_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 1
    assert mock_subprocess.call_count == 0

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #19
def test_cli_datasource_profile_with_valid_data_asset_arg(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    context = empty_data_context

    project_root_dir = context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "my_datasource",
            "--data-assets",
            "f1",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        input="\n",
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "Profiling 'my_datasource'" in stdout
    assert "The following Data Docs sites will be built:\n" in stdout
    assert "local_site:" in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    suite_name = validation.meta["expectation_suite_name"]
    assert suite_name == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    assert validation.success is False
    assert len(validation.results) == 8

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)
Code example #20
def test_suite_edit_multiple_datasources_with_generator_with_no_additional_args_with_suite_without_citations(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    site_builder_data_context_with_html_store_titanic_random,
):
    """
    Here we verify that the "suite edit" command helps the user to specify the batch
    kwargs when it is called without the optional arguments that specify the batch.

    First, we call the "suite new" command to create the expectation suite our test
    will edit - this step is just setup.

    We call the "suite edit" command without any optional arguments. This means that
    the command will help us specify the batch kwargs interactively.

    The data context has two datasources - we choose one of them. It has a generator
    configured. We choose to use the generator and select a generator asset from the list.

    The command should:
    - NOT open Data Docs
    - open jupyter
    """
    root_dir = site_builder_data_context_with_html_store_titanic_random.root_directory
    os.chdir(root_dir)
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "demo", "-d", root_dir, "--suite", "foo_suite"],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert mock_subprocess.call_count == 0
    mock_webbrowser.reset_mock()
    mock_subprocess.reset_mock()

    # remove the citations from the suite
    context = DataContext(root_dir)
    suite = context.get_expectation_suite("foo_suite")
    assert isinstance(suite, ExpectationSuite)
    suite.meta.pop("citations")
    context.save_expectation_suite(suite)

    # Actual testing really starts here
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "suite",
            "edit",
            "foo_suite",
            "-d",
            root_dir,
        ],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "A batch of data is required to edit the suite" in stdout
    assert "Select a datasource" in stdout
    assert "Which data would you like to use" in stdout

    expected_notebook_path = os.path.join(root_dir, "uncommitted",
                                          "edit_foo_suite.ipynb")
    assert os.path.isfile(expected_notebook_path)

    expected_suite_path = os.path.join(root_dir, "expectations",
                                       "foo_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 1

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #21
def test_cli_datasource_new(mock_subprocess, caplog, monkeypatch,
                            empty_data_context, filesystem_csv_2):
    context = empty_data_context
    root_dir = context.root_directory
    assert context.list_datasources() == []

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(root_dir))
    result = runner.invoke(
        cli,
        "--v3-api datasource new",
        input=f"1\n1\n{filesystem_csv_2}\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert context.list_datasources() == []

    assert "What data would you like Great Expectations to connect to?" in stdout
    assert "What are you processing your files with?" in stdout

    assert result.exit_code == 0

    uncommitted_dir = os.path.join(root_dir, context.GE_UNCOMMITTED_DIR)
    expected_notebook = os.path.join(uncommitted_dir, "datasource_new.ipynb")
    assert os.path.isfile(expected_notebook)
    mock_subprocess.assert_called_once_with(
        ["jupyter", "notebook", expected_notebook])

    # Run notebook
    with open(expected_notebook) as f:
        nb = nbformat.read(f, as_version=4)
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    del context
    context = DataContext(root_dir)

    assert len(context.list_datasources()) == 1

    assert context.list_datasources() == [{
        "name": "my_datasource",
        "class_name": "Datasource",
        "module_name": "great_expectations.datasource",
        "execution_engine": {
            "module_name": "great_expectations.execution_engine",
            "class_name": "PandasExecutionEngine",
        },
        "data_connectors": {
            "my_datasource_example_data_connector": {
                "default_regex": {
                    "group_names": "data_asset_name",
                    "pattern": "(.*)",
                },
                "module_name": "great_expectations.datasource.data_connector",
                "base_directory": "../../filesystem_csv_2",
                "class_name": "InferredAssetFilesystemDataConnector",
            }
        },
    }]
    assert_no_logging_messages_or_tracebacks(caplog, result)
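
A sketch of registering an equivalent v3 datasource programmatically, assuming add_datasource accepts the same configuration shown in the assertion above as keyword arguments; the project path and base_directory are placeholders:

from great_expectations import DataContext

# Open the project and add a Pandas-backed v3 datasource with an inferred
# filesystem data connector, matching the config the notebook produces above.
context = DataContext("great_expectations")
context.add_datasource(
    "my_datasource",
    class_name="Datasource",
    module_name="great_expectations.datasource",
    execution_engine={
        "module_name": "great_expectations.execution_engine",
        "class_name": "PandasExecutionEngine",
    },
    data_connectors={
        "my_datasource_example_data_connector": {
            "class_name": "InferredAssetFilesystemDataConnector",
            "module_name": "great_expectations.datasource.data_connector",
            "base_directory": "../data",  # placeholder
            "default_regex": {"group_names": "data_asset_name", "pattern": "(.*)"},
        }
    },
)
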
Code example #22
def test_cli_datasource_new_connection_string(
    mock_subprocess, mock_emit, empty_data_context, empty_sqlite_db, caplog, monkeypatch
):
    monkeypatch.delenv(
        "GE_USAGE_STATS", raising=False
    )  # Undo the project-wide test default
    root_dir = empty_data_context.root_directory
    context: DataContext = empty_data_context
    assert context.list_datasources() == []

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource new",
        input="2\n6\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert "What data would you like Great Expectations to connect to?" in stdout

    assert result.exit_code == 0

    uncommitted_dir = os.path.join(root_dir, context.GE_UNCOMMITTED_DIR)
    expected_notebook = os.path.join(uncommitted_dir, "datasource_new.ipynb")

    assert os.path.isfile(expected_notebook)
    mock_subprocess.assert_called_once_with(["jupyter", "notebook", expected_notebook])

    expected_call_args_list = [
        mock.call(
            {"event_payload": {}, "event": "data_context.__init__", "success": True}
        ),
        mock.call(
            {
                "event": "cli.datasource.new.begin",
                "event_payload": {"api_version": "v3"},
                "success": True,
            }
        ),
        mock.call(
            {
                "event": "cli.new_ds_choice",
                "event_payload": {
                    "type": "sqlalchemy",
                    "db": "other",
                    "api_version": "v3",
                },
                "success": True,
            }
        ),
        mock.call(
            {
                "event": "cli.datasource.new.end",
                "event_payload": {"api_version": "v3"},
                "success": True,
            }
        ),
    ]

    assert mock_emit.call_args_list == expected_call_args_list
    assert mock_emit.call_count == len(expected_call_args_list)

    # Run notebook
    with open(expected_notebook) as f:
        nb = nbformat.read(f, as_version=4)

    # mock the user adding a connection string into the notebook by overwriting the right cell

    assert "connection_string" in nb["cells"][5]["source"]
    nb["cells"][5]["source"] = '  connection_string = "sqlite://"'
    ep = ExecutePreprocessor(timeout=60, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    del context
    context = DataContext(root_dir)

    assert context.list_datasources() == [
        {
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "connection_string": "sqlite://",
                "class_name": "SqlAlchemyExecutionEngine",
            },
            "class_name": "Datasource",
            "data_connectors": {
                "default_runtime_data_connector_name": {
                    "batch_identifiers": ["default_identifier_name"],
                    "class_name": "RuntimeDataConnector",
                    "module_name": "great_expectations.datasource.data_connector",
                },
                "default_inferred_data_connector_name": {
                    "class_name": "InferredAssetSqlDataConnector",
                    "module_name": "great_expectations.datasource.data_connector",
                    "include_schema_name": True,
                },
            },
            "name": "my_datasource",
        }
    ]

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #23
    def __init__(self, data_context=None, context_root_dir=None):
        assert (
            data_context or context_root_dir
        ), "Please provide a data_context object or a context_root_dir."

        self.data_context = data_context or DataContext(
            context_root_dir=context_root_dir
        )

        self.upgrade_log = {
            "skipped_validations_stores": {
                "database_store_backends": [],
                "unsupported": [],
            },
            "skipped_docs_validations_stores": {"unsupported": []},
            "skipped_metrics_stores": {
                "database_store_backends": [],
                "unsupported": [],
            },
            "exceptions": [
                # {
                #     "validation_store_name": store_name
                #     "src": src_url,
                #     "dest": dest_url,
                #     "exception_message": exception_message,
                # },
                # {
                #     "site_name": site_name,
                #     "src": src_url,
                #     "dest": dest_url,
                #     "exception_message": exception_message,
                # }
            ],
            "upgraded_validations_stores": {
                # STORE_NAME: {
                #     "validations_updated": [{
                #         "src": src_url,
                #         "dest": dest_url
                #     }],
                #     "exceptions": BOOL
                # }
            },
            "upgraded_docs_site_validations_stores": {
                # SITE_NAME: {
                #     "validation_result_pages_updated": [{
                #         src: src_url,
                #         dest: dest_url
                #     }],
                #     "exceptions": BOOL
                # }
            },
        }

        self.upgrade_checklist = {
            "validations_store_backends": {},
            "docs_validations_store_backends": {},
        }

        self.validation_run_times = {}

        self.run_time_setters_by_backend_type = {
            TupleFilesystemStoreBackend: self._get_tuple_filesystem_store_backend_run_time,
            TupleS3StoreBackend: self._get_tuple_s3_store_backend_run_time,
            TupleGCSStoreBackend: self._get_tuple_gcs_store_backend_run_time,
        }

        self._generate_upgrade_checklist()
Code example #24
def test_suite_edit_multiple_datasources_with_generator_with_no_additional_args_with_suite_containing_citations(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    monkeypatch,
    site_builder_data_context_v013_with_html_store_titanic_random,
):
    """
    Here we verify that the "suite edit" command uses the batch kwargs found in
    citations in the existing suite when it is called without the optional
    arguments that specify the batch.

    First, we call the "suite new" command to create the expectation suite our
    test will edit - this step is just setup.

    We call the "suite edit" command without any optional arguments.

    The command should:
    - NOT open Data Docs
    - NOT open jupyter
    """
    context = site_builder_data_context_v013_with_html_store_titanic_random
    root_dir = context.root_directory
    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        [
            "--v3-api",
            "suite",
            "new",
            "--suite",
            "foo_suite",
            "--no-jupyter",
        ],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )
    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 0
    mock_subprocess.reset_mock()
    mock_webbrowser.reset_mock()
    assert result.exit_code == 0
    context = DataContext(root_dir)
    suite = context.get_expectation_suite("foo_suite")
    assert isinstance(suite, ExpectationSuite)

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        ["--v3-api", "suite", "edit", "foo_suite"],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "Select a datasource" not in stdout
    assert "Which data would you like to use" not in stdout

    expected_notebook_path = os.path.join(
        root_dir, "uncommitted", "edit_foo_suite.ipynb"
    )
    assert os.path.isfile(expected_notebook_path)

    expected_suite_path = os.path.join(root_dir, "expectations", "foo_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 1

    assert_no_logging_messages_or_tracebacks(
        my_caplog=caplog,
        click_result=result,
        allowed_deprecation_message=VALIDATION_OPERATORS_DEPRECATION_MESSAGE,
    )
Code example #25
def validation_operator_run(name, run_name, validation_config_file, suite,
                            directory) -> None:
    # Note: although the long lines here aren't Pythonic, they render best when Click does the line wraps.
    """
    Run a validation operator against some data.

    There are two modes to run this command:

    1. Interactive (good for development):

        Specify the name of the validation operator using the --name argument
        and the name of the expectation suite using the --suite argument.

        The cli will help you specify the batch of data that you want to
        validate interactively.


    2. Non-interactive (good for production):

        Use the `--validation_config_file` argument to specify the path of the validation configuration JSON file. This file can be used to instruct a validation operator to validate multiple batches of data and use multiple expectation suites to validate each batch.

        Learn how to create a validation config file here:
        https://great-expectations.readthedocs.io/en/latest/command_line.html#great-expectations-validation-operator-run-validation-config-file-validation-config-file-path

        This command exits with 0 if the validation operator ran and the "success" attribute in its return object is True. Otherwise, the command exits with 1.

    To learn more about validation operators, go here:
    https://great-expectations.readthedocs.io/en/latest/features/validation.html#validation-operators
    """

    try:
        context = DataContext(directory)
    except ge_exceptions.ConfigNotFoundError as err:
        cli_message(f"Failed to process <red>{err.message}</red>")
        sys.exit(1)

    try:
        if validation_config_file is not None:
            try:
                with open(validation_config_file) as f:
                    validation_config = json.load(f)
            except (OSError, json_parse_exception) as e:
                cli_message(
                    f"Failed to process the --validation_config_file argument: <red>{e}</red>"
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)

            validation_config_error_message = _validate_valdiation_config(
                validation_config)
            if validation_config_error_message is not None:
                cli_message(
                    "<red>The validation config in {:s} is misconfigured: {:s}</red>"
                    .format(validation_config_file,
                            validation_config_error_message))
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)

        else:
            if suite is None:
                cli_message("""
Please use --suite argument to specify the name of the expectation suite.
Call `great_expectation suite list` command to list the expectation suites in your project.
""")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(0)

            suite = toolkit.load_expectation_suite(
                context, suite, "cli.validation_operator.run")

            if name is None:
                cli_message("""
Please use --name argument to specify the name of the validation operator.
Call `great_expectation validation-operator list` command to list the operators in your project.
""")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)
            else:
                if name not in context.list_validation_operator_names():
                    cli_message(f"""
Could not find a validation operator {name}.
Call `great_expectation validation-operator list` command to list the operators in your project.
""")
                    send_usage_message(
                        data_context=context,
                        event="cli.validation_operator.run",
                        api_version="v2",
                        success=False,
                    )
                    sys.exit(1)

            batch_kwargs = None

            cli_message("""
Let us help you specify the batch of data you want the validation operator to validate."""
                        )

            try:
                data_source = toolkit.select_datasource(context)
            except ValueError as ve:
                cli_message(f"<red>{ve}</red>")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)

            if not data_source:
                cli_message("<red>No datasources found in the context.</red>")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)

            if batch_kwargs is None:
                (
                    datasource_name,
                    batch_kwargs_generator,
                    data_asset,
                    batch_kwargs,
                ) = get_batch_kwargs(
                    context,
                    datasource_name=data_source.name,
                    batch_kwargs_generator_name=None,
                    data_asset_name=None,
                    additional_batch_kwargs=None,
                )

            validation_config = {
                "validation_operator_name": name,
                "batches": [
                    {
                        "batch_kwargs": batch_kwargs,
                        "expectation_suite_names": [suite.expectation_suite_name],
                    }
                ],
            }

        try:
            validation_operator_name = validation_config[
                "validation_operator_name"]
            batches_to_validate = []
            for entry in validation_config["batches"]:
                for expectation_suite_name in entry["expectation_suite_names"]:
                    batch = context.get_batch(entry["batch_kwargs"],
                                              expectation_suite_name)
                    batches_to_validate.append(batch)

            if run_name is None:
                run_name = datetime.datetime.now(
                    datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")

            run_id = RunIdentifier(run_name=run_name)

            if suite is None:
                results = context.run_validation_operator(
                    validation_operator_name,
                    assets_to_validate=batches_to_validate,
                    run_id=run_id,
                )
            else:
                if suite.evaluation_parameters is None:
                    results = context.run_validation_operator(
                        validation_operator_name,
                        assets_to_validate=batches_to_validate,
                        run_id=run_id,
                    )
                else:
                    results = context.run_validation_operator(
                        validation_operator_name,
                        assets_to_validate=batches_to_validate,
                        run_id=run_id,
                        evaluation_parameters=suite.evaluation_parameters,
                    )
        except (ge_exceptions.DataContextError, OSError, SQLAlchemyError) as e:
            cli_message(f"<red>{e}</red>")
            send_usage_message(
                data_context=context,
                event="cli.validation_operator.run",
                api_version="v2",
                success=False,
            )
            sys.exit(1)

        if not results["success"]:
            cli_message("Validation failed!")
            send_usage_message(
                data_context=context,
                event="cli.validation_operator.run",
                api_version="v2",
                success=True,
            )
            sys.exit(1)
        else:
            cli_message("Validation succeeded!")
            send_usage_message(
                data_context=context,
                event="cli.validation_operator.run",
                api_version="v2",
                success=True,
            )
            sys.exit(0)
    except Exception as e:
        send_usage_message(
            data_context=context,
            event="cli.validation_operator.run",
            api_version="v2",
            success=False,
        )
        raise e
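
A minimal sketch of a validation config file for the non-interactive mode described in the docstring above; its structure mirrors the validation_config dict the command builds internally, and the operator name, batch_kwargs, and suite name below are illustrative placeholders:

import json

validation_config = {
    "validation_operator_name": "action_list_operator",
    "batches": [
        {
            "batch_kwargs": {"datasource": "my_datasource", "path": "data/f1.csv"},
            "expectation_suite_names": ["my_new_suite"],
        }
    ],
}

# Write the config to disk so it can be passed via --validation_config_file.
with open("validation_config.json", "w") as f:
    json.dump(validation_config, f, indent=2)
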
Code example #26
def test_suite_new_creates_empty_suite(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    monkeypatch,
    data_context_parameterized_expectation_suite,
    filesystem_csv_2,
):
    """
    Running "suite new" should:
    - make an empty suite
    - open jupyter
    - NOT open data docs
    """
    context = data_context_parameterized_expectation_suite
    project_root_dir = context.root_directory
    os.mkdir(os.path.join(project_root_dir, "uncommitted"))
    root_dir = project_root_dir
    runner = CliRunner(mix_stderr=False)
    csv = os.path.join(filesystem_csv_2, "f1.csv")
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        ["--v3-api", "suite", "new", "--suite", "foo"],
        input=f"{csv}\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Enter the path" in stdout
    assert "Name the new expectation suite" not in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        not in stdout
    )
    assert "Generating example Expectation Suite..." not in stdout
    assert "The following Data Docs sites were built" not in stdout
    assert (
        "Great Expectations will create a new Expectation Suite 'foo' and store it here"
        in stdout
    )
    assert (
        "Because you requested an empty suite, we'll open a notebook for you now to edit it!"
        in stdout
    )

    expected_suite_path = os.path.join(root_dir, "expectations", "foo.json")
    assert os.path.isfile(expected_suite_path)

    expected_notebook = os.path.join(root_dir, "uncommitted", "edit_foo.ipynb")
    assert os.path.isfile(expected_notebook)

    context = DataContext(root_dir)
    assert "foo" in context.list_expectation_suite_names()
    suite = context.get_expectation_suite("foo")
    assert suite.expectations == []
    citations = suite.get_citations()
    citations[0].pop("citation_date")
    assert citations[0] == {
        "batch_kwargs": {
            "data_asset_name": "f1",
            "datasource": "mydatasource",
            "path": csv,
            "reader_method": "read_csv",
        },
        "batch_markers": None,
        "batch_parameters": None,
        "comment": "New suite added via CLI",
    }

    assert mock_subprocess.call_count == 1
    call_args = mock_subprocess.call_args[0][0]
    assert call_args[0] == "jupyter"
    assert call_args[1] == "notebook"
    assert expected_notebook in call_args[2]

    assert mock_webbrowser.call_count == 0

    assert_no_logging_messages_or_tracebacks(
        my_caplog=caplog,
        click_result=result,
        allowed_deprecation_message=VALIDATION_OPERATORS_DEPRECATION_MESSAGE,
    )
Code example #27
def test_init_on_existing_project_with_no_datasources_should_continue_init_flow_and_add_one(
    mock_webbrowser, caplog, initialized_sqlite_project, titanic_sqlite_db_file,
):
    project_dir = initialized_sqlite_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    _remove_all_datasources(ge_dir)
    os.remove(os.path.join(ge_dir, "expectations", "warning.json"))
    context = DataContext(ge_dir)
    assert not context.list_expectation_suites()

    runner = CliRunner(mix_stderr=False)
    url = "sqlite:///{}".format(titanic_sqlite_db_file)
    with pytest.warns(
        UserWarning, match="Warning. An existing `great_expectations.yml` was found"
    ):
        result = runner.invoke(
            cli,
            ["init", "-d", project_dir],
            input="\n\n2\n6\nsqlite\n{}\n\n\n1\nmy_suite\n\n\n\n".format(url),
            catch_exceptions=False,
        )
    stdout = result.stdout

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/my_suite/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )

    assert "Error: invalid input" not in stdout
    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert (
        "Next, we will configure database credentials and store them in the `sqlite` section"
        in stdout
    )
    assert "What is the url/connection string for the sqlalchemy connection?" in stdout
    assert "Which table would you like to use?" in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "This looks like an existing project that" not in stdout

    config = _load_config_file(os.path.join(ge_dir, DataContext.GE_YML))
    assert "sqlite" in config["datasources"].keys()

    context = DataContext(ge_dir)
    assert context.list_datasources() == [
        {
            "class_name": "SqlAlchemyDatasource",
            "name": "sqlite",
            "module_name": "great_expectations.datasource",
            "credentials": {"url": url},
            "data_asset_type": {
                "class_name": "SqlAlchemyDataset",
                "module_name": "great_expectations.dataset",
            },
        }
    ]
    assert context.list_expectation_suites()[0].expectation_suite_name == "my_suite"
    assert len(context.list_expectation_suites()) == 1

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #28
def test_suite_edit_one_datasources_no_generator_with_no_additional_args_and_no_citations(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    monkeypatch,
    empty_data_context,
    filesystem_csv_2,
):
    """
    Here we verify that the "suite edit" command helps the user to specify the batch
    kwargs when it is called without the optional arguments that specify the batch.

    First, we call the "suite new" command to create the expectation suite our test
    will edit - this step is just setup.

    We call the "suite edit" command without any optional arguments. This means that
    the command will help us specify the batch kwargs interactively.

    The data context has one datasource. The datasource has no generators
    configured. The command prompts us to enter the file path.
    """
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
    )

    context = empty_data_context
    project_root_dir = context.root_directory

    root_dir = project_root_dir
    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        ["--v3-api", "suite", "new", "--no-jupyter"],
        input="{:s}\nmy_new_suite\n\n".format(os.path.join(filesystem_csv_2, "f1.csv")),
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 0
    mock_subprocess.reset_mock()
    mock_webbrowser.reset_mock()
    assert result.exit_code == 0
    assert (
        "Great Expectations will create a new Expectation Suite 'my_new_suite' and store it here:"
        in stdout
    )

    # remove the citations from the suite
    context = DataContext(project_root_dir)
    suite = context.get_expectation_suite("my_new_suite")
    suite.meta.pop("citations")
    context.save_expectation_suite(suite)

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        [
            "--v3-api",
            "suite",
            "edit",
            "my_new_suite",
        ],
        input="{:s}\n\n".format(os.path.join(filesystem_csv_2, "f1.csv")),
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "Select a datasource" not in stdout
    assert "Which data would you like to use" not in stdout
    assert "Enter the path" in stdout

    expected_notebook_path = os.path.join(
        root_dir, "uncommitted", "edit_my_new_suite.ipynb"
    )
    assert os.path.isfile(expected_notebook_path)

    expected_suite_path = os.path.join(root_dir, "expectations", "my_new_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 1

    assert_no_logging_messages_or_tracebacks(
        my_caplog=caplog,
        click_result=result,
    )
Code example #29
def test_notebook_execution_with_pandas_backend(
    titanic_data_context_no_data_docs_no_checkpoint_store,
):
    """
    This tests that the notebook is written to disk and executes without error.

    To set this test up we:
    - create a scaffold notebook
    - verify that no validations have happened

    We then:
    - execute that notebook (note that this will raise errors such as
    CellExecutionError if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    # Since we'll run the notebook, we use a context with no data docs to avoid
    # the renderer's default behavior of building and opening docs, which is not
    # part of this test.
    context = titanic_data_context_no_data_docs_no_checkpoint_store
    root_dir = context.root_directory
    uncommitted_dir = os.path.join(root_dir, "uncommitted")
    suite_name = "my_suite"
    suite = context.create_expectation_suite(suite_name)

    csv_path = os.path.join(root_dir, "..", "data", "Titanic.csv")
    batch_kwargs = {"datasource": "mydatasource", "path": csv_path}

    # Sanity check test setup
    assert context.list_expectation_suite_names() == [suite_name]
    assert context.list_datasources() == [{
        "module_name": "great_expectations.datasource",
        "class_name": "PandasDatasource",
        "data_asset_type": {
            "module_name": "great_expectations.dataset",
            "class_name": "PandasDataset",
        },
        "batch_kwargs_generators": {
            "mygenerator": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": "../data",
            }
        },
        "name": "mydatasource",
    }]
    assert context.get_validation_result(suite_name) == {}
    notebook_path = os.path.join(uncommitted_dir, f"{suite_name}.ipynb")
    assert not os.path.isfile(notebook_path)

    # Create notebook
    renderer = SuiteScaffoldNotebookRenderer(
        titanic_data_context_no_data_docs_no_checkpoint_store, suite,
        batch_kwargs)
    renderer.render_to_disk(notebook_path)
    assert os.path.isfile(notebook_path)

    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)

    # Run notebook
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    # Useful to inspect executed notebook
    output_notebook = os.path.join(uncommitted_dir, "output.ipynb")
    with open(output_notebook, "w") as f:
        nbformat.write(nb, f)

    # Assertions about output
    context = DataContext(root_dir)
    obs_validation_result = context.get_validation_result(suite_name)
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100,
    }
    suite = context.get_expectation_suite(suite_name)

    assert suite.expectations
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations
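
Examples #21, #22, and #29 all execute a notebook in-process with nbformat and nbconvert; a minimal sketch of that pattern factored into a reusable helper (the function name is ours):

import nbformat
from nbconvert.preprocessors import ExecutePreprocessor


def execute_notebook(notebook_path, working_dir, timeout=600):
    """Run every cell of a notebook and return the executed NotebookNode."""
    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)
    ep = ExecutePreprocessor(timeout=timeout, kernel_name="python3")
    # preprocess raises CellExecutionError if any cell fails.
    ep.preprocess(nb, {"metadata": {"path": working_dir}})
    return nb
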
Code example #30
def test_build_data_docs_skipping_index_does_not_build_index(
    tmp_path_factory,
):
    # TODO What's the latest and greatest way to use configs rather than my hackery?
    empty_directory = str(tmp_path_factory.mktemp("empty"))
    DataContext.create(empty_directory)
    ge_dir = os.path.join(empty_directory, DataContext.GE_DIR)
    context = DataContext(ge_dir)
    config = context.get_config()
    config.data_docs_sites = {
        "local_site": {
            "class_name": "SiteBuilder",
            "store_backend": {
                "class_name": "TupleFilesystemStoreBackend",
                "base_directory": os.path.join("uncommitted", "data_docs"),
            },
        },
    }
    context._project_config = config

    # TODO Workaround project config programmatic config manipulation
    #  statefulness issues by writing to disk and re-upping a new context
    context._save_project_config()
    del context
    context = DataContext(ge_dir)
    data_docs_dir = os.path.join(ge_dir, "uncommitted", "data_docs")
    index_path = os.path.join(data_docs_dir, "index.html")
    assert not os.path.isfile(index_path)

    context.build_data_docs(build_index=False)
    assert os.path.isdir(os.path.join(data_docs_dir, "static"))
    assert not os.path.isfile(index_path)