def _suite_scaffold(suite: str, directory: str, jupyter: bool) -> None:
    """Create a new expectation suite and render its scaffold notebook to disk.

    The process exits with status 1, after sending a failed usage message,
    when a suite named ``suite`` already exists or when
    ``toolkit.select_datasource`` returns ``None``.

    Args:
        suite: Name of the expectation suite to create.
        directory: Forwarded to ``load_data_context_with_error_handling`` to
            obtain the data context.
        jupyter: If truthy, open the rendered notebook through
            ``toolkit.launch_jupyter_notebook``; otherwise only print the
            command that opens it.
    """
    usage_event = "cli.suite.scaffold"
    context = load_data_context_with_error_handling(directory)
    notebook_path = _get_notebook_path(context, f"scaffold_{suite}.ipynb")

    def report_usage(succeeded: bool) -> None:
        # Single place that emits the usage message for this command.
        send_usage_message(
            data_context=context, event=usage_event, success=succeeded
        )

    # Guard: refuse to scaffold over a suite that already exists.
    suite_already_exists = suite in context.list_expectation_suite_names()
    if suite_already_exists:
        toolkit.tell_user_suite_exists(suite)
        if os.path.isfile(notebook_path):
            cli_message(
                f"  - If you wish to adjust your scaffolding, you can open this notebook with jupyter: `{notebook_path}` <red>(Please note that if you run that notebook, you will overwrite your existing suite.)</red>"
            )
        report_usage(False)
        sys.exit(1)

    # Guard: a datasource is required to build batch kwargs.
    datasource = toolkit.select_datasource(context)
    if datasource is None:
        report_usage(False)
        sys.exit(1)

    new_suite = context.create_expectation_suite(suite)
    batch_kwargs = get_batch_kwargs(context, datasource_name=datasource.name)[3]
    SuiteScaffoldNotebookRenderer(context, new_suite, batch_kwargs).render_to_disk(
        notebook_path
    )

    if not jupyter:
        cli_message(
            f"To continue scaffolding this suite, run `jupyter notebook {notebook_path}`"
        )
    else:
        toolkit.launch_jupyter_notebook(notebook_path)

    report_usage(True)
def test_notebook_execution_with_pandas_backend(
        titanic_data_context_no_data_docs):
    """
    Check that the scaffold notebook is written to disk and runs without error.

    Setup:
    - create an expectation suite
    - verify that no validations have happened and no notebook exists yet
    - render the scaffold notebook to disk

    Exercise and verify:
    - execute the notebook (this raises errors such as CellExecutionError
      if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    # The notebook gets executed, so a context without data docs is used to
    # skip the renderer's default of building and opening docs; that is not
    # what this test covers.
    context = titanic_data_context_no_data_docs
    suite_name = "my_suite"
    project_root = context.root_directory
    uncommitted_dir = os.path.join(project_root, "uncommitted")
    notebook_path = os.path.join(uncommitted_dir, f"{suite_name}.ipynb")
    executed_notebook_path = os.path.join(uncommitted_dir, "output.ipynb")

    suite = context.create_expectation_suite(suite_name)
    batch_kwargs = {
        "datasource": "mydatasource",
        "path": os.path.join(project_root, "..", "data", "Titanic.csv"),
    }

    # Sanity check test setup
    expected_datasources = [
        {
            "module_name": "great_expectations.datasource",
            "class_name": "PandasDatasource",
            "data_asset_type": {
                "module_name": "great_expectations.dataset",
                "class_name": "PandasDataset",
            },
            "batch_kwargs_generators": {
                "mygenerator": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    "base_directory": "../data",
                }
            },
            "name": "mydatasource",
        }
    ]
    assert context.list_expectation_suite_names() == [suite_name]
    assert context.list_datasources() == expected_datasources
    assert context.get_validation_result(suite_name) == {}
    assert not os.path.isfile(notebook_path)

    # Create notebook
    SuiteScaffoldNotebookRenderer(context, suite, batch_kwargs).render_to_disk(
        notebook_path
    )
    assert os.path.isfile(notebook_path)

    with open(notebook_path) as notebook_file:
        notebook = nbformat.read(notebook_file, as_version=4)

    # Run notebook, using the uncommitted directory as its working path
    executor = ExecutePreprocessor(timeout=600, kernel_name="python3")
    resources = {"metadata": {"path": uncommitted_dir}}
    executor.preprocess(notebook, resources)

    # Keep the executed notebook around; useful for inspection
    with open(executed_notebook_path, "w") as executed_file:
        nbformat.write(notebook, executed_file)

    # Assertions about output, read through a context freshly loaded from disk
    reloaded_context = DataContext(project_root)
    validation_result = reloaded_context.get_validation_result(suite_name)
    expected_statistics = {
        "evaluated_expectations": 3,
        "successful_expectations": 3,
        "unsuccessful_expectations": 0,
        "success_percent": 100,
    }
    assert validation_result.statistics == expected_statistics
    assert reloaded_context.get_expectation_suite(suite_name).expectations
def test_render_snapshot_test(
        titanic_data_context_stats_enabled_no_config_store):
    """Compare the rendered scaffold notebook with a hand-maintained snapshot.

    NOTE: when updating this snapshot, be sure to keep the dynamic
    ``csv_path`` in the second cell; its value comes from the pytest fixture.
    """
    context = titanic_data_context_stats_enabled_no_config_store
    batch_kwargs = context.build_batch_kwargs(
        "mydatasource", "mygenerator", "Titanic")
    csv_path = batch_kwargs["path"]
    suite = context.create_expectation_suite("my_suite")

    obs = SuiteScaffoldNotebookRenderer(context, suite,
                                        batch_kwargs).render(None)
    assert isinstance(obs, nbformat.NotebookNode)

    def markdown_cell(source):
        # Expected shape of a markdown cell in the snapshot.
        return {"cell_type": "markdown", "source": source, "metadata": {}}

    def code_cell(source):
        # Expected shape of a not-yet-executed code cell in the snapshot.
        return {
            "cell_type": "code",
            "metadata": {},
            "execution_count": None,
            "source": source,
            "outputs": [],
        }

    expected = {
        "nbformat": 4,
        "nbformat_minor": 4,
        "metadata": {},
        "cells": [
            markdown_cell(
                """# Scaffold a new Expectation Suite (Experimental)
This process helps you avoid writing lots of boilerplate when authoring suites by allowing you to select columns you care about and letting a profiler write some candidate expectations for you to adjust.

**Expectation Suite Name**: `my_suite`

We'd love it if you'd **reach out to us on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)!"""
            ),
            code_cell(
                'import datetime\nimport great_expectations as ge\nimport great_expectations.jupyter_ux\nfrom great_expectations.checkpoint import LegacyCheckpoint\nfrom great_expectations.profile import BasicSuiteBuilderProfiler\nfrom great_expectations.data_context.types.resource_identifiers import (\n    ValidationResultIdentifier,\n)\n\ncontext = ge.data_context.DataContext()\n\nexpectation_suite_name = "my_suite"\nsuite = context.create_expectation_suite(\n    expectation_suite_name, overwrite_existing=True\n)\n\nbatch_kwargs = {\n    "path": "'
                + csv_path +
                '",\n    "datasource": "mydatasource",\n    "data_asset_name": "Titanic",\n}\nbatch = context.get_batch(batch_kwargs, suite)\nbatch.head()'
            ),
            markdown_cell(
                """## Select the columns on which you would like to scaffold expectations

Great Expectations will choose which expectations might make sense for a column based on the **data type** and **cardinality** of the data in each selected column.

Simply uncomment columns that are important. You can select multiple lines and
use a jupyter keyboard shortcut to toggle each line: **Linux/Windows**:
`Ctrl-/`, **macOS**: `Cmd-/`"""
            ),
            code_cell(
                "included_columns = [\n    # 'Unnamed: 0',\n    # 'Name',\n    # 'PClass',\n    # 'Age',\n    # 'Sex',\n    # 'Survived',\n    # 'SexCode'\n]"
            ),
            markdown_cell(
                """## Run the scaffolder

The suites generated here are **not meant to be production suites** - they are **scaffolds to build upon**.

**To get to a production grade suite, you will definitely want to [edit this
suite](https://docs.greatexpectations.io/en/latest/guides/how_to_guides/creating_and_editing_expectations/how_to_edit_an_expectation_suite_using_a_disposable_notebook.html?utm_source=notebook&utm_medium=scaffold_expectations)
after scaffolding gets you close to what you want.**

This is highly configurable depending on your goals. You can include or exclude
columns, and include or exclude expectation types (when applicable). [The
Expectation Glossary](https://docs.greatexpectations.io/en/latest/reference/glossary_of_expectations.html?utm_source=notebook&utm_medium=scaffold_expectations)
contains a list of possible expectations."""
            ),
            code_cell(
                '# Wipe the suite clean to prevent unwanted expectations in the batch\nsuite = context.create_expectation_suite(expectation_suite_name, overwrite_existing=True)\nbatch = context.get_batch(batch_kwargs, suite)\n\n# In the scaffold_config, included or excluded expectation names should be strings.\nscaffold_config = {\n    "included_columns": included_columns,\n    # "excluded_columns": [],\n    # "included_expectations": [],\n    # "excluded_expectations": [],\n}\nsuite, evr = BasicSuiteBuilderProfiler().profile(batch, profiler_configuration=scaffold_config)'
            ),
            markdown_cell(
                "## Save & review the scaffolded Expectation Suite\n\nLet's save the scaffolded expectation suite as a JSON file in the\n`great_expectations/expectations` directory of your project and rebuild the Data\n Docs site to make it easy to review the scaffolded suite."
            ),
            code_cell(
                "context.save_expectation_suite(suite, expectation_suite_name)\n\nresults = "
                'LegacyCheckpoint(\n    name="_temp_checkpoint",\n    data_context=context,\n    batches=[\n        {\n          "batch_kwargs": batch_kwargs,\n          "expectation_suite_names": [expectation_suite_name]\n        }\n    ],\n    validation_operator_name="action_list_operator"\n).run()\nvalidation_result_identifier = results.list_validation_result_identifiers()[0]\ncontext.build_data_docs()\ncontext.open_data_docs(validation_result_identifier)'
            ),
            markdown_cell(
                "## Next steps\nAfter you review this scaffolded Expectation Suite in Data Docs you\nshould edit this suite to make finer grained adjustments to the expectations.\nThis can be done by running `great_expectations suite edit my_suite`."
            ),
        ],
    }
    # The minor format version is excluded from the comparison on both sides.
    del expected["nbformat_minor"]
    del obs["nbformat_minor"]

    # Compare cell by cell first, ignoring any per-cell "id" field, then
    # compare the notebooks as a whole.
    for observed_cell, wanted_cell in zip(obs["cells"], expected["cells"]):
        observed_cell.pop("id", None)
        assert observed_cell == wanted_cell
    assert obs == expected
Beispiel #4
0
def test_render_snapshot_test(titanic_data_context):
    """Compare the rendered scaffold notebook (BETA wording) with a snapshot.

    NOTE: when updating this snapshot, be sure to keep the dynamic
    ``csv_path`` in the second cell; its value comes from the pytest fixture.

    Per-cell ``id`` fields are removed from the rendered notebook before the
    comparison: nbformat 5.1+ attaches a randomly generated id to every new
    cell, which a static snapshot cannot contain.
    """
    batch_kwargs = titanic_data_context.build_batch_kwargs(
        "mydatasource", "mygenerator", "Titanic")
    csv_path = batch_kwargs["path"]
    suite_name = "my_suite"
    suite = titanic_data_context.create_expectation_suite(suite_name)
    renderer = SuiteScaffoldNotebookRenderer(titanic_data_context, suite,
                                             batch_kwargs)
    obs = renderer.render(None)
    assert isinstance(obs, nbformat.NotebookNode)
    expected = {
        "nbformat": 4,
        "nbformat_minor": 4,
        "metadata": {},
        "cells": [
            {
                "cell_type": "markdown",
                "source":
                "# Scaffold a new Expectation Suite (BETA)\nUse this notebook to scaffold a new expectations suite. This process helps you\navoid writing lots of boilerplate when authoring suites.\n\n**Expectation Suite Name**: `my_suite`\n\nWe'd love it if you **reach out to us on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)",
                "metadata": {},
            },
            {
                "cell_type": "code",
                "metadata": {},
                "execution_count": None,
                "source":
                'from datetime import datetime\nimport great_expectations as ge\nimport great_expectations.jupyter_ux\nfrom great_expectations.profile import BasicSuiteBuilderProfiler\nfrom great_expectations.data_context.types.resource_identifiers import (\n    ValidationResultIdentifier,\n)\n\ncontext = ge.data_context.DataContext()\n\nexpectation_suite_name = "my_suite"\nsuite = context.create_expectation_suite(\n    expectation_suite_name, overwrite_existing=True\n)\n\nbatch_kwargs = {\n    "path": "'
                + csv_path +
                '",\n    "datasource": "mydatasource",\n}\nbatch = context.get_batch(batch_kwargs, suite)\nbatch.head()',
                "outputs": [],
            },
            {
                "cell_type": "markdown",
                "source":
                "## Select the columns you want to scaffold expectations on\n\nSimply uncomment columns that are important. You can select multiple lines and\nuse a jupyter keyboard shortcut to toggle each line: **Linux/Windows**:\n`Ctrl-/`, **macOS**: `Cmd-/`",
                "metadata": {},
            },
            {
                "cell_type": "code",
                "metadata": {},
                "execution_count": None,
                "source":
                "included_columns = [\n    # 'Unnamed: 0',\n    # 'Name',\n    # 'PClass',\n    # 'Age',\n    # 'Sex',\n    # 'Survived',\n    # 'SexCode'\n]",
                "outputs": [],
            },
            {
                "cell_type": "markdown",
                "source":
                "## Run the scaffolder\n\nThis is highly configurable depending on your goals. You can include or exclude\ncolumns, and include or exclude expectation types (when applicable). [The \nExpectation Glossary](http://docs.greatexpectations.io/en/latest/expectation_glossary.html) \ncontains a list of possible expectations.\n\nNote that the profiler is not very smart, so it does it's best to decide on\napplicability.\n\n**To get to a production grade suite, you should [edit this \nsuite](http://docs.greatexpectations.io/en/latest/command_line.html#great-expectations-suite-edit) \nafter this scaffold gets you close to what you want.**",
                "metadata": {},
            },
            {
                "cell_type": "code",
                "metadata": {},
                "execution_count": None,
                "source":
                '# Wipe the suite clean to prevent unwanted expectations on the batch\nsuite = context.create_expectation_suite(expectation_suite_name, overwrite_existing=True)\nbatch = context.get_batch(batch_kwargs, suite)\n\nscaffold_config = {\n    "included_columns": included_columns,\n    # "excluded_columns": [],\n    # "included_expectations": [],\n    # "excluded_expectations": [],\n}\nsuite, evr = BasicSuiteBuilderProfiler().profile(batch, profiler_configuration=scaffold_config)',
                "outputs": [],
            },
            {
                "cell_type": "markdown",
                "source":
                "## Save & review the scaffolded Expectation Suite\n\nLet's save the scaffolded expectation suite as a JSON file in the \n`great_expectations/expectations` directory of your project and rebuild the Data\n Docs site to make reviewing the scaffolded suite easy.",
                "metadata": {},
            },
            {
                "cell_type": "code",
                "metadata": {},
                "execution_count": None,
                "source":
                'context.save_expectation_suite(suite, expectation_suite_name)\n\n# Let\'s make a simple sortable timestamp. Note this could come from your pipeline runner.\nrun_id = datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")\n\nresults = context.run_validation_operator("action_list_operator", assets_to_validate=[batch], run_id=run_id)\nexpectation_suite_identifier = list(results["details"].keys())[0]\nvalidation_result_identifier = ValidationResultIdentifier(\n    expectation_suite_identifier=expectation_suite_identifier,\n    batch_identifier=batch.batch_kwargs.to_id(),\n    run_id=run_id\n)\ncontext.build_data_docs()\ncontext.open_data_docs(validation_result_identifier)',
                "outputs": [],
            },
            {
                "cell_type": "markdown",
                "source":
                "## Next steps\nAfter you are happy with this scaffolded Expectation Suite in Data Docs you \nshould edit this suite to make finer grained adjustments to the expectations. \nThis is be done by running `great_expectations suite edit my_suite`.",
                "metadata": {},
            },
        ],
    }
    # The minor format version depends on the installed nbformat, so it is
    # excluded from the comparison on both sides.
    del expected["nbformat_minor"]
    del obs["nbformat_minor"]

    # Compare cell by cell first so a failure points at the offending cell,
    # then compare the notebooks as a whole.
    for obs_cell, expected_cell in zip(obs["cells"], expected["cells"]):
        # Drop the generated cell id (absent on older nbformat releases).
        obs_cell.pop("id", None)
        assert obs_cell == expected_cell
    assert obs == expected
def test_render_snapshot_test(titanic_data_context):
    """Compare the rendered scaffold notebook with a hand-maintained snapshot.

    NOTE: when updating this snapshot, be sure to keep the dynamic
    ``csv_path`` in the second cell; its value comes from the pytest fixture.

    Per-cell ``id`` fields are removed from the rendered notebook before the
    comparison: nbformat 5.1+ attaches a randomly generated id to every new
    cell, which a static snapshot cannot contain.
    """
    batch_kwargs = titanic_data_context.build_batch_kwargs(
        "mydatasource", "mygenerator", "Titanic")
    csv_path = batch_kwargs["path"]
    suite_name = "my_suite"
    suite = titanic_data_context.create_expectation_suite(suite_name)
    renderer = SuiteScaffoldNotebookRenderer(titanic_data_context, suite,
                                             batch_kwargs)
    obs = renderer.render(None)
    assert isinstance(obs, nbformat.NotebookNode)
    expected = {
        "nbformat": 4,
        "nbformat_minor": 4,
        "metadata": {},
        "cells": [
            {
                "cell_type": "markdown",
                "source": """# Scaffold a new Expectation Suite (Experimental)
This process helps you avoid writing lots of boilerplate when authoring suites by allowing you to select columns you care about and letting a profiler write some candidate expectations for you to adjust.

**Expectation Suite Name**: `my_suite`

We'd love it if you **reach out to us on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)""",
                "metadata": {},
            },
            {
                "cell_type": "code",
                "metadata": {},
                "execution_count": None,
                "source":
                'import datetime\nimport great_expectations as ge\nimport great_expectations.jupyter_ux\nfrom great_expectations.profile import BasicSuiteBuilderProfiler\nfrom great_expectations.data_context.types.resource_identifiers import (\n    ValidationResultIdentifier,\n)\n\ncontext = ge.data_context.DataContext()\n\nexpectation_suite_name = "my_suite"\nsuite = context.create_expectation_suite(\n    expectation_suite_name, overwrite_existing=True\n)\n\nbatch_kwargs = {\n    "path": "'
                + csv_path +
                '",\n    "datasource": "mydatasource",\n    "data_asset_name": "Titanic",\n}\nbatch = context.get_batch(batch_kwargs, suite)\nbatch.head()',
                "outputs": [],
            },
            {
                "cell_type": "markdown",
                "source":
                """## Select the columns you want to scaffold expectations on

Great Expectations will choose which expectations might make sense for a column based on the **data type** and **cardinality** of the data in each selected column.

Simply uncomment columns that are important. You can select multiple lines and
use a jupyter keyboard shortcut to toggle each line: **Linux/Windows**:
`Ctrl-/`, **macOS**: `Cmd-/`""",
                "metadata": {},
            },
            {
                "cell_type": "code",
                "metadata": {},
                "execution_count": None,
                "source":
                "included_columns = [\n    # 'Unnamed: 0',\n    # 'Name',\n    # 'PClass',\n    # 'Age',\n    # 'Sex',\n    # 'Survived',\n    # 'SexCode'\n]",
                "outputs": [],
            },
            {
                "cell_type": "markdown",
                "source": """## Run the scaffolder

The suites generated here are **not meant to be production suites** - they are **scaffolds to build upon**.

**To get to a production grade suite, will definitely want to [edit this
suite](http://docs.greatexpectations.io/en/latest/command_line.html#great-expectations-suite-edit)
after scaffolding gets you close to what you want.**

This is highly configurable depending on your goals. You can include or exclude
columns, and include or exclude expectation types (when applicable). [The
Expectation Glossary](https://docs.greatexpectations.io/en/latest/reference/glossary_of_expectations.html?utm_source=notebook&utm_medium=scaffold_expectations)
contains a list of possible expectations.""",
                "metadata": {},
            },
            {
                "cell_type": "code",
                "metadata": {},
                "execution_count": None,
                "source":
                '# Wipe the suite clean to prevent unwanted expectations on the batch\nsuite = context.create_expectation_suite(expectation_suite_name, overwrite_existing=True)\nbatch = context.get_batch(batch_kwargs, suite)\n\nscaffold_config = {\n    "included_columns": included_columns,\n    # "excluded_columns": [],\n    # "included_expectations": [],\n    # "excluded_expectations": [],\n}\nsuite, evr = BasicSuiteBuilderProfiler().profile(batch, profiler_configuration=scaffold_config)',
                "outputs": [],
            },
            {
                "cell_type": "markdown",
                "source":
                "## Save & review the scaffolded Expectation Suite\n\nLet's save the scaffolded expectation suite as a JSON file in the\n`great_expectations/expectations` directory of your project and rebuild the Data\n Docs site to make reviewing the scaffolded suite easy.",
                "metadata": {},
            },
            {
                "cell_type": "code",
                "metadata": {},
                "execution_count": None,
                "source":
                'context.save_expectation_suite(suite, expectation_suite_name)\n\n"""\nLet\'s create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\narguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\nrunner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\nNote - any provided datetime will be assumed to be a UTC time. If no instantiation arguments are given, run_name will\nbe None and run_time will default to the current UTC datetime.\n"""\n\nrun_id = {\n  "run_name": "some_string_that_uniquely_identifies_this_run",  # insert your own run_name here\n  "run_time": datetime.datetime.now(datetime.timezone.utc)\n}\n\nresults = context.run_validation_operator("action_list_operator", assets_to_validate=[batch], run_id=run_id)\nvalidation_result_identifier = results.list_validation_result_identifiers()[0]\ncontext.build_data_docs()\ncontext.open_data_docs(validation_result_identifier)',
                "outputs": [],
            },
            {
                "cell_type": "markdown",
                "source":
                "## Next steps\nAfter you are happy with this scaffolded Expectation Suite in Data Docs you\nshould edit this suite to make finer grained adjustments to the expectations.\nThis is be done by running `great_expectations suite edit my_suite`.",
                "metadata": {},
            },
        ],
    }
    # The minor format version depends on the installed nbformat, so it is
    # excluded from the comparison on both sides.
    del expected["nbformat_minor"]
    del obs["nbformat_minor"]

    # Compare cell by cell first so a failure points at the offending cell,
    # then compare the notebooks as a whole.
    for obs_cell, expected_cell in zip(obs["cells"], expected["cells"]):
        # Drop the generated cell id (absent on older nbformat releases).
        obs_cell.pop("id", None)
        assert obs_cell == expected_cell
    assert obs == expected