Code example #1
def initialized_sqlite_project(mock_webbrowser, caplog, tmp_path_factory,
                               titanic_sqlite_db_file):
    """This is an initialized project through the CLI."""
    project_dir = str(tmp_path_factory.mktemp("my_rad_project"))

    engine = create_engine("sqlite:///{}".format(titanic_sqlite_db_file))

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="Y\n2\n5\ntitanic\n{}\n1\nwarning\n\n".format(engine.url),
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
        project_dir) in mock_webbrowser.call_args[0][0]

    assert_no_logging_messages_or_tracebacks(caplog, result)

    context = DataContext(os.path.join(project_dir, DataContext.GE_DIR))
    assert isinstance(context, DataContext)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [{
        "class_name": "SqlAlchemyDatasource",
        "name": "titanic"
    }]
    return project_dir
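The `mock_webbrowser` fixture used in this and several later examples is not shown on this page. A minimal sketch of how such a fixture could be wired up, assuming the CLI opens Data Docs through the standard-library `webbrowser` module; the patch target is an assumption, not the project's actual conftest:

from unittest import mock

import pytest


@pytest.fixture
def mock_webbrowser():
    # Patch webbrowser.open so `init` never launches a real browser;
    # the tests then assert on call_count and call_args instead.
    with mock.patch("webbrowser.open") as mocked_open:
        yield mocked_open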
Code example #2
def initialized_sqlite_project(
    mock_webbrowser, caplog, tmp_path_factory, titanic_sqlite_db_file, sa
):
    """This is an initialized project through the CLI."""
    project_dir = str(tmp_path_factory.mktemp("my_rad_project"))

    engine = sa.create_engine(
        "sqlite:///{}".format(titanic_sqlite_db_file), pool_recycle=3600
    )

    inspector = sa.inspect(engine)

    # get the default schema and table for testing
    schemas = inspector.get_schema_names()
    default_schema = schemas[0]

    tables = inspector.get_table_names(schema=default_schema)
    default_table = tables[0]

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="\n\n2\n6\ntitanic\n{url}\n\n\n1\n{schema}\n{table}\nwarning\n\n\n\n".format(
            url=engine.url, schema=default_schema, table=default_table
        ),
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )

    assert_no_logging_messages_or_tracebacks(caplog, result)

    context = DataContext(os.path.join(project_dir, DataContext.GE_DIR))
    assert isinstance(context, DataContext)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [
        {
            "class_name": "SqlAlchemyDatasource",
            "name": "titanic",
            "module_name": "great_expectations.datasource",
            "credentials": {"url": str(engine.url)},
            "data_asset_type": {
                "class_name": "SqlAlchemyDataset",
                "module_name": "great_expectations.dataset",
            },
        }
    ]
    return project_dir
Code example #3
def test_cli_datasource_new_connection_string(
    mock_subprocess, empty_data_context, empty_sqlite_db, caplog, monkeypatch
):
    root_dir = empty_data_context.root_directory
    context = DataContext(root_dir)
    assert context.list_datasources() == []

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource new",
        input=f"2\n6\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert "What data would you like Great Expectations to connect to?" in stdout

    assert result.exit_code == 0

    uncommitted_dir = os.path.join(root_dir, context.GE_UNCOMMITTED_DIR)
    expected_notebook = os.path.join(uncommitted_dir, "datasource_new.ipynb")
    assert os.path.isfile(expected_notebook)
    mock_subprocess.assert_called_once_with(["jupyter", "notebook", expected_notebook])

    # Run notebook
    with open(expected_notebook) as f:
        nb = nbformat.read(f, as_version=4)

    # mock the user adding a connection string into the notebook by overwriting the right cell
    assert "connection_string" in nb["cells"][5]["source"]
    nb["cells"][5]["source"] = 'connection_string = "sqlite://"'
    ep = ExecutePreprocessor(timeout=60, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    del context
    context = DataContext(root_dir)
    assert context.list_datasources() == [
        {
            "class_name": "SimpleSqlalchemyDatasource",
            "connection_string": "sqlite://",
            "introspection": {
                "whole_table": {"data_asset_name_suffix": "__whole_table"}
            },
            "module_name": "great_expectations.datasource",
            "name": "my_datasource",
        }
    ]

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #4
def test_init_on_existing_project_with_multiple_datasources_exist_do_nothing(
    mock_webbrowser,
    caplog,
    initialized_sqlite_project,
    titanic_sqlite_db,
    empty_sqlite_db,
):
    project_dir = initialized_sqlite_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    context = DataContext(ge_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource_and_credentials_to_context(
        context, datasource_name, empty_sqlite_db
    )
    assert len(context.list_datasources()) == 2

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["init", "-d", project_dir], input="n\n", catch_exceptions=False,
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 0

    assert "Error: invalid input" not in stdout

    assert "Always know what to expect from your data" in stdout
    assert "This looks like an existing project that" in stdout
    assert "appears complete" in stdout
    assert "Would you like to build & view this project's Data Docs" in stdout

    assert_no_logging_messages_or_tracebacks(caplog, result)
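The `_add_datasource_and_credentials_to_context` helper used above (and in several examples below) is not shown on this page. A plausible sketch, reverse-engineered from the datasource config these tests assert on; the `initialize=False` flag and the config-variable substitution are assumptions:

def _add_datasource_and_credentials_to_context(context, datasource_name, sqlite_db):
    # Store the connection string in uncommitted/config_variables.yml ...
    credentials = {"url": str(sqlite_db.engine.url)}
    context.save_config_variable(datasource_name, credentials)

    # ... and reference it from the datasource config via substitution.
    context.add_datasource(
        datasource_name,
        initialize=False,
        module_name="great_expectations.datasource",
        class_name="SqlAlchemyDatasource",
        data_asset_type={"class_name": "SqlAlchemyDataset"},
        credentials="${" + datasource_name + "}",
    )
    return context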
Code example #5
def test_cli_datasource_list(empty_data_context, empty_sqlite_db, caplog):
    """Test datasource listing on an empty project, then after adding a single datasource."""
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(cli, ["datasource", "list", "-d", project_root_dir],
                           catch_exceptions=False)

    stdout = result.output.strip()
    assert "[]" in stdout
    assert context.list_datasources() == []

    datasource_name = "wow_a_datasource"
    _add_datasource_and_credentials_to_context(context, datasource_name,
                                               empty_sqlite_db)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(cli, ["datasource", "list", "-d", project_root_dir],
                           catch_exceptions=False)
    stdout = result.output.strip()
    if PY2:
        # Python 2 dicts are unordered, so check each key/value pair separately
        assert "'name': 'wow_a_datasource'" in stdout
        assert "'class_name': u'SqlAlchemyDatasource'" in stdout
        assert 60 <= len(stdout) <= 75
    else:
        assert (
            "[{'name': 'wow_a_datasource', 'class_name': 'SqlAlchemyDatasource'}]"
            in stdout)

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #6
def test_cli_datasource_delete_on_project_with_one_datasource_declining_prompt_does_not_delete(
    caplog,
    monkeypatch,
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    context = titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled
    assert "my_datasource" in [ds["name"] for ds in context.list_datasources()]
    assert len(context.list_datasources()) == 1

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource delete my_datasource",
        input="n\n",
        catch_exceptions=False,
    )

    stdout = result.output
    assert result.exit_code == 0
    assert "Using v3 (Batch Request) API" in stdout
    assert "Datasource `my_datasource` was not deleted." in stdout

    # reload context from disk to see if the datasource was in fact deleted
    root_directory = context.root_directory
    del context
    context = DataContext(root_directory)
    assert len(context.list_datasources()) == 1
    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #7
def test_cli_datasource_delete_on_project_with_one_datasource_assume_yes_flag(
    caplog,
    monkeypatch,
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    context = titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled
    assert "my_datasource" in [ds["name"] for ds in context.list_datasources()]
    assert len(context.list_datasources()) == 1

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api --assume-yes datasource delete my_datasource",
        catch_exceptions=False,
    )

    stdout = result.output
    assert result.exit_code == 0

    assert "Would you like to proceed? [Y/n]:" not in stdout
    # This assertion is extra assurance since this test is too permissive if we change the confirmation message
    assert "[Y/n]" not in stdout

    assert "Using v3 (Batch Request) API" in stdout
    assert "Datasource deleted successfully." in stdout

    # reload context from disk to see if the datasource was in fact deleted
    root_directory = context.root_directory
    del context
    context = DataContext(root_directory)
    assert len(context.list_datasources()) == 0
    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #8
def test_cli_datasource_new(caplog, empty_data_context, filesystem_csv_2):
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    assert context.list_datasources() == []

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["datasource", "new", "-d", project_root_dir],
        input="1\n1\n%s\nmynewsource\n" % str(filesystem_csv_2),
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert "What data would you like Great Expectations to connect to?" in stdout
    assert "What are you processing your files with?" in stdout
    assert "Give your new data source a short name." in stdout
    assert "A new datasource 'mynewsource' was added to your project." in stdout

    assert result.exit_code == 0

    config_path = os.path.join(project_root_dir, DataContext.GE_YML)
    with open(config_path) as f:
        config = yaml.load(f)
    datasources = config["datasources"]
    assert "mynewsource" in datasources.keys()
    data_source_class = datasources["mynewsource"]["data_asset_type"]["class_name"]
    assert data_source_class == "PandasDataset"
    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #9
def test_init_on_existing_project_with_no_datasources_should_continue_init_flow_and_add_one(
    mock_webbrowser,
    capsys,
    caplog,
    initialized_project,
):
    project_dir = initialized_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    # mangle the project to remove all traces of a suite and validations
    _remove_all_datasources(ge_dir)
    os.remove(os.path.join(ge_dir, "expectations", "Titanic", "warning.json"))
    uncommitted_dir = os.path.join(ge_dir, "uncommitted")
    validations_dir = os.path.join(uncommitted_dir, "validations")
    shutil.rmtree(validations_dir)
    os.mkdir(validations_dir)
    shutil.rmtree(os.path.join(uncommitted_dir, "data_docs", "local_site"))
    context = DataContext(ge_dir)
    assert not context.list_expectation_suites()

    csv_path = os.path.join(project_dir, "data", "Titanic.csv")
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="1\n1\n{}\nmy_suite\n\n".format(csv_path,
                                              catch_exceptions=False),
        catch_exceptions=False,
    )
    assert mock_webbrowser.call_count == 1
    assert "{}/great_expectations/uncommitted/data_docs/local_site/validations/my_suite/".format(
        project_dir) in mock_webbrowser.call_args[0][0]
    stdout = result.stdout

    assert result.exit_code == 0

    assert "Error: invalid input" not in stdout
    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Enter the path (relative or absolute) of a data file" in stdout
    assert "Name the new expectation suite [Titanic.warning]:" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        in stdout)
    assert "A new Expectation suite 'my_suite' was added to your project" in stdout
    assert "Great Expectations is now set up." in stdout

    config = _load_config_file(os.path.join(ge_dir, DataContext.GE_YML))
    assert "files_datasource" in config["datasources"].keys()

    context = DataContext(ge_dir)
    assert context.list_datasources() == [{
        "name": "files_datasource",
        "class_name": "PandasDatasource"
    }]
    assert context.list_expectation_suites()[0].expectation_suite_name == "my_suite"
    assert len(context.list_expectation_suites()) == 1

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #10
def select_datasource(context: DataContext,
                      datasource_name: str = None) -> Datasource:
    """Select a datasource interactively."""
    # TODO consolidate all the myriad CLI tests into this
    data_source = None

    if datasource_name is None:
        data_sources = sorted(context.list_datasources(),
                              key=lambda x: x["name"])
        if len(data_sources) == 0:
            cli_message(
                "<red>No datasources found in the context. To add a datasource, run `great_expectations datasource new`</red>"
            )
        elif len(data_sources) == 1:
            datasource_name = data_sources[0]["name"]
        else:
            choices = "\n".join([
                "    {}. {}".format(i, data_source["name"])
                for i, data_source in enumerate(data_sources, 1)
            ])
            option_selection = click.prompt(
                "Select a datasource" + "\n" + choices + "\n",
                type=click.Choice(
                    [str(i) for i, data_source in enumerate(data_sources, 1)]),
                show_choices=False,
            )
            datasource_name = data_sources[int(option_selection) - 1]["name"]

    if datasource_name is not None:
        data_source = context.get_datasource(datasource_name)

    return data_source
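A short usage sketch for `select_datasource`; the project path and the surrounding error handling are illustrative only:

context = DataContext("/path/to/project/great_expectations")
# Prompts only when more than one datasource exists; returns None (after
# printing a message) when the context has no datasources at all.
datasource = select_datasource(context)
if datasource is None:
    raise SystemExit(1)
print(datasource.name)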
Code example #11
def test_init_on_existing_project_with_multiple_datasources_exist_do_nothing(
    mock_webbrowser, caplog, initialized_project, filesystem_csv_2
):
    project_dir = initialized_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    context = DataContext(ge_dir)
    context.add_datasource(
        "another_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
    )

    assert len(context.list_datasources()) == 2

    runner = CliRunner(mix_stderr=False)
    with pytest.warns(
        UserWarning, match="Warning. An existing `great_expectations.yml` was found"
    ):
        result = runner.invoke(
            cli, ["init", "-d", project_dir], input="n\n", catch_exceptions=False,
        )
    stdout = result.stdout

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 0
    assert "Error: invalid input" not in stdout

    assert "Always know what to expect from your data" in stdout
    assert "This looks like an existing project that" in stdout
    assert "appears complete" in stdout
    assert "Would you like to build & view this project's Data Docs" in stdout

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #12
def initialized_project(mock_webbrowser, tmp_path_factory):
    """This is an initialized project through the CLI."""
    project_dir = str(tmp_path_factory.mktemp("my_rad_project"))
    data_folder_path = os.path.join(project_dir, "data")
    os.makedirs(data_folder_path)
    data_path = os.path.join(project_dir, "data/Titanic.csv")
    fixture_path = file_relative_path(__file__, "../test_sets/Titanic.csv")
    shutil.copy(fixture_path, data_path)
    runner = CliRunner(mix_stderr=False)
    _ = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="\n\n1\n1\n{}\n\n\n\n2\n{}\n\n\n\n".format(
            data_folder_path, data_path),
        catch_exceptions=False,
    )
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/Titanic/warning/"
        .format(project_dir) in mock_webbrowser.call_args[0][0])

    context = DataContext(os.path.join(project_dir, DataContext.GE_DIR))
    assert isinstance(context, DataContext)
    assert len(context.list_datasources()) == 1
    return project_dir
Code example #13
def test_cli_datasource_profile_with_valid_data_asset_arg(
        caplog, empty_data_context, filesystem_csv_2):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    context = empty_data_context

    project_root_dir = context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "my_datasource",
            "--data-assets",
            "f1",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "Profiling 'my_datasource'" in stdout
    assert "The following Data Docs sites were built:\n- local_site:" in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (suites[0].expectation_suite_name ==
            "my_datasource.subdir_reader.f1.BasicDatasetProfiler")

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    suite_name = validation.meta["expectation_suite_name"]
    assert suite_name == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    assert validation.success is False
    assert len(validation.results) == 8

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)
Code example #14
def test_cli_datasource_profile_with_skip_prompt_flag(caplog,
                                                      empty_data_context,
                                                      filesystem_csv_2):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )

    not_so_empty_data_context = empty_data_context

    project_root_dir = not_so_empty_data_context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["datasource", "profile", "-d", project_root_dir, "--no-view", "-y"],
        input="Y\n",
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    stdout = result.stdout

    assert (
        "Profiling 'my_datasource' will create expectations and documentation."
        in stdout)
    assert "Would you like to profile 'my_datasource'" not in stdout
    assert (
        "Great Expectations is building Data Docs from the data you just profiled!"
        in stdout)

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (suites[0].expectation_suite_name ==
            "my_datasource.subdir_reader.f1.BasicDatasetProfiler")

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert (validation.meta["expectation_suite_name"] ==
            "my_datasource.subdir_reader.f1.BasicDatasetProfiler")
    assert validation.success is False
    assert len(validation.results) == 8

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)
Code example #15
def generate_basic_suites(
    kedro_context: KedroContext,
    ge_context: DataContext,
    empty=False,
    replace=False,
    batch_kwargs=None,
):
    from great_expectations.profile import BasicSuiteBuilderProfiler

    if batch_kwargs is None:
        batch_kwargs = {}
    catalog = kedro_context.catalog

    existing_datasource_names = {
        ds["name"]
        for ds in ge_context.list_datasources()
    }
    for dataset_name in catalog.list():
        suite_name = generate_basic_suite_name(dataset_name)
        if suite_name in ge_context.list_expectation_suite_names() and not replace:
            continue

        datasource_name = generate_datasource_name(dataset_name)
        if datasource_name not in existing_datasource_names:
            continue

        dataset = catalog._get_dataset(dataset_name)
        data_path = str(dataset._filepath)
        dataasset_name, _ = os.path.splitext(os.path.basename(data_path))

        suite_batch_kwargs = {
            "datasource": datasource_name,
            "data_asset_name": dataasset_name,
            "path": data_path,
            "reader_options": dataset._load_args,
        }

        batch_kwargs_generator_name = "path"
        profiler_configuration = "demo"
        additional_batch_kwargs = batch_kwargs

        run_id = datetime.datetime.now(
            datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")

        if empty:
            create_empty_suite(ge_context, suite_name, suite_batch_kwargs)
        else:
            ge_context.profile_data_asset(
                datasource_name,
                batch_kwargs_generator_name=batch_kwargs_generator_name,
                data_asset_name=dataasset_name,
                batch_kwargs=suite_batch_kwargs,
                profiler=BasicSuiteBuilderProfiler,
                profiler_configuration=profiler_configuration,
                expectation_suite_name=suite_name,
                run_id=run_id,
                additional_batch_kwargs=additional_batch_kwargs,
            )
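`generate_basic_suite_name` and `generate_datasource_name` are imported from elsewhere in kedro-great and are not shown on this page. Hypothetical sketches, purely to illustrate the expected contract; the real naming formats may differ:

def generate_datasource_name(dataset_name: str) -> str:
    # One datasource per catalog entry, named after the dataset.
    return f"{dataset_name}_datasource"


def generate_basic_suite_name(dataset_name: str) -> str:
    return f"{dataset_name}.basic"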
Code example #16
def test_cli_datasource_profile_with_valid_data_asset_arg(
    empty_data_context, titanic_sqlite_db, caplog
):
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource_and_credentials_to_context(
        context, datasource_name, titanic_sqlite_db
    )

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            datasource_name,
            "--data-assets",
            "main.titanic",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        catch_exceptions=False,
    )

    stdout = result.stdout
    assert result.exit_code == 0
    assert "Profiling '{}'".format(datasource_name) in stdout
    assert "The following Data Docs sites were built:\n" in stdout
    assert "local_site:" in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "wow_a_datasource.default.main.titanic.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert (
        validation.meta["expectation_suite_name"]
        == "wow_a_datasource.default.main.titanic.BasicDatasetProfiler"
    )
    assert validation.success is False
    assert len(validation.results) == 51

    assert "Preparing column 1 of 7" in caplog.messages[0]
    assert len(caplog.messages) == 10
    assert_no_tracebacks(result)
Code example #17
def test_cli_datasource_list(caplog, empty_data_context, filesystem_csv_2):
    """Test datasource listing on an empty project, then after adding a single datasource."""
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(cli, ["datasource", "list", "-d", project_root_dir],
                           catch_exceptions=False)

    stdout = result.output.strip()
    assert "[]" in stdout
    assert context.list_datasources() == []

    context.add_datasource(
        "wow_a_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )

    assert context.list_datasources() == [{
        "name": "wow_a_datasource",
        "class_name": "PandasDatasource"
    }]

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(cli, ["datasource", "list", "-d", project_root_dir],
                           catch_exceptions=False)

    stdout = result.output.strip()
    if PY2:
        # Python 2 dicts are unordered, so check each key/value pair separately
        assert "'name': 'wow_a_datasource'" in stdout
        assert "'class_name': 'PandasDatasource'" in stdout
        assert 60 <= len(stdout) <= 70
    else:
        assert (
            "[{'name': 'wow_a_datasource', 'class_name': 'PandasDatasource'}]"
            in stdout)
    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #18
def test_init_on_existing_project_with_no_datasources_should_continue_init_flow_and_add_one(
    mock_webbrowser,
    caplog,
    initialized_sqlite_project,
    titanic_sqlite_db_file,
):
    project_dir = initialized_sqlite_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    _remove_all_datasources(ge_dir)
    os.remove(os.path.join(ge_dir, "expectations", "warning.json"))
    context = DataContext(ge_dir)
    assert not context.list_expectation_suites()

    runner = CliRunner(mix_stderr=False)
    with pytest.warns(
            UserWarning,
            match="Warning. An existing `great_expectations.yml` was found"):
        result = runner.invoke(
            cli,
            ["init", "-d", project_dir],
            input="2\n5\nsqlite\nsqlite:///{}\n1\nmy_suite\n\n".format(
                titanic_sqlite_db_file),
            catch_exceptions=False,
        )
    stdout = result.stdout

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert "{}/great_expectations/uncommitted/data_docs/local_site/validations/my_suite/".format(
        project_dir) in mock_webbrowser.call_args[0][0]

    assert "Error: invalid input" not in stdout
    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert (
        "Next, we will configure database credentials and store them in the `sqlite` section"
        in stdout)
    assert "What is the url/connection string for the sqlalchemy connection?" in stdout
    assert "Which table would you like to use?" in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "A new Expectation suite 'my_suite' was added to your project" in stdout
    assert "This looks like an existing project that" not in stdout

    config = _load_config_file(os.path.join(ge_dir, DataContext.GE_YML))
    assert "sqlite" in config["datasources"].keys()

    context = DataContext(ge_dir)
    assert context.list_datasources() == [{
        "class_name": "SqlAlchemyDatasource",
        "name": "sqlite"
    }]
    assert context.list_expectation_suites()[0].expectation_suite_name == "my_suite"
    assert len(context.list_expectation_suites()) == 1

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #19
def test_cli_datasource_profile_with_datasource_arg_and_generator_name_arg(
    empty_data_context, titanic_sqlite_db, caplog
):
    """
    Here we are verifying that the generator_name argument is passed
    through to the methods down the stack.

    We use a datasource with two generators. This way we can check that the
    name of the expectation suite created by the profiler corresponds to
    the name of the data asset listed by the generator that we told the profiler
    to use.

    The logic of processing this argument is tested in tests/profile.
    """
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource__with_two_generators_and_credentials_to_context(
        context, datasource_name, titanic_sqlite_db
    )

    second_generator_name = "second_generator"

    runner = CliRunner()
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            datasource_name,
            "--generator-name",
            second_generator_name,
            "-d",
            project_root_dir,
            "--no-view",
        ],
        input="Y\n",
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Profiling '{}'".format(datasource_name) in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "wow_a_datasource.second_generator.asset_one.BasicDatasetProfiler"
    )

    assert "Preparing column 1 of 7" in caplog.messages[0]
    assert len(caplog.messages) == 7
    assert_no_tracebacks(result)
Code example #20
def test_cli_datasource_delete_on_project_with_one_datasource_declining_prompt_does_not_delete(
    mock_emit,
    caplog,
    monkeypatch,
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    context = titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled
    assert "my_datasource" in [ds["name"] for ds in context.list_datasources()]
    assert len(context.list_datasources()) == 1

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource delete my_datasource",
        input="n\n",
        catch_exceptions=False,
    )

    stdout = result.output
    assert result.exit_code == 0
    assert "Using v3 (Batch Request) API" in stdout
    assert "Datasource `my_datasource` was not deleted." in stdout

    expected_call_args_list = [
        mock.call({
            "event_payload": {},
            "event": "data_context.__init__",
            "success": True
        }),
        mock.call({
            "event": "cli.datasource.delete.begin",
            "event_payload": {
                "api_version": "v3"
            },
            "success": True,
        }),
        mock.call({
            "event": "cli.datasource.delete.end",
            "event_payload": {
                "cancelled": True,
                "api_version": "v3"
            },
            "success": True,
        }),
    ]

    assert mock_emit.call_args_list == expected_call_args_list
    assert mock_emit.call_count == len(expected_call_args_list)

    # reload context from disk to see if the datasource was in fact deleted
    root_directory = context.root_directory
    del context
    context = DataContext(root_directory)
    assert len(context.list_datasources()) == 1
    assert_no_logging_messages_or_tracebacks(caplog, result)
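The `mock_emit` fixture intercepts usage-statistics messages so the test can compare `mock_emit.call_args_list` against the expected events. A hedged sketch of how it might be provided; the exact patch target is an assumption:

from unittest import mock

import pytest


@pytest.fixture
def mock_emit():
    # Capture emitted usage-statistics events instead of sending them.
    with mock.patch(
        "great_expectations.core.usage_statistics.usage_statistics.UsageStatisticsHandler.emit"
    ) as mocked_emit:
        yield mocked_emit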
Code example #21
def test_cli_datasource_list(empty_data_context, empty_sqlite_db, caplog,
                             monkeypatch):
    """Test an empty project and after adding a single datasource."""
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        [
            "--v3-api",
            "datasource",
            "list",
        ],
        catch_exceptions=False,
    )

    stdout = result.output.strip()
    assert "No Datasources found" in stdout
    assert context.list_datasources() == []

    datasource_name = "wow_a_datasource"
    _add_datasource_and_credentials_to_context(context, datasource_name,
                                               empty_sqlite_db)

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        ["--v3-api", "datasource", "list"],
        catch_exceptions=False,
    )
    url = str(empty_sqlite_db.engine.url)
    expected_output = """\
1 Datasource found:

 - name: wow_a_datasource
   module_name: great_expectations.datasource
   class_name: SqlAlchemyDatasource
   batch_kwargs_generators:
     default:
       class_name: TableBatchKwargsGenerator
   credentials:
     url: {}
   data_asset_type:
     class_name: SqlAlchemyDataset
     module_name: None
""".format(url).strip()
    stdout = result.output.strip()

    assert stdout == expected_output

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #22
def _remove_all_datasources(ge_dir):
    config_path = os.path.join(ge_dir, DataContext.GE_YML)

    config = _load_config_file(config_path)
    config["datasources"] = {}

    with open(config_path, "w") as f:
        yaml.dump(config, f)

    context = DataContext(ge_dir)
    assert context.list_datasources() == []
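`_load_config_file`, used here and in examples #9 and #18, is likewise not shown. A minimal sketch, assuming `yaml` is the same loader object used elsewhere on this page:

import os


def _load_config_file(config_path):
    assert os.path.isfile(config_path), "Config file is missing!"
    with open(config_path) as f:
        config = yaml.load(f)
    assert config is not None
    return config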
Code example #23
def _run_cli_datasource_new_path_test(context: DataContext, args: str,
                                      invocation_input: str,
                                      base_path: str) -> None:
    root_dir = context.root_directory
    runner = CliRunner(mix_stderr=True)
    result = runner.invoke(
        cli,
        args=args,
        input=invocation_input,
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    _run_notebook(context)

    # Renew a context since we executed a notebook in a different process
    del context
    context = DataContext(root_dir)
    assert context.list_datasources() == [{
        "name": "my_datasource",
        "class_name": "Datasource",
        "module_name": "great_expectations.datasource",
        "execution_engine": {
            "module_name": "great_expectations.execution_engine",
            "class_name": "PandasExecutionEngine",
        },
        "data_connectors": {
            "default_inferred_data_connector_name": {
                "default_regex": {
                    "group_names": ["data_asset_name"],
                    "pattern": "(.*)",
                },
                "module_name": "great_expectations.datasource.data_connector",
                "base_directory": f"../../{base_path}",
                "class_name": "InferredAssetFilesystemDataConnector",
            },
            "default_runtime_data_connector_name": {
                "assets": {
                    "my_runtime_asset_name": {
                        "batch_identifiers": ["runtime_batch_identifier_name"],
                        "class_name":
                        "Asset",
                        "module_name":
                        "great_expectations.datasource.data_connector.asset",
                    }
                },
                "class_name": "RuntimeDataConnector",
                "module_name": "great_expectations.datasource.data_connector",
            },
        },
    }]
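The `_run_notebook` helper called above is not shown; a plausible sketch that mirrors the nbformat / ExecutePreprocessor pattern from code example #3 (the notebook filename is an assumption):

import os

import nbformat
from nbconvert.preprocessors import ExecutePreprocessor


def _run_notebook(context: DataContext) -> None:
    uncommitted_dir = os.path.join(context.root_directory, context.GE_UNCOMMITTED_DIR)
    notebook_path = os.path.join(uncommitted_dir, "datasource_new.ipynb")
    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)
    # Execute the notebook in-process, as example #3 does after editing a cell.
    ep = ExecutePreprocessor(timeout=60, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})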
Code example #24
def test_cli_datasource_profile_with_invalid_data_asset_arg_answering_no(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )

    not_so_empty_data_context = empty_data_context

    project_root_dir = not_so_empty_data_context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "my_datasource",
            "--data-assets",
            "bad-bad-asset",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        input="2\n",
        catch_exceptions=False,
    )

    stdout = result.stdout
    assert (
        "Some of the data assets you specified were not found: bad-bad-asset" in stdout
    )
    assert "Choose how to proceed" in stdout
    assert "Skipping profiling for now." in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 0
    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #25
def test_cli_datasource_list_on_project_with_no_datasources(
        caplog, monkeypatch, empty_data_context, filesystem_csv_2):
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource list",
        catch_exceptions=False,
    )

    stdout = result.stdout.strip()
    assert "No Datasources found" in stdout
    assert context.list_datasources() == []
Code example #26
def test_cli_datasource_profile_with_invalid_data_asset_arg_answering_no(
    empty_data_context, titanic_sqlite_db, caplog
):
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource_and_credentials_to_context(
        context, datasource_name, titanic_sqlite_db
    )

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            datasource_name,
            "--data-assets",
            "bad-bad-asset",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        input="2\n",
        catch_exceptions=False,
    )

    stdout = result.stdout
    assert (
        "Some of the data assets you specified were not found: bad-bad-asset" in stdout
    )
    assert "Choose how to proceed" in stdout
    assert "Skipping profiling for now." in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 0

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #27
def test_cli_datasource_new_connection_string(
    empty_data_context, empty_sqlite_db, caplog, monkeypatch
):
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    assert context.list_datasources() == []

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        [
            "--v3-api",
            "datasource",
            "new",
        ],
        input=f"2\n6\nmynewsource\n{empty_sqlite_db.url}\n\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert "What data would you like Great Expectations to connect to?" in stdout
    assert "Give your new Datasource a short name." in stdout
    assert (
        "Next, we will configure database credentials and store them in the `mynewsource` section"
        in stdout)
    assert "What is the url/connection string for the sqlalchemy connection?" in stdout
    assert "Attempting to connect to your database. This may take a moment" in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "A new datasource 'mynewsource' was added to your project." in stdout

    assert result.exit_code == 0

    config_path = os.path.join(project_root_dir, DataContext.GE_YML)
    with open(config_path) as f:
        config = yaml.load(f)
    datasources = config["datasources"]
    assert "mynewsource" in datasources.keys()
    data_source_class = datasources["mynewsource"]["data_asset_type"][
        "class_name"]
    assert data_source_class == "SqlAlchemyDataset"

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #28
def test_cli_datasource_list(empty_data_context, empty_sqlite_db, caplog, monkeypatch):
    """Test an empty project and after adding a single datasource."""
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource list",
        catch_exceptions=False,
    )

    stdout = result.stdout.strip()
    assert "No Datasources found" in stdout
    assert context.list_datasources() == []

    datasource_name = "wow_a_datasource"
    _add_datasource_and_credentials_to_context(
        context, datasource_name, empty_sqlite_db
    )

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        ["--v3-api", "datasource", "list"],
        catch_exceptions=False,
    )
    expected_output = """\
Using v3 (Batch Request) API\x1b[0m
1 Datasource found:

 - name: wow_a_datasource
   class_name: SqlAlchemyDatasource
""".strip()
    stdout = result.stdout.strip()

    assert stdout == expected_output

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code example #29
File: datasource.py Project: anishshah97/kedro-great
def generate_datasources(kedro_context: KedroContext,
                         ge_context: DataContext) -> List[str]:
    catalog = kedro_context.catalog
    new_datasources = []
    existing_datasource_names = {
        ds["name"]
        for ds in ge_context.list_datasources()
    }
    for dataset_name in catalog.list():
        datasource_name = generate_datasource_name(dataset_name)
        if datasource_name in existing_datasource_names:
            continue

        dataset = catalog._get_dataset(dataset_name)
        datasource_type = identify_dataset_type(dataset)

        if datasource_type == DatasourceTypes.PANDAS:
            name = _add_pandas_datasource(datasource_name, dataset, ge_context)
            new_datasources.append(name)
        elif datasource_type == DatasourceTypes.SPARK:
            name = _add_spark_datasource(datasource_name, dataset, ge_context)
            new_datasources.append(name)
    return new_datasources
Code example #30
def test_cli_init_on_new_project(mock_webbrowser, caplog, tmp_path_factory,
                                 titanic_sqlite_db_file):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    ge_dir = os.path.join(project_dir, "great_expectations")

    database_path = os.path.join(project_dir, "titanic.db")
    shutil.copy(titanic_sqlite_db_file, database_path)
    engine = create_engine("sqlite:///{}".format(database_path))

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="Y\n2\n5\ntitanic\n{}\n1\nwarning\n\n".format(
            engine.url, catch_exceptions=False),
    )
    stdout = result.output
    assert len(stdout) < 3000, "CLI output is unreasonably long."

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Which database backend are you using" in stdout
    assert "Give your new data source a short name" in stdout
    assert "What is the url/connection string for the sqlalchemy connection" in stdout
    assert "Attempting to connect to your database." in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "Which table would you like to use?" in stdout
    assert "Name the new expectation suite [main.titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout)
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "A new Expectation suite 'warning' was added to your project" in stdout
    assert "Great Expectations is now set up" in stdout

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [{
        "class_name": "SqlAlchemyDatasource",
        "name": "titanic"
    }]

    first_suite = context.list_expectation_suites()[0]
    suite = context.get_expectation_suite(first_suite.expectation_suite_name)
    assert len(suite.expectations) == 13

    assert os.path.isdir(ge_dir)
    config_path = os.path.join(project_dir,
                               "great_expectations/great_expectations.yml")
    assert os.path.isfile(config_path)

    with open(config_path) as f:
        config = yaml.load(f)
    data_source_class = config["datasources"]["titanic"]["data_asset_type"][
        "class_name"]
    assert data_source_class == "SqlAlchemyDataset"

    obs_tree = gen_directory_tree_str(ge_dir)

    # Instead of monkey patching datetime, just regex out the time directories
    date_safe_obs_tree = re.sub(r"\d*T\d*\.\d*Z", "9999.9999", obs_tree)
    # Instead of monkey patching guids, just regex out the guids
    guid_safe_obs_tree = re.sub(r"[a-z0-9]{32}(?=\.(json|html))",
                                "foobarbazguid", date_safe_obs_tree)
    assert (guid_safe_obs_tree == """\
great_expectations/
    .gitignore
    great_expectations.yml
    expectations/
        warning.json
    notebooks/
        pandas/
            validation_playground.ipynb
        spark/
            validation_playground.ipynb
        sql/
            validation_playground.ipynb
    plugins/
        custom_data_docs/
            renderers/
            styles/
                data_docs_custom_styles.css
            views/
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                index.html
                expectations/
                    warning.html
                static/
                    fonts/
                        HKGrotesk/
                            HKGrotesk-Bold.otf
                            HKGrotesk-BoldItalic.otf
                            HKGrotesk-Italic.otf
                            HKGrotesk-Light.otf
                            HKGrotesk-LightItalic.otf
                            HKGrotesk-Medium.otf
                            HKGrotesk-MediumItalic.otf
                            HKGrotesk-Regular.otf
                            HKGrotesk-SemiBold.otf
                            HKGrotesk-SemiBoldItalic.otf
                    images/
                        favicon.ico
                        glossary_scroller.gif
                        iterative-dev-loop.png
                        logo-long-vector.svg
                        logo-long.png
                        short-logo-vector.svg
                        short-logo.png
                        validation_failed_unexpected_values.gif
                    styles/
                        data_docs_custom_styles_template.css
                        data_docs_default_styles.css
                validations/
                    warning/
                        9999.9999/
                            foobarbazguid.html
        validations/
            warning/
                9999.9999/
                    foobarbazguid.json
""")

    assert_no_logging_messages_or_tracebacks(caplog, result)

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
        project_dir) in mock_webbrowser.call_args[0][0]