# Example 1
def sanitize_yaml_and_save_datasource(
        context: DataContext,
        datasource_yaml: str,
        overwrite_existing: bool = False) -> None:
    """A convenience function used in notebooks to help users save secrets.

    Parses ``datasource_yaml``, moves any inline ``credentials`` into the
    Data Context's config-variables store (so secrets stay out of the main
    project config, replaced by a ``${...}`` substitution placeholder), and
    registers the datasource on ``context``.

    Args:
        context: The DataContext to save the datasource into.
        datasource_yaml: YAML string describing the datasource; must contain
            a ``name`` key.
        overwrite_existing: When False (default), refuse to clobber an
            existing datasource of the same name; print a warning and return.

    Raises:
        TypeError: If ``datasource_yaml`` is not a string.
        ValueError: If the yaml string is empty or missing a ``name`` key.
    """
    # Check the type first so falsy non-strings (None, 0) get the accurate
    # TypeError rather than the misleading "verify the yaml" ValueError.
    if not isinstance(datasource_yaml, str):
        raise TypeError("Please pass in a valid yaml string.")
    if not datasource_yaml:
        raise ValueError("Please verify the yaml and try again.")
    config = yaml.load(datasource_yaml)
    try:
        datasource_name = config.pop("name")
    except KeyError as err:
        # Chain the cause so the traceback shows the original KeyError.
        raise ValueError(
            "The datasource yaml is missing a `name` attribute.") from err
    if not overwrite_existing and check_if_datasource_name_exists(
            context=context, datasource_name=datasource_name):
        print(
            f'**WARNING** A Datasource named "{datasource_name}" already exists in this Data Context. The Datasource has *not* been saved. Please use a different name or set overwrite_existing=True if you want to overwrite!'
        )
        return
    if "credentials" in config.keys():
        # Stash the real credentials in the config-variables store and leave
        # only the substitution placeholder in the datasource config.
        credentials = config["credentials"]
        config["credentials"] = "${" + datasource_name + "}"
        context.save_config_variable(datasource_name, credentials)
    context.add_datasource(name=datasource_name, **config)
# Example 2
def test_suite_edit_with_batch_kwargs_unable_to_load_a_batch_raises_helpful_error(
    mock_webbrowser, mock_subprocess, caplog, empty_data_context
):
    """
    The command should:
    - exit with a clear error message
    - NOT open Data Docs
    - NOT open jupyter
    """
    root = empty_data_context.root_directory

    ctx = DataContext(root)
    ctx.create_expectation_suite("foo")
    ctx.add_datasource("source", class_name="PandasDatasource")

    # batch_kwargs reference a table the Pandas datasource cannot load.
    kwargs_json = '{"table": "fake", "datasource": "source"}'
    invocation = CliRunner(mix_stderr=False).invoke(
        cli,
        ["suite", "edit", "foo", "-d", root, "--batch-kwargs", kwargs_json],
        catch_exceptions=False,
    )

    output = invocation.output
    assert invocation.exit_code == 1
    assert "To continue editing this suite" not in output
    assert "Please check that your batch_kwargs are able to load a batch." in output

    # Neither Data Docs nor jupyter should have been launched.
    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 0

    assert_no_logging_messages_or_tracebacks(caplog, invocation)
# Example 3
def test_init_on_existing_project_with_multiple_datasources_exist_do_nothing(
    mock_webbrowser, caplog, initialized_project, filesystem_csv_2
):
    """Running `init` on a complete project with two datasources is a no-op."""
    project_dir = initialized_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    ctx = DataContext(ge_dir)
    ctx.add_datasource(
        "another_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
    )
    assert len(ctx.list_datasources()) == 2

    with pytest.warns(
        UserWarning, match="Warning. An existing `great_expectations.yml` was found"
    ):
        # Answer "n" to the Data Docs prompt.
        result = CliRunner(mix_stderr=False).invoke(
            cli, ["init", "-d", project_dir], input="n\n", catch_exceptions=False,
        )

    out = result.stdout
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 0
    assert "Error: invalid input" not in out

    for expected_fragment in (
        "Always know what to expect from your data",
        "This looks like an existing project that",
        "appears complete",
        "Would you like to build & view this project's Data Docs",
    ):
        assert expected_fragment in out

    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_get_batch_kwargs_for_specific_dataasset(empty_data_context, filesystem_csv):
    """get_batch_kwargs should resolve datasource and generator from the asset name."""
    root = empty_data_context.root_directory
    ctx = DataContext(root)
    csv_dir = str(filesystem_csv)

    ctx.add_datasource(
        "wow_a_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": csv_dir,
            }
        },
    )

    # Neither datasource nor generator is given — only the asset name.
    actual = get_batch_kwargs(
        ctx,
        datasource_name=None,
        batch_kwargs_generator_name=None,
        data_asset_name="f1",
        additional_batch_kwargs={},
    )

    assert actual == {
        "data_asset_name": "f1",
        "datasource": "wow_a_datasource",
        "path": os.path.join(filesystem_csv, "f1.csv"),
    }
# Example 5
def _add_spark_datasource(datasource_name: str, dataset: AbstractDataSet,
                          ge_context: DataContext) -> str:
    """Register a SparkDFDatasource for ``dataset``'s directory on ``ge_context``.

    Args:
        datasource_name: Name under which to register the datasource.
        dataset: Dataset whose parent directory is used as the generator's
            base directory. NOTE(review): relies on the private ``_filepath``
            attribute of the dataset.
        ge_context: The Great Expectations DataContext to register with.

    Returns:
        The ``datasource_name`` that was registered.

    Raises:
        ge_exceptions.GreatExpectationsError: If the built configuration
            fails schema validation.
    """
    from great_expectations.datasource import SparkDFDatasource

    path = str(dataset._filepath.parent)

    # Normalize a leading "./" so the joined base_directory is clean.
    if path.startswith("./"):
        path = path[2:]

    configuration = SparkDFDatasource.build_configuration(
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                # Paths are resolved relative to the GE project directory.
                "base_directory": os.path.join("..", path),
            }
        })

    configuration["class_name"] = "SparkDFDatasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if errors:
        # BUG FIX: the original used "{0:s}".format(errors), which raises
        # TypeError because `errors` is a dict and dicts do not support the
        # "s" format spec — masking the actual validation failure. Format
        # without a conversion spec instead.
        raise ge_exceptions.GreatExpectationsError(
            "Invalid Datasource configuration: {}".format(errors))

    ge_context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
def test_cli_datasorce_list(caplog, empty_data_context, filesystem_csv_2):
    """Test an empty project and after adding a single datasource."""
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(cli, ["datasource", "list", "-d", project_root_dir],
                           catch_exceptions=False)

    stdout = result.output.strip()
    assert "[]" in stdout
    assert context.list_datasources() == []

    context.add_datasource(
        "wow_a_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )

    assert context.list_datasources() == [{
        "name": "wow_a_datasource",
        "class_name": "PandasDatasource"
    }]

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(cli, ["datasource", "list", "-d", project_root_dir],
                           catch_exceptions=False)

    stdout = result.output.strip()
    if PY2:
        # Legacy Python dict ordering is not deterministic, so check each
        # fragment independently instead of the full repr.
        # BUG FIX: the original `assert (A and B in stdout)` parsed as
        # `A and (B in stdout)`; `A` was a truthy string literal, so the
        # first fragment was never actually checked against stdout.
        assert "'name': 'wow_a_datasource'" in stdout
        assert "'class_name': 'PandasDatasource'" in stdout
        assert 60 <= len(stdout) <= 70
    else:
        assert (
            "[{'name': 'wow_a_datasource', 'class_name': 'PandasDatasource'}]"
            in stdout)
    assert_no_logging_messages_or_tracebacks(caplog, result)
# Example 7
def test_cli_datasource_list(caplog, empty_data_context, filesystem_csv_2):
    """Test an empty project and after adding a single datasource."""
    root = empty_data_context.root_directory
    ctx = DataContext(root)

    first = CliRunner(mix_stderr=False).invoke(
        cli, ["datasource", "list", "-d", root], catch_exceptions=False
    )

    assert "No Datasources found" in first.output.strip()
    assert ctx.list_datasources() == []

    base_directory = str(filesystem_csv_2)

    ctx.add_datasource(
        "wow_a_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": base_directory,
            }
        },
    )

    expected_datasource = {
        "name": "wow_a_datasource",
        "class_name": "PandasDatasource",
        "data_asset_type": {
            "class_name": "PandasDataset",
            "module_name": "great_expectations.dataset",
        },
        "batch_kwargs_generators": {
            "subdir_reader": {
                "base_directory": base_directory,
                "class_name": "SubdirReaderBatchKwargsGenerator",
            }
        },
        "module_name": "great_expectations.datasource",
    }
    assert ctx.list_datasources() == [expected_datasource]

    second = CliRunner(mix_stderr=False).invoke(
        cli, ["datasource", "list", "-d", root], catch_exceptions=False
    )
    # Expected CLI rendering of the single datasource (kept verbatim).
    expected_output = """
1 Datasource found:

 - name: wow_a_datasource
   module_name: great_expectations.datasource
   class_name: PandasDatasource
   batch_kwargs_generators:
     subdir_reader:
       class_name: SubdirReaderBatchKwargsGenerator
       base_directory: {}
   data_asset_type:
     module_name: great_expectations.dataset
     class_name: PandasDataset""".format(base_directory).strip()
    assert second.output.strip() == expected_output
    assert_no_logging_messages_or_tracebacks(caplog, second)