# Imports consolidated for readability. These snippets target the legacy
# (pre-0.13) Great Expectations API, so exact module paths vary by release;
# CLI helpers (toolkit, cli_message, msg_prompt_*) live in
# great_expectations.cli and are omitted. Older snippets below also reference
# BatchId and BatchFingerprint from the same era of the API.
import os
import textwrap
from functools import partial

import click
import pytest

import great_expectations.exceptions as ge_exceptions
from great_expectations.core import ExpectationSuite
from great_expectations.core.batch import Batch, BatchMarkers
from great_expectations.core.util import nested_update
from great_expectations.data_context import DataContext
from great_expectations.data_context.types.base import DatasourceConfigSchema
from great_expectations.dataset import PandasDataset
from great_expectations.datasource import PandasDatasource
from great_expectations.datasource.types import PathBatchKwargs
from great_expectations.exceptions import BatchKwargsError
from great_expectations.validator.validator import Validator
from kedro.io import AbstractDataSet  # Kedro's dataset base class (one snippet)


def test_read_limit(test_folder_connection_path_csv):
    datasource = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )

    batch_kwargs = PathBatchKwargs(
        {
            "path": os.path.join(str(test_folder_connection_path_csv), "test.csv"),
            # "reader_options": {"sep": ",", "header": 0, "index_col": 0},
            "reader_options": {"sep": ","},
        }
    )
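    # process_batch_parameters(limit=1) returns {"reader_options": {"nrows": 1}}
    # (see test_process_batch_parameters below); nested_update merges it into
    # batch_kwargs without clobbering the existing "reader_options" entry.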
    nested_update(batch_kwargs, datasource.process_batch_parameters(limit=1))

    batch = datasource.get_batch(batch_kwargs=batch_kwargs)
    assert isinstance(batch, Batch)
    dataset = batch.data
    assert (dataset["col_1"] == [1]).all()
    assert len(dataset) == 1

    # A datasource should always return an object with a typed batch_id
    assert isinstance(batch.batch_kwargs, PathBatchKwargs)
    assert isinstance(batch.batch_markers, BatchMarkers)


def test_standalone_pandas_datasource(test_folder_connection_path):
    datasource = PandasDatasource('PandasCSV',
                                  base_directory=test_folder_connection_path)

    assert datasource.get_available_data_asset_names() == {"default": ["test"]}
    manual_batch_kwargs = PathBatchKwargs(
        path=os.path.join(str(test_folder_connection_path), "test.csv"))

    # Get the default (subdir_path) generator
    generator = datasource.get_generator()
    auto_batch_kwargs = generator.yield_batch_kwargs("test")

    assert manual_batch_kwargs["path"] == auto_batch_kwargs["path"]

    # Include some extra kwargs...
    # Note that we are using get_data_asset NOT get_batch here, since we are standalone (no batch concept)
    dataset = datasource.get_data_asset("test",
                                        generator_name="default",
                                        batch_kwargs=auto_batch_kwargs,
                                        sep=",",
                                        header=0,
                                        index_col=0)
    assert isinstance(dataset, PandasDataset)
    assert (dataset["col_1"] == [1, 2, 3, 4, 5]).all()

    # A datasource should always return an object with a typed batch_id
    assert isinstance(dataset.batch_kwargs, PathBatchKwargs)
    assert isinstance(dataset.batch_id, BatchId)
    assert isinstance(dataset.batch_fingerprint, BatchFingerprint)

def test_process_batch_parameters():
    batch_kwargs = PandasDatasource("test").process_batch_parameters(limit=1)
    assert batch_kwargs == {"reader_options": {"nrows": 1}}

    batch_kwargs = PandasDatasource("test").process_batch_parameters(
        dataset_options={"caching": False})
    assert batch_kwargs == {"dataset_options": {"caching": False}}


Example #4

def test_read_limit(test_folder_connection_path):
    datasource = PandasDatasource('PandasCSV',
                                  base_directory=test_folder_connection_path)
    dataset = datasource.get_data_asset(
        "test",
        generator_name="default",
        batch_kwargs=PathBatchKwargs(
            {
                "path": os.path.join(str(test_folder_connection_path), "test.csv"),
                "limit": 1,
            }
        ),
        reader_options={"sep": ",", "header": 0, "index_col": 0},
    )
    assert isinstance(dataset, PandasDataset)
    assert (dataset["col_1"] == [1]).all()
    assert len(dataset) == 1

    # A datasource should always return an object with a typed batch_id
    assert isinstance(dataset.batch_kwargs, PathBatchKwargs)
    assert isinstance(dataset.batch_id, BatchId)
    assert isinstance(dataset.batch_fingerprint, BatchFingerprint)


Example #5

def _add_pandas_datasource(datasource_name: str, dataset: AbstractDataSet,
                           ge_context: DataContext) -> str:
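    # Register the parent directory of a Kedro dataset's file as a
    # SubdirReaderBatchKwargsGenerator-backed PandasDatasource on the
    # Great Expectations DataContext.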
    from great_expectations.datasource import PandasDatasource

    path = str(dataset._filepath.parent)

    if path.startswith("./"):
        path = path[2:]

    configuration = PandasDatasource.build_configuration(
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": os.path.join("..", path),
            }
        })

    configuration["class_name"] = "PandasDatasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        # Use !s: the ":s" format spec raises TypeError for a dict of errors.
        raise ge_exceptions.GreatExpectationsError(
            "Invalid Datasource configuration: {!s}".format(errors)
        )

    ge_context.add_datasource(name=datasource_name, **configuration)
    return datasource_name


Example #6

def _add_pandas_datasource(context):
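    # Prompt for a base directory, derive a default datasource name of
    # "<dirname>__dir", and register the directory as a PandasDatasource.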
    path = click.prompt(
        msg_prompt_filesys_enter_base_path,
        # default='/data/',
        type=click.Path(exists=True,
                        file_okay=False,
                        dir_okay=True,
                        readable=True),
        show_default=True)
    if path.startswith("./"):
        path = path[2:]

    if path.endswith("/"):
        basenamepath = path[:-1]
    else:
        basenamepath = path

    default_data_source_name = os.path.basename(basenamepath) + "__dir"
    data_source_name = click.prompt(msg_prompt_datasource_name,
                                    default=default_data_source_name,
                                    show_default=True)

    configuration = PandasDatasource.build_configuration(
        base_directory=os.path.join("..", path))
    context.add_datasource(name=data_source_name,
                           class_name='PandasDatasource',
                           **configuration)
    return data_source_name


def test_pandas_datasource_processes_dataset_options(test_folder_connection_path):
    datasource = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
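    # "dataset_options" entries are passed through to the resulting dataset;
    # here they disable result caching.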
    batch_kwargs["dataset_options"] = {"caching": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch,
                          ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False


def test_standalone_pandas_datasource(test_folder_connection_path):
    datasource = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )

    assert datasource.get_available_data_asset_names() == {
        "subdir_reader": {
            "names": [("test", "file")],
            "is_complete_list": True
        }
    }
    manual_batch_kwargs = PathBatchKwargs(
        path=os.path.join(str(test_folder_connection_path), "test.csv"))

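    # get_batch_kwargs_generator is the newer accessor for what older snippets
    # above fetch via the deprecated get_generator.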
    generator = datasource.get_batch_kwargs_generator("subdir_reader")
    auto_batch_kwargs = generator.yield_batch_kwargs("test")

    assert manual_batch_kwargs["path"] == auto_batch_kwargs["path"]

    # Include some extra kwargs...
    auto_batch_kwargs.update(
        {"reader_options": {"sep": ",", "header": 0, "index_col": 0}}
    )
    batch = datasource.get_batch(batch_kwargs=auto_batch_kwargs)
    assert isinstance(batch, Batch)
    dataset = batch.data
    assert (dataset["col_1"] == [1, 2, 3, 4, 5]).all()
    assert len(dataset) == 5

    # A datasource should always return an object with a typed batch_id
    assert isinstance(batch.batch_kwargs, PathBatchKwargs)
    assert isinstance(batch.batch_markers, BatchMarkers)


def test_invalid_reader_pandas_datasource(tmp_path_factory):
    basepath = str(
        tmp_path_factory.mktemp("test_invalid_reader_pandas_datasource"))
    datasource = PandasDatasource(
        "mypandassource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": basepath,
            }
        },
    )

    with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
              "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized"
                )
            }
        )
    # The assertion must live outside the `with` block: inside it, the raised
    # exception skips the assert, so the check never runs.
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized"
                ),
                "reader_method": "blarg",
            }
        )
    assert "Unknown reader method: blarg" in exc.value.message

    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method": "read_csv",
            "reader_options": {"header": 0},
        }
    )
    assert batch.data["a"][0] == 1


def test_standalone_pandas_datasource(test_folder_connection_path):
    datasource = PandasDatasource('PandasCSV', base_directory=test_folder_connection_path)

    assert datasource.get_available_data_asset_names() == {"default": {"test"}}
    manual_batch_kwargs = datasource.build_batch_kwargs(os.path.join(str(test_folder_connection_path), "test.csv"))

    # Get the default (subdir_path) generator
    generator = datasource.get_generator()
    auto_batch_kwargs = generator.yield_batch_kwargs("test")

    assert manual_batch_kwargs["path"] == auto_batch_kwargs["path"]

    # Include some extra kwargs...
    dataset = datasource.get_batch("test", batch_kwargs=auto_batch_kwargs, sep=",", header=0, index_col=0)
    assert isinstance(dataset, PandasDataset)
    assert (dataset["col_1"] == [1, 2, 3, 4, 5]).all()


def test_invalid_reader_pandas_datasource(tmp_path_factory):
    basepath = str(
        tmp_path_factory.mktemp("test_invalid_reader_pandas_datasource"))
    datasource = PandasDatasource('mypandassource', base_directory=basepath)

    with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
              "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            "idonotlooklikeacsvbutiam.notrecognized",
            expectation_suite_name="default",
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized"
                )
            },
        )
    # As above, the assertion belongs outside the `with` block, and the raised
    # exception is reached through exc.value.
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            "idonotlooklikeacsvbutiam.notrecognized",
            expectation_suite_name="default",
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized"
                )
            },
            reader_method="blarg",
        )
    assert "Unknown reader method: blarg" in exc.value.message

    dataset = datasource.get_batch(
        "idonotlooklikeacsvbutiam.notrecognized",
        expectation_suite_name="default",
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
        },
        reader_method="csv",
        header=0,
    )
    assert dataset["a"][0] == 1


def test_infer_default_options_partial_functions(reader_fn):
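    # `reader_fn` presumably comes from a parametrized fixture of pandas reader
    # functions (e.g. pd.read_csv); wrapping one in functools.partial should
    # not change the inferred default options.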
    datasource = PandasDatasource()
    reader_fn_partial = partial(reader_fn)
    assert datasource._infer_default_options(
        reader_fn_partial, {}
    ) == datasource._infer_default_options(reader_fn, {})


def basic_pandas_datasource():
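    # Presumably a pytest fixture in the original suite; shown here as a plain
    # factory function returning a minimally configured PandasDatasource.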
    return PandasDatasource("basic_pandas_datasource")


Example #15

def _add_pandas_datasource(
    context, passthrough_generator_only=True, prompt_for_datasource_name=True
):
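    # CLI flow: either register a pass-through "files_datasource" (default) or
    # prompt for a base directory, wire up a SubdirReaderBatchKwargsGenerator,
    # validate the configuration, confirm with the user, and persist it.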
    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "pandas"},
        success=True,
    )

    if passthrough_generator_only:
        datasource_name = "files_datasource"
        configuration = PandasDatasource.build_configuration()

    else:
        path = click.prompt(
            msg_prompt_filesys_enter_base_path,
            type=click.Path(exists=True, file_okay=False),
        )

        if path.startswith("./"):
            path = path[2:]

        if path.endswith("/"):
            basenamepath = path[:-1]
        else:
            basenamepath = path

        datasource_name = os.path.basename(basenamepath) + "__dir"
        if prompt_for_datasource_name:
            datasource_name = click.prompt(
                msg_prompt_datasource_name, default=datasource_name
            )

        configuration = PandasDatasource.build_configuration(
            batch_kwargs_generators={
                "subdir_reader": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    "base_directory": os.path.join("..", path),
                }
            }
        )

        configuration["class_name"] = "PandasDatasource"
        configuration["module_name"] = "great_expectations.datasource"
        errors = DatasourceConfigSchema().validate(configuration)
        if len(errors) != 0:
            # Use !s: the ":s" format spec raises TypeError for a dict of errors.
            raise ge_exceptions.GreatExpectationsError(
                "Invalid Datasource configuration: {!s}".format(errors)
            )

    cli_message(
        """
Great Expectations will now add a new Datasource '{:s}' to your deployment, by adding this entry to your great_expectations.yml:

{:s}
""".format(
            datasource_name,
            textwrap.indent(toolkit.yaml.dump({datasource_name: configuration}), "  "),
        )
    )

    toolkit.confirm_proceed_or_exit(
        continuation_message="Okay, exiting now. To learn more about adding datasources, run great_expectations "
        "datasource --help or visit https://docs.greatexpectations.io/"
    )

    context.add_datasource(name=datasource_name, **configuration)
    return datasource_name