# Imports these tests rely on; the exact module paths below are assumptions
# and may differ between great_expectations versions.
import os

import pytest

from great_expectations.dataset import PandasDataset
from great_expectations.datasource import PandasDatasource
from great_expectations.datasource.types import (
    BatchFingerprint,
    BatchId,
    PathBatchKwargs,
)
from great_expectations.exceptions import BatchKwargsError


def test_standalone_pandas_datasource(test_folder_connection_path):
    datasource = PandasDatasource('PandasCSV',
                                  base_directory=test_folder_connection_path)

    assert datasource.get_available_data_asset_names() == {"default": ["test"]}
    manual_batch_kwargs = PathBatchKwargs(
        path=os.path.join(str(test_folder_connection_path), "test.csv"))

    # Get the default (subdir_path) generator
    generator = datasource.get_generator()
    auto_batch_kwargs = generator.yield_batch_kwargs("test")

    assert manual_batch_kwargs["path"] == auto_batch_kwargs["path"]

    # Include some extra kwargs...
    # Note that we are using get_data_asset NOT get_batch here, since we are standalone (no batch concept)
    dataset = datasource.get_data_asset("test",
                                        generator_name="default",
                                        batch_kwargs=auto_batch_kwargs,
                                        sep=",",
                                        header=0,
                                        index_col=0)
    assert isinstance(dataset, PandasDataset)
    assert (dataset["col_1"] == [1, 2, 3, 4, 5]).all()

    # A datasource should always return an object with a typed batch_id
    assert isinstance(dataset.batch_kwargs, PathBatchKwargs)
    assert isinstance(dataset.batch_id, BatchId)
    assert isinstance(dataset.batch_fingerprint, BatchFingerprint)
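
# test_read_limit: the "limit" batch kwarg should restrict how many rows are
# read from the CSV into the resulting PandasDataset.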
def test_read_limit(test_folder_connection_path):
    datasource = PandasDatasource('PandasCSV',
                                  base_directory=test_folder_connection_path)
    dataset = datasource.get_data_asset(
        "test",
        generator_name="default",
        batch_kwargs=PathBatchKwargs({
            "path": os.path.join(str(test_folder_connection_path), "test.csv"),
            "limit": 1
        }),
        reader_options={
            'sep': ",",
            'header': 0,
            'index_col': 0
        })
    assert isinstance(dataset, PandasDataset)
    assert (dataset["col_1"] == [1]).all()
    assert len(dataset) == 1

    # A datasource should always return an object with a typed batch_id
    assert isinstance(dataset.batch_kwargs, PathBatchKwargs)
    assert isinstance(dataset.batch_id, BatchId)
    assert isinstance(dataset.batch_fingerprint, BatchFingerprint)
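
# A path whose extension does not map to a known pandas reader should raise
# BatchKwargsError unless an explicit reader_method is supplied.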
def test_invalid_reader_pandas_datasource(tmp_path_factory):
    basepath = str(
        tmp_path_factory.mktemp("test_invalid_reader_pandas_datasource"))
    datasource = PandasDatasource('mypandassource', base_directory=basepath)

    with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
              "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_data_asset(
            "idonotlooklikeacsvbutiam.notrecognized",
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized")
            })
    # Check the message after the `with` block; an assert placed after the
    # raising call inside the block would never execute.
    assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_data_asset(
            "idonotlooklikeacsvbutiam.notrecognized",
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized")
            },
            reader_method="blarg")
    assert "Unknown reader method: blarg" in exc.value.message

    dataset = datasource.get_data_asset(
        "idonotlooklikeacsvbutiam.notrecognized",
        batch_kwargs={
            "path": os.path.join(
                basepath, "idonotlooklikeacsvbutiam.notrecognized")
        },
        reader_method="csv",
        header=0)
    assert dataset["a"][0] == 1