Code Example #1
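All of the examples below are tests of the dask-geopandas Feather reader and writer. The original module header is not shown, so the following imports are reconstructed from the names used in the code (an assumption); tmp_path is the standard pytest temporary-directory fixture:

import dask.dataframe as dd
import geopandas
import pytest

import dask_geopandas
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from pandas.testing import assert_index_equal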
def test_missing_metadata(tmp_path):
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    path = tmp_path / "test.feather"

    # convert to a plain DataFrame with WKB geometry -> the written feather
    # file will carry only pandas metadata, no geo metadata
    df = df.to_wkb()
    df.to_feather(path)

    with pytest.raises(ValueError, match="Missing geo metadata"):
        dask_geopandas.read_feather(path)

    # remove metadata completely
    from pyarrow import feather

    table = feather.read_table(path)
    feather.write_feather(table.replace_schema_metadata(), path)

    with pytest.raises(ValueError, match="Missing geo metadata"):
        dask_geopandas.read_feather(path)
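The geo metadata that read_feather checks for lives in the Arrow schema metadata of the file. A quick way to inspect it (a sketch, assuming a file written by geopandas' to_feather):

from pyarrow import feather

table = feather.read_table("test.feather")
# a file written with geo metadata carries a b"geo" key alongside b"pandas"
print(table.schema.metadata.keys())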
Code Example #2
def test_write_delayed(tmp_path):
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)

    basedir = tmp_path / "dataset"
    dataset = ddf.to_feather(basedir, compute=False)
    dataset.compute()
    result = dask_geopandas.read_feather(basedir)
    assert result.npartitions == 4
    # TODO this reset_index should not be necessary
    result_gpd = result.compute().reset_index(drop=True)
    assert_geodataframe_equal(result_gpd, df)
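Note that to_feather(..., compute=False) only builds the write task graph; nothing hits disk until .compute() is called on the returned object.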
Code Example #3
def test_index(tmp_path):
    # set up dataset
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    # get a meaningful index by spatially shuffling (sets hilbert_distance as index)
    df = dask_geopandas.from_geopandas(
        df, npartitions=2).spatial_shuffle().compute()
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)

    # roundtrip preserves the index by default
    basedir = tmp_path / "dataset"
    ddf.to_feather(basedir)
    result = dask_geopandas.read_feather(basedir)
    assert "hilbert_distance" not in result.columns
    assert result.index.name == "hilbert_distance"
    assert_index_equal(result.index.compute(), df.index)

    # TODO reading without setting the index is not yet implemented
    with pytest.raises(NotImplementedError):
        result = dask_geopandas.read_feather(basedir, index=False)
    # assert "hilbert_distance" in result.columns
    # assert result.index.name is None

    # setting specific columns as the index
    result = dask_geopandas.read_feather(basedir, index="iso_a3")
    assert "iso_a3" not in result.columns
    assert result.index.name == "iso_a3"
    assert_geodataframe_equal(result.compute(), df.set_index("iso_a3"))

    # not writing the index
    basedir = tmp_path / "dataset"
    ddf.to_feather(basedir, write_index=False)
    result = dask_geopandas.read_feather(basedir)
    assert "hilbert_distance" not in result.columns
    assert result.index.name is None
    assert result.index.compute()[0] == 0
Code Example #4
# NOTE: the parametrization of `filter` is not shown in the original; a
# decorator along these lines (assumed) makes the test runnable:
@pytest.mark.parametrize("filter", [[("continent", "=", "Africa")]])
def test_filters(tmp_path, filter):
    # set up dataset
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)
    basedir = tmp_path / "dataset"
    ddf.to_feather(basedir)

    # specifying filters argument
    result = dask_geopandas.read_feather(basedir, filters=filter)
    assert result.npartitions == 4

    result_gpd = result.compute().reset_index(drop=True)
    expected = df[df["continent"] == "Africa"].reset_index(drop=True)
    assert_geodataframe_equal(result_gpd, expected)
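The filters argument takes pyarrow-style predicates, i.e. lists of (column, op, value) tuples such as [("continent", "=", "Africa")]. Rows are filtered at read time while the partition layout is preserved, which is why npartitions is still 4 above.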
Code Example #5
def test_read(tmp_path):
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))

    # write a partitioned dataset with geopandas itself, so the reader is
    # tested independently of dask-geopandas' writer
    basedir = tmp_path / "dataset"
    basedir.mkdir()
    df.iloc[:100].to_feather(basedir / "data.0.feather")
    df.iloc[100:].to_feather(basedir / "data.1.feather")

    result = dask_geopandas.read_feather(basedir)
    assert isinstance(result, dask_geopandas.GeoDataFrame)
    assert result.npartitions == 2
    assert result.crs == df.crs
    assert result.spatial_partitions is not None
    # TODO this reset_index should not be necessary
    result_gpd = result.compute().reset_index(drop=True)
    assert_geodataframe_equal(result_gpd, df)
Code Example #6
def test_roundtrip_s3(s3_resource, s3_storage_options):
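    # s3_resource / s3_storage_options are pytest fixtures defined elsewhere in
    # the suite (presumably a mocked S3 bucket and its connection options)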
    fs, endpoint_url = s3_resource

    # basic roundtrip to S3
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)

    uri = "s3://geopandas-test/dataset.feather"
    ddf.to_feather(uri, storage_options=s3_storage_options)

    # reading back gives an identical GeoDataFrame
    result = dask_geopandas.read_feather(uri,
                                         storage_options=s3_storage_options)
    assert result.npartitions == 4
    assert_geodataframe_equal(result.compute().reset_index(drop=True), df)
    # reading back correctly sets the CRS in meta
    assert result.crs == df.crs
    # reading back also populates the spatial partitioning property
    assert result.spatial_partitions is not None
Code Example #7
def test_roundtrip(tmp_path):
    # basic roundtrip
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)

    basedir = tmp_path / "dataset"
    ddf.to_feather(basedir)

    # reading back gives an identical GeoDataFrame
    result = dask_geopandas.read_feather(basedir)
    assert result.npartitions == 4
    assert result.crs == df.crs
    # TODO this reset_index should not be necessary
    result_gpd = result.compute().reset_index(drop=True)
    assert_geodataframe_equal(result_gpd, df)
    # reading back also populates the spatial partitioning property
    ddf.calculate_spatial_partitions()
    assert_geoseries_equal(result.spatial_partitions,
                           ddf.spatial_partitions.envelope)
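Outside of a test, the roundtrip exercised above boils down to a few lines (a minimal sketch; the "dataset" output path is arbitrary):

import geopandas
import dask_geopandas

df = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
ddf = dask_geopandas.from_geopandas(df, npartitions=4)
ddf.to_feather("dataset")  # writes one feather file per partition into ./dataset
result = dask_geopandas.read_feather("dataset")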
Code Example #8
def test_column_selection_push_down(tmp_path):
    # set up dataset
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)
    basedir = tmp_path / "dataset"
    # TODO awaiting a `to_feather` implementation
    # ddf.to_feather(basedir)
    basedir.mkdir()
    for i, part in enumerate(ddf.partitions):
        part.compute().to_feather(basedir / f"part.{i}.feather")

    ddf = dask_geopandas.read_feather(basedir)

    # selecting columns that include the geometry column still gives a GeoDataFrame
    ddf_subset = ddf[["pop_est", "geometry"]]
    assert type(ddf_subset) is dask_geopandas.GeoDataFrame
    # and also preserves the spatial partitioning information
    assert ddf_subset.spatial_partitions is not None

    # selecting a single non-geometry column on the dataframe should work
    s = ddf["pop_est"]
    assert type(s) is dd.Series
    assert s.max().compute() == df["pop_est"].max()
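Selecting a subset that keeps the geometry column thus preserves both the GeoDataFrame type and the spatial partitioning information, while selecting a single non-geometry column falls back to a plain dask Series.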