def test_clip_dask_mask(geodf_points):  # noqa: F811
    """Clipping with a dask-backed mask must raise ``NotImplementedError``."""
    partitioned = dask_geopandas.from_geopandas(geodf_points, npartitions=2)
    dask_mask = dask_geopandas.from_geopandas(geodf_points.iloc[:1], npartitions=1)
    expected_message = r"Mask cannot be a Dask GeoDataFrame or GeoSeries."

    with pytest.raises(NotImplementedError, match=expected_message):
        dask_geopandas.clip(partitioned, dask_mask)
def test_operator_methods(geoseries_polygons, geoseries_points, meth):
    """Binary method ``meth`` gives the same values through dask as in geopandas."""
    expected = getattr(geoseries_polygons, meth)(geoseries_points)

    left = dask_geopandas.from_geopandas(geoseries_polygons, npartitions=2)
    right = dask_geopandas.from_geopandas(geoseries_points, npartitions=2)
    lazy_result = getattr(left, meth)(right)

    # The lazy result is a plain dask Series, not a GeoSeries.
    assert isinstance(lazy_result, dd.Series)
    assert all(expected == lazy_result.compute())
def test_geom_equals_exact(geoseries_polygons, geoseries_points):
    """``geom_equals_exact`` with a tolerance matches the geopandas result."""
    expected = geoseries_polygons.geom_equals_exact(geoseries_points, tolerance=2)

    left = dask_geopandas.from_geopandas(geoseries_polygons, npartitions=2)
    right = dask_geopandas.from_geopandas(geoseries_points, npartitions=2)
    lazy_result = left.geom_equals_exact(right, tolerance=2)

    assert isinstance(lazy_result, dd.Series)
    assert expected.equals(lazy_result.compute())
def test_sjoin_dask_geopandas():
    """Spatial join gives the geopandas result for every dask/geopandas input mix."""
    join_kwargs = {"predicate": "within", "how": "inner"}

    df_points = geopandas.read_file(geopandas.datasets.get_path("naturalearth_cities"))
    ddf_points = dask_geopandas.from_geopandas(df_points, npartitions=4)

    df_polygons = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres")
    )
    ddf_polygons = dask_geopandas.from_geopandas(df_polygons, npartitions=4)

    expected = geopandas.sjoin(df_points, df_polygons, **join_kwargs).sort_index()

    input_pairs = (
        (ddf_points, df_polygons),  # dask / geopandas
        (df_points, ddf_polygons),  # geopandas / dask
        (ddf_points, ddf_polygons),  # dask / dask
    )
    for left, right in input_pairs:
        joined = dask_geopandas.sjoin(left, right, **join_kwargs)
        assert_geodataframe_equal(expected, joined.compute().sort_index())

    # with spatial_partitions
    ddf_points.calculate_spatial_partitions()
    ddf_polygons.calculate_spatial_partitions()
    joined = dask_geopandas.sjoin(ddf_points, ddf_polygons, **join_kwargs)
    assert joined.spatial_partitions is not None
    assert_geodataframe_equal(expected, joined.compute().sort_index())

    # check warning
    with pytest.warns(FutureWarning, match="The `op` parameter is deprecated"):
        dask_geopandas.sjoin(df_points, ddf_polygons, op="within", how="inner")
def test_geoseries_apply(geoseries_polygons):
    """``apply`` with an area lambda matches the ``area`` property of the input."""
    # https://github.com/jsignell/dask-geopandas/issues/18
    dask_series = dask_geopandas.from_geopandas(geoseries_polygons, npartitions=2)
    lazy_areas = dask_series.apply(lambda geom: geom.area, meta=pd.Series(dtype=float))

    pd.testing.assert_series_equal(lazy_areas.compute(), geoseries_polygons.area)
def test_set_crs_sets_spatial_partition_crs(geodf_points):
    """After ``set_crs`` the frame and its spatial partitions share the same CRS."""
    partitioned = dask_geopandas.from_geopandas(geodf_points, npartitions=2)
    partitioned.calculate_spatial_partitions()

    with_crs = partitioned.set_crs("epsg:4326")

    assert with_crs.crs == with_crs.spatial_partitions.crs
def test_parquet_roundtrip(tmp_path):
    """Write a dask GeoDataFrame to Parquet and read it back unchanged.

    Also checks that the dataset written by dask can be read by plain
    geopandas, both as a whole directory and as a single part file.
    """
    # basic roundtrip
    df = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)

    basedir = tmp_path / "dataset"
    ddf.to_parquet(basedir)

    # each partition (4) is written as parquet file
    paths = list(basedir.glob("*.parquet"))
    assert len(paths) == 4

    # reading back gives identical GeoDataFrame
    result = dask_geopandas.read_parquet(basedir)
    # Check the frame that was read back; ``ddf`` has 4 partitions by
    # construction, so asserting on it would verify nothing about the read.
    assert result.npartitions == 4
    assert_geodataframe_equal(result.compute(), df)

    # the written dataset is also readable by plain geopandas
    result_gpd = geopandas.read_parquet(basedir)
    # the dataset written by dask has "__null_dask_index__" index column name
    result_gpd.index.name = None
    assert_geodataframe_equal(result_gpd, df)

    # presumably the first of the 4 partitions holds the first 45 rows of
    # the dataset -- TODO confirm against the dataset length
    result_part0 = geopandas.read_parquet(basedir / "part.0.parquet")
    result_part0.index.name = None
    assert_geodataframe_equal(result_part0, df.iloc[:45])
def test_geohash_range(geoseries_points):
    """``geohash`` raises ``ValueError`` for a precision outside the accepted range."""
    dask_series = from_geopandas(geoseries_points, npartitions=1)

    with pytest.raises(ValueError):
        dask_series.geohash(precision=0, as_string=False)
        # NOTE(review): this call only runs if the call above does not raise,
        # so it looks unreachable in a passing test -- confirm whether it was
        # meant to sit in its own ``pytest.raises`` block or outside of it.
        dask_series.geohash(precision=12, as_string=False)
def test_roundtrip_geometry_column_name(tmp_path):
    """Parquet roundtrips keep a geometry column that is not named ``geometry``."""
    # basic roundtrip with different geometry column name
    source = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
    source = source.rename_geometry("geom")

    def assert_matches_source(loaded):
        # Checks shared by both roundtrips below.
        assert isinstance(loaded, dask_geopandas.GeoDataFrame)
        assert loaded.geometry.name == "geom"
        assert loaded.crs == source.crs
        assert loaded.spatial_partitions is not None
        assert_geodataframe_equal(loaded.compute(), source)

    # geopandas -> dask-geopandas roundtrip
    single_file = tmp_path / "data.parquet"
    source.to_parquet(single_file)
    assert_matches_source(dask_geopandas.read_parquet(single_file))

    # dask-geopandas -> dask-geopandas roundtrip
    partitioned = dask_geopandas.from_geopandas(source, npartitions=4)
    assert partitioned.geometry.name == "geom"
    dataset_dir = tmp_path / "dataset"
    partitioned.to_parquet(dataset_dir)
    assert_matches_source(dask_geopandas.read_parquet(dataset_dir))
def test_geoseries_crs(geoseries_points_crs):
    """CRS is exposed, guarded against silent override, and settable on a GeoSeries."""
    source = geoseries_points_crs
    source_crs = source.crs
    source_name = source.name
    dask_series = dask_geopandas.from_geopandas(source, npartitions=2)

    # The CRS is visible on the collection, on a partition and after compute.
    assert dask_series.crs == source_crs
    assert dask_series.partitions[1].crs == source_crs
    assert dask_series.compute().crs == source_crs

    other_crs = "epsg:4316"
    mismatch_message = r".*already has a CRS which is not equal to the passed CRS.*"
    with pytest.raises(ValueError, match=mismatch_message):
        dask_series.set_crs(other_crs)

    # An explicit override returns a new object and leaves the original as is.
    overridden = dask_series.set_crs(other_crs, allow_override=True)
    assert overridden.crs == other_crs
    assert overridden.name == source_name
    assert overridden.partitions[1].crs == other_crs
    assert dask_series.crs == source_crs

    # Assigning to the ``crs`` attribute changes the object in place.
    dask_series.crs = other_crs
    assert dask_series.crs == other_crs
    assert dask_series.partitions[1].crs == other_crs
    assert dask_series.name == source_name
    assert dask_series.compute().crs == other_crs
def test_parquet_partition_on(tmp_path, write_metadata_file):
    """Write a dataset partitioned on ``continent`` and read it back."""
    source = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
    partitioned = dask_geopandas.from_geopandas(source, npartitions=4)

    # Writing a partitioned dataset based on one of the attribute columns
    dataset_dir = tmp_path / "naturalearth_lowres_by_continent.parquet"
    partitioned.to_parquet(
        dataset_dir,
        partition_on="continent",
        write_metadata_file=write_metadata_file,
    )

    # Check for one of the partitions that the file is present and is correct
    if write_metadata_file:
        n_files = 10  # 8 continents + 2 metadata files
    else:
        n_files = 8
    assert len(list(dataset_dir.iterdir())) == n_files

    africa_dir = dataset_dir / "continent=Africa"
    assert africa_dir.exists()
    result_africa = geopandas.read_parquet(africa_dir)
    is_africa = source["continent"] == "Africa"
    expected_africa = source[is_africa].drop(columns=["continent"])
    result_africa.index.name = None
    assert_geodataframe_equal(result_africa, expected_africa)

    # Check roundtrip
    roundtripped = dask_geopandas.read_parquet(dataset_dir)
    assert roundtripped.npartitions >= 8
    assert roundtripped.spatial_partitions is not None
    # The expected frame carries the partition column as a categorical.
    expected_full = source.copy()
    expected_full["continent"] = expected_full["continent"].astype("category")
    assert_geodataframe_equal(roundtripped.compute(), expected_full, check_like=True)
def test_clip_no_spatial_partitions(geodf_points):  # noqa: F811
    """Clip a dask GeoDataFrame for which no spatial partitions were calculated."""
    dask_obj = dask_geopandas.from_geopandas(geodf_points, npartitions=2)
    # Copy the slice so the column assignment below targets an independent
    # frame instead of a slice of the fixture (avoids chained assignment and
    # the SettingWithCopyWarning it triggers).
    mask = geodf_points.iloc[:1].copy()
    mask["geometry"] = mask["geometry"].buffer(2)
    expected = geodf_points.iloc[:2]
    result = dask_geopandas.clip(dask_obj, mask).compute()
    assert_geodataframe_equal(expected, result)
def test_to_wkb_series(geoseries_points, hex):
    """``to_wkb`` on a dask GeoSeries matches geopandas for the given ``hex`` flag."""
    dask_series = dask_geopandas.from_geopandas(geoseries_points, npartitions=4)

    expected = geoseries_points.to_wkb(hex=hex)
    lazy_wkb = dask_series.to_wkb(hex=hex)

    assert_series_equal(expected, lazy_wkb.compute())
def test_total_bounds_from_partitions(geoseries_polygons):
    """Morton distance is the same before and after spatial partitions are calculated."""
    dask_series = from_geopandas(geoseries_polygons, npartitions=2)
    without_partitions = dask_series.morton_distance().compute()

    dask_series.calculate_spatial_partitions()
    with_partitions = dask_series.morton_distance().compute()

    assert_series_equal(with_partitions, without_partitions)
def test_empty(geoseries_polygons, empty):
    """Morton distance raises ``ValueError`` once ``empty`` is placed in the series."""
    # Overwrite the last element of the fixture series with the ``empty`` value.
    geoseries_polygons.iloc[-1] = empty
    dask_series = from_geopandas(geoseries_polygons, npartitions=2)
    expected_message = "cannot be computed on a GeoSeries with empty"

    with pytest.raises(ValueError, match=expected_message):
        dask_series.morton_distance().compute()
def test_geoseries_unary_union(geoseries_points):
    """``unary_union`` is a lazy ``Scalar`` that computes to the geopandas result."""
    expected = geoseries_points.unary_union
    dask_series = dask_geopandas.from_geopandas(geoseries_points, npartitions=2)

    lazy_union = dask_series.unary_union

    assert isinstance(lazy_union, Scalar)
    assert expected.equals(lazy_union.compute())
def test_set_geometry_property_on_geodf(geodf_points):
    """A renamed column can be made the active geometry with ``set_geometry``."""
    partitioned = dask_geopandas.from_geopandas(geodf_points, npartitions=2)

    renamed = partitioned.rename(columns={"geometry": "foo"})
    computed = renamed.set_geometry("foo").compute()

    assert set(computed.columns) == {"value1", "value2", "foo"}
    # The active geometry must be the renamed column.
    assert all(computed.geometry == computed.foo)
def test_set_index_preserves_class(geodf_points, shuffle_method):
    """``set_index`` keeps every partition, and the whole, a GeoDataFrame."""
    partitioned = dask_geopandas.from_geopandas(geodf_points, npartitions=2)
    reindexed = partitioned.set_index("value1", shuffle=shuffle_method)

    for part in reindexed.partitions:
        computed_part = part.compute()
        assert isinstance(computed_part, geopandas.GeoDataFrame)

    assert isinstance(reindexed.compute(), geopandas.GeoDataFrame)
def test_clip_geoseries(geodf_points):  # noqa: F811
    """Clipping a dask GeoSeries matches ``geopandas.clip`` on the same geometry."""
    dask_obj = dask_geopandas.from_geopandas(geodf_points, npartitions=2)
    dask_obj.calculate_spatial_partitions()
    # Copy the slice so the column assignment below targets an independent
    # frame instead of a slice of the fixture (avoids chained assignment and
    # the SettingWithCopyWarning it triggers).
    mask = geodf_points.iloc[:1].copy()
    mask["geometry"] = mask["geometry"].buffer(2)
    expected = geopandas.clip(geodf_points.geometry, mask)
    result = dask_geopandas.clip(dask_obj.geometry, mask).compute()
    assert_geoseries_equal(expected, result)
def test_to_crs_geodf(geodf_points_crs):
    """``to_crs`` on a dask GeoDataFrame matches ``GeoDataFrame.to_crs``."""
    df = geodf_points_crs
    dask_obj = dask_geopandas.from_geopandas(df, npartitions=2)
    new_crs = "epsg:4316"
    new = dask_obj.to_crs(new_crs)
    assert new.crs == new_crs
    # ``all(frame == other)`` iterates the column labels of the comparison
    # frame, which are always truthy, so it never checked any value. Compare
    # the frames themselves instead.
    assert_geodataframe_equal(new.compute(), df.to_crs(new_crs))
def test_to_wkb(geodf_points_crs, hex):
    """``to_wkb`` on a dask GeoDataFrame with two geometry columns matches geopandas."""
    frame = geodf_points_crs
    # Add a second geometry column next to the points.
    frame["polygons"] = frame.buffer(1)
    partitioned = dask_geopandas.from_geopandas(frame, npartitions=4)

    expected = frame.to_wkb(hex=hex)
    lazy_wkb = partitioned.to_wkb(hex=hex)

    assert_frame_equal(expected, lazy_wkb.compute())
def test_to_crs_geoseries(geoseries_points_crs):
    """``to_crs`` on a dask GeoSeries matches ``GeoSeries.to_crs``."""
    target_crs = "epsg:4316"
    dask_series = dask_geopandas.from_geopandas(geoseries_points_crs, npartitions=2)

    reprojected = dask_series.to_crs(target_crs)

    assert reprojected.crs == target_crs
    matches = reprojected.compute() == geoseries_points_crs.to_crs(target_crs)
    assert all(matches)
def test_propagate_on_set_crs(geodf_points):
    """``set_crs`` carries the spatial partitions over with the new CRS applied."""
    partitioned = dask_geopandas.from_geopandas(geodf_points, npartitions=2)
    partitioned.calculate_spatial_partitions()

    propagated = partitioned.set_crs("epsg:4326").spatial_partitions
    expected = partitioned.spatial_partitions.set_crs("epsg:4326")

    assert_geoseries_equal(propagated, expected)
def test_geoseries_properties(geoseries_polygons, attr):
    """Attribute ``attr`` of a dask GeoSeries computes to the geopandas value."""
    expected = getattr(geoseries_polygons, attr)
    dask_series = dask_geopandas.from_geopandas(geoseries_polygons, npartitions=2)

    # The data must really be split: the first partition is a strict subset.
    first_partition = dask_series.partitions[0]
    assert len(first_partition) < len(geoseries_polygons)
    assert isinstance(dask_series, dask_geopandas.GeoSeries)

    lazy_value = getattr(dask_series, attr)
    # NOTE(review): if ``attr`` ever yields a DataFrame, ``all`` would iterate
    # its column labels rather than its values -- confirm against the values
    # this test is parametrized with.
    assert all(expected == lazy_value.compute())
def test_explode_geoseries():
    """``explode`` on a dask GeoSeries of multipoints matches geopandas."""
    multipoints = [
        MultiPoint([(0, 0), (1, 1)]),
        MultiPoint([(2, 2), (3, 3), (4, 4)]),
    ]
    source = geopandas.GeoSeries(multipoints)
    expected = source.explode()

    dask_series = dask_geopandas.from_geopandas(source, npartitions=2)
    lazy_exploded = dask_series.explode()

    assert isinstance(lazy_exploded, dask_geopandas.GeoSeries)
    assert all(expected == lazy_exploded.compute())
def test_meth_with_args_and_kwargs(geoseries_lines, meth, options):
    """Method ``meth`` called with keyword ``options`` matches the geopandas result."""
    expected = getattr(geoseries_lines, meth)(**options)

    dask_series = dask_geopandas.from_geopandas(geoseries_lines, npartitions=2)
    lazy_result = getattr(dask_series, meth)(**options)

    assert isinstance(lazy_result, dask_geopandas.GeoSeries)
    assert all(expected == lazy_result.compute())
def test_split_out_name(self):
    """``dissolve`` with ``split_out`` works when the geometry column is ``geom``."""
    renamed_world = self.world.rename_geometry("geom")
    expected = renamed_world.dissolve("continent")

    # The geometry is renamed again for the dask input, as a separate frame.
    partitioned = dask_geopandas.from_geopandas(
        self.world.rename_geometry("geom"), npartitions=4
    )
    dissolved = partitioned.dissolve("continent", split_out=4)

    assert dissolved.npartitions == 4
    assert_geodataframe_equal(expected, dissolved.compute(), check_like=True)
def test_explode_geodf():
    """``explode`` on a dask GeoDataFrame of multipoints matches geopandas."""
    s = geopandas.GeoSeries(
        [MultiPoint([(0, 0), (1, 1)]), MultiPoint([(2, 2), (3, 3), (4, 4)])]
    )
    df = geopandas.GeoDataFrame({"col": [1, 2], "geometry": s})
    original = df.explode()
    dask_s = dask_geopandas.from_geopandas(df, npartitions=2)
    daskified = dask_s.explode()
    assert isinstance(daskified, dask_geopandas.GeoDataFrame)
    # ``all(frame == other)`` iterates the column labels of the comparison
    # frame, which are always truthy, so it never checked any value. Compare
    # the frames themselves instead.
    assert_geodataframe_equal(original, daskified.compute())
def test_spatial_partitions_setter(geodf_points):
    """Assigning invalid values to ``spatial_partitions`` is rejected."""
    partitioned = dask_geopandas.from_geopandas(geodf_points, npartitions=2)
    not_a_geoseries = geodf_points
    wrong_length = geodf_points.geometry

    # needs to be a GeoSeries
    with pytest.raises(TypeError):
        partitioned.spatial_partitions = not_a_geoseries

    # wrong length
    with pytest.raises(ValueError):
        partitioned.spatial_partitions = wrong_length
def test_get_coord(coord):
    """Coordinate attribute ``coord`` of a dask GeoSeries of 3D points matches geopandas."""
    xyz_values = [(1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 1, 7)]
    source = geopandas.GeoSeries([Point(*xyz) for xyz in xyz_values])
    dask_series = dask_geopandas.from_geopandas(source, npartitions=2)

    expected = getattr(source, coord)
    lazy_coord = getattr(dask_series, coord)

    assert_series_equal(expected, lazy_coord.compute())