Example #1
0
def _intersects_pyg(geom, gdf, sindex, tolerance=1e-9):
    """Run ``_intersects_gdf_pyg`` on ``geom`` buffered by ``tolerance``.

    Args:
        geom (pygeos.Geometry): geometry to test
        gdf: features passed through to ``_intersects_gdf_pyg``
        sindex: spatial index passed through to ``_intersects_gdf_pyg``
        tolerance (float): buffer distance applied to ``geom`` before testing

    Returns:
        The result of ``_intersects_gdf_pyg`` for the buffered geometry.
    """
    buf = pygeos.buffer(geom, tolerance)
    if pygeos.is_empty(buf):
        # can have an empty buffer with too small a tolerance, fallback to original geom
        buf = geom
    try:
        return _intersects_gdf_pyg(buf, gdf, sindex)
    except (shapely.errors.TopologicalError, pygeos.GEOSException):
        # pygeos reports GEOS topology failures as GEOSException; the shapely
        # error is still caught for any shapely-backed code path.
        # can exceptionally buffer to an invalid geometry, so try re-buffering
        buf = pygeos.buffer(geom, 0)
        return _intersects_gdf_pyg(buf, gdf, sindex)
Example #2
0
    def time_tree_nearest_all_poly_python(self):
        """Benchmark a hand-written "all nearest polygons per point" search.

        Buffers ``self.points``, queries ``self.tree`` with the buffers, measures
        the distance of every returned (point, polygon) pair, and keeps the pairs
        whose distance equals the smallest distance found for that point.
        Nothing is returned; only the work itself matters here.
        """
        # returns all input points

        # use an arbitrary search tolerance that seems appropriate for the density of
        # geometries
        tolerance = 200
        b = pygeos.buffer(self.points, tolerance, quadsegs=1)
        left, right = self.tree.query_bulk(b)
        dist = pygeos.distance(self.points.take(left),
                               self.polygons.take(right))

        # sort by left, distance
        # (np.lexsort uses the LAST key as the primary key: left, then dist, then right)
        ix = np.lexsort((right, dist, left))
        left = left[ix]
        right = right[ix]
        dist = dist[ix]

        # True at the first row of every run of equal `left` values
        run_start = np.r_[True, left[:-1] != left[1:]]
        # number of rows in each run
        run_counts = np.diff(np.r_[np.nonzero(run_start)[0], left.shape[0]])

        # rows are sorted by distance within a run, so the first row holds the minimum
        mins = dist[run_start]

        # spread to rest of array so we can extract out all within each group that match
        all_mins = np.repeat(mins, run_counts)
        ix = dist == all_mins
        left = left[ix]
        right = right[ix]
        dist = dist[ix]
Example #3
0
def export_duplicate_areas(dups, path):
    """Write one dissolved polygon per duplicate group for QA review.

    Parameters
    ----------
    dups : GeoDataFrame
        contains "geometry", "dup_group" (the group identifier),
        "dup_tolerance" (buffer distance per record), "id" and "SARPID"
    path : str or Path
        output path
    """

    print("Exporting duplicate areas")

    def _join_values(values):
        # collapse the unique values of a group into one comma-separated string
        return ", ".join([str(s) for s in values])

    # work on a copy so the caller's geometries are not overwritten
    dups = dups.copy()
    dups["geometry"] = pg.buffer(dups.geometry.values.data, dups.dup_tolerance)
    dissolved = dissolve(dups[["geometry", "dup_group"]], by="dup_group")

    # unique ids per group, with the dissolved geometry attached
    ids_by_group = (
        dups[["id", "SARPID", "dup_group"]]
        .groupby("dup_group")
        .agg({"SARPID": "unique", "id": "unique"})
    )
    groups = gp.GeoDataFrame(
        ids_by_group.join(dissolved.geometry, on="dup_group"),
        crs=dups.crs,
    )

    groups["id"] = groups.id.apply(_join_values)
    groups["SARPID"] = groups.SARPID.apply(_join_values)
    write_dataframe(groups, path)
Example #4
0
def nearest(geom, gdf, sindex, tolerance):
    """Finds the nearest node

    Args:
        geom (pygeos.Geometry) : Geometry to find nearest
        gdf (pandas.DataFrame): Node dataframe to provide possible nodes; rows
            are looked up by position and must expose ``geometry`` and ``id``
        sindex (pygeos.Sindex): Spatial index for faster lookup
        tolerance (float): Size of buffer to use to find nodes when the direct
            query returns nothing

    Returns:
        nearest_geom.id [int]: The node id that is closest to the geom, or -1
            when no candidate node is found
    """
    matches_idx = sindex.query(geom)
    # Check the number of hits rather than `.any()`: a result that only holds
    # position 0 is a valid match but evaluates as falsy with `.any()`.
    if len(matches_idx) == 0:
        buf = pyg.buffer(geom, tolerance)
        matches_idx = sindex.query(buf, 'contains')
    if len(matches_idx) == 0:
        # no candidate nodes at all
        return -1
    nearest_geom = min(
        (gdf.iloc[match_idx] for match_idx in matches_idx),
        key=lambda match: pyg.measurement.distance(match.geometry, geom),
    )
    return nearest_geom.id
Example #5
0
def poly_tree():
    """Yield an STRtree of ten buffered points lying on the diagonal (0,0)..(9,9)."""
    diagonal = np.arange(10)
    centers = pygeos.points(diagonal, diagonal)
    # Size the buffers so that the midpoint between two neighbouring buffers
    # intersects each of them.  NOTE: EPS is added to help mitigate rounding
    # errors at the midpoint.
    radius = HALF_UNIT_DIAG + EPS
    polygons = pygeos.buffer(centers, radius, quadsegs=32)
    yield pygeos.STRtree(polygons)
Example #6
0
def close_gaps(df, tolerance):
    """Close gaps in LineString geometry where it should be contiguous.

    End points of the lines are buffered by ``tolerance``, overlapping buffers
    are merged, and the lines are snapped to the centroid of each merged
    buffer.  Returns the snapped pygeos geometries.

    """
    lines = df.geometry.values.data
    coords = pygeos.get_coordinates(lines)
    counts = pygeos.get_num_coordinates(lines)

    # positions, within the flat coordinate array, of the first and last
    # coordinate of every line
    stops = np.cumsum(counts)
    starts = stops - counts
    endpoint_ix = np.concatenate([starts, stops - 1])
    endpoints = pygeos.points(np.unique(coords[endpoint_ix], axis=0))

    # merge the buffers of end points that lie close to each other
    merged = pygeos.union_all(pygeos.buffer(endpoints, tolerance))

    # one polygon per cluster of end points
    clusters = [
        pygeos.get_geometry(merged, part)
        for part in range(pygeos.get_num_geometries(merged))
    ]

    # snap the lines onto the cluster centroids
    targets = pygeos.union_all(pygeos.centroid(clusters))
    return pygeos.snap(lines, targets, tolerance)
Example #7
0
def export_duplicate_areas(dups, path):
    """Export duplicate barriers to a geopackage for QA.

    Parameters
    ----------
    dups : DataFrame
        contains pygeos geometries in "geometry" and "dup_group"
        to indicate group; also read: "dup_tolerance" (buffer distance per
        record), "id" and "SARPID".  The frame is not modified.
    path : str or Path
        output path
    """
    # work on a copy so the caller's geometries are not replaced by buffers
    dups = dups.copy()
    dups["geometry"] = pg.buffer(dups.geometry, dups.dup_tolerance)
    dissolved = dissolve(dups[["geometry", "dup_group"]], by="dup_group")

    # one row per group: the dissolved geometry plus the unique ids it covers
    groups = (
        dups[["id", "SARPID", "dup_group"]]
        .join(dissolved.geometry, on="dup_group")
        .groupby("dup_group")
        .agg({"geometry": "first", "SARPID": "unique", "id": "unique"})
    )

    # flatten the arrays of unique ids into comma-separated strings
    groups["id"] = groups.id.apply(lambda x: ", ".join([str(s) for s in x]))
    groups["SARPID"] = groups.SARPID.apply(
        lambda x: ", ".join([str(s) for s in x]))
    to_gpkg(groups, path, crs=CRS)
Example #8
0
def buffer(data, distance, resolution=16, **kwargs):
    """Buffer every geometry in ``data`` by ``distance``.

    With pygeos enabled the call is forwarded to ``pygeos.buffer`` (with
    ``resolution`` passed as ``quadsegs``).  Otherwise each geometry's own
    ``buffer`` method is called and the results are collected in an object
    array; ``None`` entries stay ``None``.

    ``distance`` is either a single value applied to all geometries or a numpy
    array holding one distance per geometry.

    Raises
    ------
    ValueError
        if ``distance`` is an array whose length differs from ``data``
    """
    if compat.USE_PYGEOS:
        return pygeos.buffer(data, distance, quadsegs=resolution, **kwargs)

    def _buffer_one(geom, dist):
        # missing geometries are passed through untouched
        return None if geom is None else geom.buffer(dist, resolution, **kwargs)

    out = np.empty(len(data), dtype=object)
    per_geometry = isinstance(distance, np.ndarray)
    if per_geometry and len(distance) != len(data):
        raise ValueError(
            "Length of distance sequence does not match "
            "length of the GeoSeries"
        )

    with compat.ignore_shapely2_warnings():
        if per_geometry:
            out[:] = [_buffer_one(geom, dist) for geom, dist in zip(data, distance)]
        else:
            out[:] = [_buffer_one(geom, distance) for geom in data]
    return out
Example #9
0
 def setup(self):
     """Build two overlapping irregular polygons, ``self.left`` and ``self.right``."""
     # irregular polygons come from merging overlapping buffers of random points
     centers = pygeos.points(np.random.random((500, 2)) * 500)
     self.left = pygeos.union_all(pygeos.buffer(centers, 15))
     # the second geometry is the first one shifted up and right by 50
     self.right = pygeos.apply(self.left, lambda coords: coords + 50)
Example #10
0
    def filter_(self, action, wkt, **kwargs):
        """Filter the dataframe with a spatial predicate and export the result.

        Arguments:
            action (str): The filtering action, one of 'nearest', 'within', 'within_buffer'.
                'nearest' adds a 'distance' column (distance to ``wkt``) and sorts
                on it with ``ascending=False``; 'within' keeps the rows within
                ``wkt``; 'within_buffer' keeps the rows within ``wkt`` buffered
                by the ``radius`` keyword argument (default 0).
            wkt (str): Well-Known Text representation of the geometry.
            **kwargs: Additional keyword arguments for the filtering operation.

        Raises:
            ValueError: If ``action`` is not one of the supported actions.
            ResultedEmptyDataFrame: If no rows remain after filtering.

        Returns:
            (str): The path of the exported archive.
        """
        gdf = self._gdf
        if action not in ('nearest', 'within', 'within_buffer'):
            raise ValueError("action could be one of 'nearest', 'within', 'within_buffer'.")

        if action == 'nearest':
            # NOTE: `k` / `maximum_distance` pre-filtering is not implemented; a
            # sketch of it used to live here as commented-out code.
            gdf.add_column('distance', gdf.measurement.distance(wkt), dtype=float)
            gdf = gdf.sort('distance', ascending=False)
        elif action == 'within':
            gdf = gdf[gdf.predicates.within(wkt)]
        else:
            # 'within_buffer'
            radius = kwargs.pop('radius', 0)
            search_area = pg.buffer(pg.from_wkt(wkt), radius)
            gdf = gdf[gdf.predicates.within(search_area)]

        if len(gdf) == 0:
            raise ResultedEmptyDataFrame("The resulted dataframe is empty.")

        export_name = "{filename}_{action}{extension}".format(
            filename=self._filename, action=action, extension=self._extension)
        export = os.path.join(self._working_dir, export_name)
        gdf.export(export, driver=self._driver)

        return self._compress_files(export)
Example #11
0
    def _morphological_tessellation(self,
                                    gdf,
                                    unique_id,
                                    limit,
                                    shrink,
                                    segment,
                                    verbose,
                                    check=True):
        """Build a Voronoi-based tessellation around the geometries of ``gdf``.

        Steps: optionally shrink polygons by ``shrink``, densify the geometries
        into points spaced by ``segment``, add points along a buffered convex
        hull of ``limit`` (tagged with id -1), compute the Voronoi diagram,
        dissolve the cells by ``unique_id`` and clip the result to ``limit``.

        NOTE: when ``shrink`` is non-zero the geometry column of ``gdf`` is
        overwritten in place for (Multi)Polygon rows.

        Progress messages are printed when ``verbose`` is truthy; when ``check``
        is truthy the result is passed to ``self._check_result``.
        """

        def _log(message):
            # progress output is opt-in
            if verbose:
                print(message)

        objects = gdf

        if shrink != 0:
            _log("Inward offset...")
            is_polygonal = objects.type.isin(["Polygon", "MultiPolygon"])
            objects.loc[is_polygonal,
                        objects.geometry.name] = objects[is_polygonal].buffer(
                            -shrink, cap_style=2, join_style=2)

        objects = objects.reset_index(drop=True).explode().set_index(unique_id)

        _log("Generating input point array...")
        points, ids = self._dense_point_array(objects.geometry.values.data,
                                              distance=segment,
                                              index=objects.index)

        # grow the convex hull of the limit by twice its larger extent
        hull = pygeos.convex_hull(limit)
        bounds = pygeos.bounds(hull)
        width = bounds[2] - bounds[0]
        leng = bounds[3] - bounds[1]
        larger = width if width > leng else leng
        hull = pygeos.buffer(hull, 2 * larger)

        # points along the grown hull are appended with the id -1
        hull_p, hull_ix = self._dense_point_array(
            [hull], distance=pygeos.length(hull) / 100, index=[0])
        points = np.append(points, hull_p, axis=0)
        ids = ids + ([-1] * len(hull_ix))

        _log("Generating Voronoi diagram...")
        voronoi_diagram = Voronoi(np.array(points))

        _log("Generating GeoDataFrame...")
        regions_gdf = self._regions(voronoi_diagram,
                                    unique_id,
                                    ids,
                                    crs=gdf.crs)

        _log("Dissolving Voronoi polygons...")
        cells = regions_gdf[[unique_id, "geometry"]]
        morphological_tessellation = cells.dissolve(by=unique_id,
                                                    as_index=False)

        clip_mask = gpd.GeoSeries(limit, crs=gdf.crs)
        morphological_tessellation = gpd.clip(morphological_tessellation,
                                              clip_mask)

        if check:
            self._check_result(morphological_tessellation,
                               gdf,
                               unique_id=unique_id)

        return morphological_tessellation
Example #12
0
 def setup(self):
     """Build one irregular polygon plus 100 boxes, stored as bounds and as geometries."""
     # irregular polygon from merging overlapping buffers of random points
     centers = pygeos.points(np.random.random((1000, 2)) * 500)
     self.polygon = pygeos.union_all(pygeos.buffer(centers, 10))

     # 100 x 100 boxes whose lower-left corners are drawn at random
     xmin = np.random.random(100) * 100
     ymin = np.random.random(100) * 100
     xmax = xmin + 100
     ymax = ymin + 100

     # one (xmin, ymin, xmax, ymax) row per box
     self.bounds = np.array([xmin, ymin, xmax, ymax]).T
     self.boxes = pygeos.box(xmin, ymin, xmax, ymax)
Example #13
0
def close_gaps(gdf, tolerance):
    """Close gaps in LineString geometry where it should be contiguous.

    End points of the lines are buffered by half of ``tolerance``, overlapping
    buffers are merged, and the lines are snapped to the centroid of each
    merged buffer.

    Parameters
    ----------
    gdf : GeoDataFrame, GeoSeries
        GeoDataFrame or GeoSeries containing LineString representation of a network.
    tolerance : float
        nodes within a tolerance will be snapped together

    Returns
    -------
    GeoSeries
        snapped geometries, carrying the CRS of ``gdf``

    See also
    --------
    momepy.extend_lines
    momepy.remove_false_nodes

    """
    lines = gdf.geometry.values.data
    coords = pygeos.get_coordinates(lines)
    counts = pygeos.get_num_coordinates(lines)

    # positions, within the flat coordinate array, of the first and last
    # coordinate of every line
    stops = np.cumsum(counts)
    starts = stops - counts
    endpoint_ix = np.concatenate([starts, stops - 1])
    endpoints = pygeos.points(np.unique(coords[endpoint_ix], axis=0))

    # merge the buffers of end points that lie close to each other
    merged = pygeos.union_all(pygeos.buffer(endpoints, tolerance / 2))

    # one polygon per cluster of end points
    clusters = [
        pygeos.get_geometry(merged, part)
        for part in range(pygeos.get_num_geometries(merged))
    ]

    # snap the lines onto the cluster centroids
    targets = pygeos.union_all(pygeos.centroid(clusters))
    snapped = pygeos.snap(lines, targets, tolerance)

    return gpd.GeoSeries(snapped, crs=gdf.crs)
Example #14
0
def near(source, target, distance):
    """Find target geometries within a distance of source geometries.

    Source geometries are buffered by ``distance`` and spatially joined to the
    targets; only source records that matched at least one target are returned.

    Parameters
    ----------
    source : Series
        contains pygeos geometries
    target : Series
        contains target pygeos geometries to search against
    distance : number or ndarray
        radius within which to find target geometries.
        If ndarray, must be equal length to source.

    Returns
    -------
    DataFrame
        indexed on original index of source, sorted by distance;
        includes the target index (named after the target's index, or
        "index_right" if that is unnamed) and "distance" as float32
    """

    # Get all indices from target that intersect buffers of the input geometry
    idx = sjoin_geometry(pg.buffer(source, distance), target)

    # attach both geometries to every (source, target) pair
    hits = pd.DataFrame(idx)
    hits = hits.join(source.rename("geometry"), how="inner")
    hits = hits.join(target.rename("geometry_right"),
                     on="index_right",
                     how="inner")

    # the joins change the index if hits is empty, causing downstream problems
    if len(hits) == 0:
        hits.index.name = idx.index.name

    hits["distance"] = pg.distance(hits.geometry,
                                   hits.geometry_right).astype("float32")

    right_name = target.index.name or "index_right"
    result = hits.drop(columns=["geometry", "geometry_right"])
    result = result.rename(columns={"index_right": right_name})
    return result.sort_values(by="distance")
Example #15
0
def constructive(arr, operation, *args, **kwargs):
    """Apply a pygeos constructive operation to WKB geometries.

    ``arr`` is decoded with ``pg.from_wkb``, the function of ``pg`` named by
    ``operation`` is applied, and the result is encoded back with ``pg.to_wkb``.
    Positional ``args`` are only forwarded to the operations that take extra
    positional parameters; ``kwargs`` are always forwarded.

    For an unsupported ``operation`` a warning is issued and None is returned.
    """
    # operations that receive the extra positional arguments
    with_positional = (
        'buffer',
        'clip_by_rect',
        'offset_curve',
        'simplify',
        'snap',
    )
    # operations that only receive keyword arguments
    keyword_only = (
        'boundary',
        'build_area',
        'centroid',
        'convex_hull',
        'delaunay_triangles',
        'envelope',
        'extract_unique_points',
        'make_valid',
        'normalize',
        'point_on_surface',
        'reverse',
        'voronoi_polygons',
    )

    if operation in with_positional:
        positional = args
    elif operation in keyword_only:
        positional = ()
    else:
        warnings.warn(f'Operation {operation} not supported.')
        return None

    # every supported operation is named after the pygeos function it calls
    func = getattr(pg, operation)
    geometries = func(pg.from_wkb(arr), *positional, **kwargs)
    return pg.to_wkb(geometries)
Example #16
0
    def setup(self):
        """Create the polygons, points and STRtrees used by the tree benchmarks."""
        # irregular polygons come from merging overlapping buffers of random points
        seeds = pygeos.points(np.random.random((2000, 2)) * 500)
        self.polygons = pygeos.get_parts(
            pygeos.union_all(pygeos.buffer(seeds, 5)))
        self.tree = pygeos.STRtree(self.polygons)
        # a tiny query up front initializes the tree
        origin = pygeos.points(0, 0)
        self.tree.query(origin)

        # these points extend beyond the domain of the polygons above so that
        # some of them don't overlap
        self.points = pygeos.points((np.random.random((2000, 2)) * 750) - 125)
        tree_points = pygeos.points(np.random.random((2000, 2)) * 750)
        self.point_tree = pygeos.STRtree(tree_points)
        self.point_tree.query(pygeos.points(0, 0))

        # points on a 45 x 45 grid (2025 points) for testing equidistant
        # nearest neighbors; the query points are offset by 0.5 on both axes
        grid_coords = np.mgrid[:45, :45].T.reshape(-1, 2)
        self.grid_point_tree = pygeos.STRtree(pygeos.points(grid_coords))
        self.grid_points = pygeos.points(grid_coords + 0.5)
Example #17
0
    def time_tree_nearest_points_equidistant_manual_all(self):
        """Benchmark a hand-written search for all equidistant nearest neighbors.

        Nothing is returned; only the work itself matters here.
        """
        # This benchmark approximates nearest_all for equidistant results
        # starting from singular nearest neighbors and searching for more
        # within same distance.

        # try to find all equidistant neighbors ourselves given single nearest
        # result
        l, r = self.grid_point_tree.nearest(self.grid_points)
        # calculate distance to nearest neighbor
        dist = pygeos.distance(
            self.grid_points.take(l), self.grid_point_tree.geometries.take(r)
        )
        # include a slight epsilon to ensure nearest are within this radius
        b = pygeos.buffer(self.grid_points, dist + 1e-8)

        # query the tree for others in the same buffer distance
        left, right = self.grid_point_tree.query_bulk(b, predicate="intersects")
        dist = pygeos.distance(
            self.grid_points.take(left), self.grid_point_tree.geometries.take(right)
        )

        # sort by left, distance
        # (np.lexsort uses the LAST key as the primary key: left, then dist, then right)
        ix = np.lexsort((right, dist, left))
        left = left[ix]
        right = right[ix]
        dist = dist[ix]

        # True at the first row of every run of equal `left` values
        run_start = np.r_[True, left[:-1] != left[1:]]
        # number of rows in each run
        run_counts = np.diff(np.r_[np.nonzero(run_start)[0], left.shape[0]])

        # rows are sorted by distance within a run, so the first row holds the minimum
        mins = dist[run_start]

        # spread to rest of array so we can extract out all within each group that match
        all_mins = np.repeat(mins, run_counts)
        ix = dist == all_mins
        left = left[ix]
        right = right[ix]
        dist = dist[ix]
Example #18
0
def test_buffer_join_style_invalid():
    """An unknown ``join_style`` name is rejected with a ValueError."""
    expected_message = "'invalid' is not a valid option"
    with pytest.raises(ValueError, match=expected_message):
        pygeos.buffer(point, 1, join_style="invalid")
Example #19
0
def test_buffer_single_sided():
    """A single-sided buffer of a line covers about width x length."""
    length = 10
    width = 0.1
    line = pygeos.linestrings([[0, 0], [length, 0]])
    # buffer the line on one side only
    actual = pygeos.buffer(line, width, cap_style="square", single_sided=True)
    expected_area = width * length
    assert pygeos.area(actual) == pytest.approx(expected_area, abs=0.01)
Example #20
0
def test_buffer_square():
    """Buffering a point with square caps yields a square of side 2 x distance."""
    distance = 1.0
    actual = pygeos.buffer(point, distance, cap_style="square")
    expected_area = (2 * distance) ** 2
    assert pygeos.area(actual) == pytest.approx(expected_area, abs=0.01)
Example #21
0
def test_buffer_default():
    """Buffering a point by several radii yields circles of area pi * r^2."""
    radii = np.array([1.0, 2.0])
    circles = pygeos.buffer(point, radii, quadsegs=16)
    expected_areas = np.pi * radii ** 2
    # the polygonal circle approximation is accepted within 1 %
    assert pygeos.area(circles) == pytest.approx(expected_areas, rel=0.01)
print("Reading NHD points, lines, and areas, and merging...")
# read the per-HUC2 point files into one frame, tagging each record with its HUC2
nhd_pts = read_feathers(
    [raw_dir / huc2 / "nhd_points.feather" for huc2 in huc2s],
    geo=True,
    new_fields={"HUC2": huc2s},
)
# keep only the points with FType 343
# NOTE(review): presumably the dam feature type, judging by the output file name - confirm
nhd_pts = nhd_pts.loc[nhd_pts.FType.isin([343])].copy()

# write original points for SARP
write_dataframe(nhd_pts, out_dir / "nhd_dam_pts_nhdpoint.fgb")

# record which NHD dataset these features came from
nhd_pts["source"] = "NHDPoint"


# create circular buffers to merge with others
# (buffer distance 5; presumably meters, as for the lines below - confirm against the CRS)
nhd_pts["geometry"] = pg.buffer(nhd_pts.geometry.values.data, 5)

# read the per-HUC2 line files into one frame, tagging each record with its HUC2
nhd_lines = read_feathers(
    [raw_dir / huc2 / "nhd_lines.feather" for huc2 in huc2s],
    geo=True,
    new_fields={"HUC2": huc2s},
)
# keep only the lines with FType 343, 369 or 398 that have a geometry
nhd_lines = nhd_lines.loc[
    (nhd_lines.FType.isin([343, 369, 398])) & nhd_lines.geometry.notnull()
].reset_index(drop=True)
# create buffers (5m) to merge with NHD areas
# from visual inspection, this helps coalesce those that are in pairs
nhd_lines["geometry"] = pg.buffer(nhd_lines.geometry.values.data, 5, quadsegs=1)
nhd_lines["source"] = "NHDLine"

# All NHD areas indicate a dam-related feature
Example #23
0
def dissolve_waterbodies(df, joins):
    """Dissolve waterbodies that overlap, duplicate, or otherwise touch each other.

    Waterbodies are spatially joined to themselves; every connected set of
    touching waterbodies is buffered slightly, dissolved into one polygon and
    given a new wbID.  The joins are re-pointed at the new wbIDs.

    WARNING: some adjacent waterbodies are divided by dams, etc.  These will need to be
    accounted for later when snapping dams.

    Parameters
    ----------
    df : GeoDataFrame
        waterbodies; the index is used as wbID, and "geometry" and "FType"
        are read
    joins : DataFrame
        waterbody / flowline joins; must include "wbID".  NOTE: the wbID of
        rows that point at dissolved waterbodies is updated in place.

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (waterbodies, waterbody joins)
    """

    ### Join waterbodies to themselves to find overlaps
    start = time()
    to_agg = pd.DataFrame(sjoin(df.geometry, df.geometry))

    # drop the self-intersections
    to_agg = to_agg.loc[to_agg.index != to_agg.index_right].copy()
    print("Found {:,} waterbodies that touch or overlap".format(
        len(to_agg.index.unique())))

    if len(to_agg):
        # Use network (mathematical, not aquatic) adjacency analysis
        # to identify all sets of waterbodies that touch.
        # Construct an identity map from all wbIDs to their newID (will be new wbID after dissolve)
        grouped = to_agg.groupby(level=0).index_right.unique()
        network = nx.from_pandas_edgelist(
            grouped.explode().reset_index().rename(columns={
                "wbID": "index",
                "index_right": "wbID"
            }),
            "index",
            "wbID",
        )

        # one row per (component, member wbID); the row label identifies the component
        components = pd.Series(nx.connected_components(network)).apply(list)
        groups = pd.DataFrame(components.explode().rename("wbID"))

        # new ids start right after the largest existing wbID
        next_id = df.index.max() + 1
        groups["group"] = (next_id + groups.index).astype("uint32")
        groups = groups.set_index("wbID")

        # assign group to polygons to aggregate
        to_agg = (to_agg.join(groups).reset_index().drop(
            columns=["index_right"]).drop_duplicates().set_index("wbID").join(
                df[["geometry", "FType"]]))

        ### Dissolve groups
        # Buffer geometries slightly to make sure that any which intersect actually overlap
        print("Buffering {:,} unique waterbodies before dissolving...".format(
            len(to_agg)))
        buffer_start = time()
        # quadsegs=1 plus simplify keeps the buffer from adding many vertices
        to_agg["geometry"] = pg.simplify(
            pg.buffer(to_agg.geometry, 0.1, quadsegs=1), 0.1)
        print("Buffer completed in {:.2f}s".format(time() - buffer_start))

        print("Dissolving...")
        dissolve_start = time()

        # NOTE: automatically takes the first FType
        dissolved = dissolve(to_agg, by="group")

        errors = (pg.get_type_id(
            dissolved.geometry) == pg.GeometryType.MULTIPOLYGON.value)
        if errors.any():
            print(
                "WARNING: Dissolve created {:,} multipolygons, these will cause errors later!"
                .format(errors.sum()))

        # this may create multipolygons if polygons that are dissolved don't sufficiently share overlapping geometries.
        # for these, we want to retain them as individual polygons
        # WARNING: exploding them here doesn't work with our logic below for figuring out groups associated with original wbIDs
        # since after exploding, we don't know what wbID went into what group

        # assign new IDs and update fields
        # (same offset as used for groups["group"] above; df is unchanged since then)
        dissolved["wbID"] = (next_id + dissolved.index).astype("uint32")
        dissolved["AreaSqKm"] = (pg.area(dissolved.geometry) *
                                 1e-6).astype("float32")
        dissolved["NHDPlusID"] = 0
        dissolved.NHDPlusID = dissolved.NHDPlusID.astype("uint64")
        # cast the column itself too: older pandas index types do not keep uint32
        dissolved.wbID = dissolved.wbID.astype("uint32")

        print(
            "Dissolved {:,} adjacent polygons into {:,} new polygons in {:.2f}s"
            .format(len(to_agg), len(dissolved),
                    time() - dissolve_start))

        # remove waterbodies that were dissolved, and append the result
        # of the dissolve
        # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
        remaining = df.loc[~df.index.isin(to_agg.index)].reset_index()
        df = pd.concat([remaining, dissolved], ignore_index=True,
                       sort=False).set_index("wbID")

        # update joins
        ix = joins.loc[joins.wbID.isin(groups.index)].index

        # NOTE: this mapping will not work if explode() is used above
        joins.loc[ix, "wbID"] = joins.loc[ix].wbID.map(groups.group)

        # Group together ones that were dissolved above
        joins = joins.drop_duplicates().reset_index(drop=True)

    print("Done resolving overlapping waterbodies in {:.2f}s".format(time() -
                                                                     start))

    return df, joins
Example #24
0
def cut_lines_by_waterbodies(flowlines, joins, waterbodies, wb_joins, out_dir):
    """
    Cut lines by waterbodies.
    1. Intersects all previously intersected flowlines with waterbodies.
    2. For those that cross but are not completely contained by waterbodies, cut them.
    3. Evaluate the cuts, only those that have substantive cuts inside and outside are retained as cuts.
    4. Any flowlines that are not contained or crossing waterbodies are dropped from joins

    Diagnostic files are written to ``out_dir`` when suspicious data are found
    (flowlines contained by multiple waterbodies, flowlines crossing multiple
    waterbodies, incomplete cuts, cuts with no part inside a waterbody).  Only
    the flowlines actually flagged by each check are exported.

    Parameters
    ----------
    flowlines : GeoDataFrame
        indexed by lineID
    joins : DataFrame
        flowline joins
    waterbodies : GeoDataFrame
        indexed by wbID
    wb_joins : DataFrame
        waterbody flowline joins, with "lineID" and "wbID" columns
    out_dir : pathlib.Path
        output directory for writing error files, if needed

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame, GeoDataFrame, DataFrame)
        (flowlines, joins, waterbodies, waterbody joins)
    """

    start = time()

    fl_geom = flowlines.loc[flowlines.index.isin(wb_joins.lineID), ["geometry"]].copy()

    # Many waterbodies have interior polygons (islands); these break the analysis below for cutting lines
    # Extract a new polygon of just their outer boundary
    wb_geom = waterbodies[["geometry"]].copy()
    wb_geom["waterbody"] = pg.polygons(pg.get_exterior_ring(wb_geom.geometry))

    print("Validating waterbodies...")
    ix = ~pg.is_valid(wb_geom.waterbody)
    invalid_count = ix.sum()
    if invalid_count:
        print("{:,} invalid waterbodies found, repairing...".format(invalid_count))

        # Buffer by 0 to fix
        # TODO: may need to do this by a small fraction and simplify instead
        repair_start = time()
        wb_geom.loc[ix, "waterbody"] = pg.buffer(wb_geom.loc[ix].waterbody, 0)
        waterbodies.loc[ix, "geometry"] = wb_geom.loc[ix].waterbody
        print("Repaired geometry in {:.2f}s".format(time() - repair_start))

    # Set indices and create combined geometry object for analysis
    wb_joins = wb_joins.set_index(["lineID", "wbID"])
    geoms = wb_joins.join(fl_geom, how="inner").join(wb_geom.waterbody)

    ### Find contained geometries
    print(
        "Identifying flowlines completely within waterbodies out of {:,} flowline / waterbody combinations...".format(
            len(geoms)
        )
    )
    contained_start = time()
    geoms["inside"] = pg.contains(geoms.waterbody.values, geoms.geometry.values)

    print(
        "Identified {:,} flowlines completely contained by waterbodies in {:.2f}s".format(
            geoms.inside.sum(), time() - contained_start
        )
    )

    # Check for logic errors - no flowline should be completely contained by more than 1 waterbody
    errors = geoms.groupby(level=[0]).inside.sum().astype("uint8") > 1
    if errors.max():
        # this most likely indicates duplicate waterbodies, which should have been resolved before this
        print(
            "ERROR: major logic error - some flowlines claim to be completely contained by multiple waterbodies"
        )
        print(
            "===> error flowlines written to {}/contained_errors.feather".format(
                out_dir
            )
        )
        # errors is a boolean Series indexed by lineID; select by the flagged
        # lineIDs rather than by the boolean values themselves
        to_geofeather(
            flowlines.loc[flowlines.index.isin(errors.loc[errors].index)],
            out_dir / "contained_errors.feather",
            crs=CRS,
        )

    ### Check those that aren't contained to see if they cross
    print("Determining which flowlines actually cross into waterbodies...")
    cross_start = time()
    geoms = geoms.loc[~geoms.inside].copy()
    geoms["crosses"] = pg.crosses(geoms.geometry, geoms.waterbody)

    outside = geoms.loc[~(geoms["crosses"] | geoms.inside)].index

    # keep the ones that cross for further processing
    geoms = geoms.loc[geoms.crosses].copy()

    print(
        "Identified {:,} flowlines completely outside waterbodies and {:,} flowlines that cross waterbody boundaries in {:.2f}s".format(
            len(outside), len(geoms), time() - cross_start
        )
    )

    # Any that do not cross and are not completely within waterbodies should be dropped now
    # Can only drop joins by BOTH lineID and wbID (the index here)
    # Also drop associated waterbodies that no longer have joins
    wb_joins = wb_joins.loc[~wb_joins.index.isin(outside)].copy()

    # FIXME: for closely adjacent waterbodies, these are important to keep
    # Need to cut them by their multiple polys, update their joins, and feed back into following analysis
    # pg.intersection_all might work here

    # check for multiple crossings - these are errors from NHD that we can drop from here
    errors = geoms.groupby(level=0).size() > 1
    if errors.max():
        print(
            "Found {:,} flowlines that cross multiple waterbodies.  These are bad data and will be dropped from waterbody intersection.".format(
                errors.sum()
            )
        )

        # export only the flowlines flagged above, not every crossing flowline
        to_geofeather(
            flowlines.loc[errors.loc[errors].index].reset_index(),
            out_dir / "error_crosses_multiple.feather",
            crs=CRS,
        )

        # completely remove the flowlines from intersections and drop the waterbodies
        wb_joins = wb_joins.loc[
            ~wb_joins.index.get_level_values(0).isin(errors.loc[errors].index)
        ].copy()
        waterbodies = waterbodies.loc[
            waterbodies.index.isin(wb_joins.index.get_level_values(1))
        ].copy()
        geoms = geoms.loc[geoms.index.isin(wb_joins.index)].copy()

    print("Calculating geometric intersection of flowlines and waterbodies...")
    int_start = time()
    geoms = geoms[["geometry", "waterbody"]].join(flowlines.length.rename("origLength"))

    # First, calculate the geometric intersection between the lines and waterbodies
    # WARNING: this intersection may return LineString, MultiLineString, Point, GeometryCollection
    geoms["intersection"] = pg.intersection(geoms.geometry, geoms.waterbody)
    types = pg.get_type_id(geoms.intersection)
    # NOTE: all the points should be captured by the above logic for crosses
    is_point = types.isin([0, 4])
    is_line = types.isin([1, 5])

    others = types[~(is_point | is_line)].unique()
    # GeometryCollection indicates a mess, skip those
    if len(others):
        print(
            "WARNING: Found other types of geometric intersection: {} (n={:,}), these will be dropped".format(
                others, len(types[~(is_point | is_line)])
            )
        )

    # Any that intersect only at a point are OUTSIDE
    outside = geoms.loc[is_point].index  # TODO: confirm this works
    wb_joins = wb_joins.loc[~wb_joins.index.isin(outside)].copy()
    print("Identified {:,} more flowlines outside waterbodies".format(len(outside)))

    # Drop those that are not lines from further analysis
    geoms = geoms.loc[is_line].copy()

    # Inspect amount of overlay - if the intersected length is within 1m of final length, it is completely within
    # if it is near 0, it is completely outside
    geoms["length"] = pg.length(geoms.intersection)
    outside = geoms.length < 1
    inside = (geoms.origLength - geoms.length).abs() < 1

    print(
        "Found {:,} more completely outside, {:,} completely inside".format(
            outside.sum(), inside.sum()
        )
    )

    # drop the ones that are outside
    wb_joins = wb_joins.loc[~wb_joins.index.isin(outside[outside].index)].copy()

    # cut the ones that aren't completely inside or outside
    geoms = geoms.loc[~(inside | outside)].copy()

    print("Done evaluating intersection in {:.2f}s".format(time() - int_start))

    if len(geoms):
        print("Cutting {:,} flowlines ...".format(len(geoms)))
        cut_start = time()
        geoms = geoms[["geometry", "waterbody", "origLength"]]

        # WARNING: difference is not precise, the point of split is not exactly at the intersection between lines
        # but within some tolerance.  This will cause them to fail the contains() test below.
        boundary = pg.boundary(geoms.waterbody)
        geoms["geometry"] = pg.difference(geoms.geometry, boundary)

        errors = ~pg.is_valid(geoms.geometry)
        if errors.max():
            print("WARNING: geometry errors for {:,} cut lines".format(errors.sum()))

        length = pg.length(geoms.geometry)
        errors = (length - geoms.origLength).abs() > 1
        if errors.max():
            print(
                "WARNING: {:,} lines were not completely cut by waterbodies (maybe shared edge?).\nThese will not be cut".format(
                    errors.sum()
                )
            )
            to_geofeather(
                flowlines.loc[
                    errors.loc[errors].index.get_level_values(0).unique()
                ].reset_index(),
                out_dir / "error_incomplete_cut.feather",
                crs=CRS,
            )

            # remove these from the cut geoms and retain their originals
            geoms = geoms.loc[~errors].copy()

        # Explode the multilines into single line segments
        geoms["geometry"] = explode(geoms.geometry)
        geoms = geoms.explode("geometry")

        # mark those parts of the cut lines that are within waterbodies
        # WARNING: this is not capturing all that should be inside after cutting!
        geoms["iswithin"] = pg.contains(geoms.waterbody, geoms.geometry)

        errors = geoms.groupby(level=0).iswithin.max() == False
        if errors.max():
            print(
                "WARNING: {:,} flowlines that cross waterbodies had no parts contained within those waterbodies".format(
                    errors.sum()
                )
            )
            # export only the flowlines that have no part within a waterbody
            to_geofeather(
                flowlines.loc[errors.loc[errors].index].reset_index(),
                out_dir / "error_crosses_but_not_contained.feather",
                crs=CRS,
            )

            # If they cross, assume they are within
            print("Attempting to correct these based on which ones cross")
            ix = geoms.loc[
                geoms.index.get_level_values(0).isin(errors.loc[errors].index)
            ].index
            geoms.loc[ix, "iswithin"] = pg.crosses(
                geoms.loc[ix].geometry, geoms.loc[ix].waterbody
            )

            errors = geoms.groupby(level=0).iswithin.max() == False
            print("{:,} still have no part in a waterbody".format(errors.sum()))

        # calculate total length of within and outside parts
        geoms["length"] = pg.length(geoms.geometry)

        # drop any new segments that are < 1m, these are noise
        print("Dropping {:,} new segments < 1m".format((geoms.length < 1).sum()))
        geoms = geoms.loc[geoms.length >= 1].copy()

        if len(geoms) > 1:
            length = geoms.groupby(["lineID", "wbID", "iswithin"]).agg(
                {"length": "sum", "origLength": "first"}
            )

            # Anything within 1 meter of original length is considered unchanged
            # This is so that we ignore slivers
            length["unchanged"] = (length.origLength - length["length"]).abs() < 1
            unchanged = (
                length[["unchanged"]]
                .reset_index()
                .groupby(["lineID", "wbID"])
                .unchanged.max()
                .rename("max_unchanged")
            )
            unchanged = (
                length.reset_index().set_index(["lineID", "wbID"]).join(unchanged)
            )
            is_within = (
                unchanged.loc[unchanged.max_unchanged]
                .reset_index()
                .set_index(["lineID", "wbID"])
                .iswithin
            )

            # For any that are unchanged and NOT within waterbodies,
            # remove them from wb_joins
            ix = is_within.loc[~is_within].index
            wb_joins = wb_joins.loc[~wb_joins.index.isin(ix)].copy()

            # Remove any that are unchanged from intersection analysis
            geoms = geoms.loc[~geoms.index.isin(is_within.index)].copy()

            print(
                "Created {:,} new flowlines by splitting {:,} flowlines at waterbody edges in {:.2f}".format(
                    len(geoms),
                    len(geoms.index.get_level_values(0).unique()),
                    time() - cut_start,
                )
            )

            if len(geoms) > 1:
                ### These are our final new lines to add
                # remove their lineIDs from flowlines and append
                # replace their outer joins to these ones and add intermediates

                # Join in previous line information from flowlines
                new_lines = (
                    geoms[["geometry", "length", "iswithin"]]
                    .reset_index()
                    .set_index("lineID")
                    .join(flowlines.drop(columns=["geometry", "length", "sinuosity"]))
                    .reset_index()
                    .rename(columns={"lineID": "origLineID", "iswithin": "waterbody"})
                )

                error = (
                    new_lines.groupby("origLineID").wbID.unique().apply(len).max() > 1
                )
                if error:
                    # Watch for errors - if a flowline is cut by multiple waterbodies
                    # there will be problems with our logic for splicing in new lines
                    # also - our intersection logic above is wrong
                    print(
                        """\n========\n
                    MAJOR LOGIC ERROR: multiple waterbodies associated with a single flowline that as been cut.
                    \n========\n
                    """
                    )

                # recalculate length and sinuosity
                new_lines["length"] = pg.length(new_lines.geometry).astype("float32")
                new_lines["sinuosity"] = calculate_sinuosity(new_lines.geometry).astype(
                    "float32"
                )

                # calculate new IDS
                next_segment_id = int(flowlines.index.max() + 1)
                new_lines["lineID"] = next_segment_id + new_lines.index
                new_lines.lineID = new_lines.lineID.astype("uint32")

                ### Update waterbody joins
                # remove joins replaced by above
                ix = new_lines.set_index(["origLineID", "wbID"]).index
                wb_joins = wb_joins.loc[~wb_joins.index.isin(ix)].copy()

                # add new joins
                # (pd.concat instead of DataFrame.append, which pandas 2.0 removed)
                wb_joins = pd.concat(
                    [
                        wb_joins.reset_index(),
                        new_lines.loc[new_lines.waterbody, ["lineID", "wbID"]],
                    ],
                    ignore_index=True,
                    sort=False,
                ).set_index(["lineID", "wbID"])

                ### Update flowline joins
                # transform new lines to create new joins
                ids_by_orig = new_lines.groupby("origLineID").lineID
                # the first new line per original line is the furthest upstream, so use its
                # ID as the new downstream ID for anything that had this origLineID as its downstream
                first = ids_by_orig.first().rename("new_downstream_id")
                # the last new line per original line is the furthest downstream...
                last = ids_by_orig.last().rename("new_upstream_id")

                # Update existing joins with the new lineIDs we created at the upstream or downstream
                # ends of segments we just created
                joins = update_joins(
                    joins,
                    first,
                    last,
                    downstream_col="downstream_id",
                    upstream_col="upstream_id",
                )

                ### Create new line joins for any that weren't inserted above
                # Transform all groups of new line IDs per original lineID, wbID
                # into joins structure
                def _consecutive_pairs(ids):
                    # pair each new line ID with the one that follows it
                    return pd.Series(zip(ids[:-1], ids[1:]))

                new_joins = (
                    new_lines.groupby(["origLineID", "wbID"])
                    .lineID.apply(_consecutive_pairs)
                    .apply(pd.Series)
                    .reset_index()
                    .rename(columns={0: "upstream_id", 1: "downstream_id"})
                    .join(
                        flowlines[["NHDPlusID", "loop"]].rename(
                            columns={"NHDPlusID": "upstream"}
                        ),
                        on="origLineID",
                    )
                )
                # NHDPlusID is same for both sides
                new_joins["downstream"] = new_joins.upstream
                new_joins["type"] = "internal"
                new_joins = new_joins[
                    [
                        "upstream",
                        "downstream",
                        "upstream_id",
                        "downstream_id",
                        "type",
                        "loop",
                    ]
                ]

                joins = pd.concat(
                    [joins, new_joins], ignore_index=True, sort=False
                ).sort_values(["downstream_id", "upstream_id"])

                ### Update flowlines
                # remove originals now replaced by cut versions here
                flowlines = (
                    pd.concat(
                        [
                            flowlines.loc[
                                ~flowlines.index.isin(new_lines.origLineID)
                            ].reset_index(),
                            new_lines[
                                ["lineID"] + list(flowlines.columns) + ["waterbody"]
                            ],
                        ],
                        ignore_index=True,
                        sort=False,
                    )
                    .sort_values("lineID")
                    .set_index("lineID")
                )

                # End cut geometries

    # Update waterbody bool for other flowlines based on those that completely intersected
    # above
    flowlines.loc[
        flowlines.index.isin(wb_joins.index.get_level_values(0).unique()), "waterbody"
    ] = True
    flowlines.waterbody = flowlines.waterbody.fillna(False)

    ### Update waterbodies and calculate flowline stats
    wb_joins = wb_joins.reset_index()
    stats = (
        wb_joins.join(flowlines.length.rename("flowlineLength"), on="lineID")
        .groupby("wbID")
        .flowlineLength.sum()
        .astype("float32")
    )
    waterbodies = waterbodies.loc[waterbodies.index.isin(wb_joins.wbID)].join(stats)

    print("Done cutting flowlines by waterbodies in {:.2f}s".format(time() - start))

    return flowlines, joins, waterbodies, wb_joins
Example #25
0
    assert tree.query(empty).size == 0


@pytest.mark.parametrize(
    "geometry,expected",
    [
        # points do not intersect
        (pygeos.points(0.5, 0.5), []),
        # points intersect
        (pygeos.points(1, 1), [1]),
        # box contains points
        (box(0, 0, 1, 1), [0, 1]),
        # box contains points
        (box(5, 5, 15, 15), [5, 6, 7, 8, 9]),
        # envelope of buffer contains points
        (pygeos.buffer(pygeos.points(3, 3), 1), [2, 3, 4]),
        # envelope of points contains points
        (pygeos.multipoints([[5, 7], [7, 5]]), [5, 6, 7]),
    ],
)
def test_query_points(tree, geometry, expected):
    """Querying the tree with each geometry returns the expected indices."""
    hits = tree.query(geometry)
    assert_array_equal(hits, expected)


@pytest.mark.parametrize(
    "geometry,expected",
    [
        # point intersects first line
        (pygeos.points(0, 0), [0]),
        (pygeos.points(0.5, 0.5), [0]),
        # point within envelope of first line
Example #26
0
def find_dam_face_from_waterbody(waterbody, drain_pt):
    """Extract straight segments of a waterbody's exterior ring near its drain.

    Takes a window of exterior-ring vertices centered on the vertex nearest
    ``drain_pt``, simplifies it, extracts straight runs, and keeps the runs
    that pass the length filters, intersect a buffer around the drain, and
    have the drain located away from their ends.

    Parameters
    ----------
    waterbody : pygeos geometry
        polygon whose exterior ring is searched
    drain_pt : pygeos geometry
        drain point used to select and filter segments

    Returns
    -------
    ndarray
        candidate line segments; may be empty
    """
    total_area = pg.area(waterbody)
    ring = pg.get_exterior_ring(pg.normalize(waterbody))
    total_length = pg.length(ring)
    # the ring's last coordinate repeats the first, so leave it out
    num_pts = pg.get_num_points(ring) - 1
    vertices = pg.get_point(ring, range(num_pts))

    ### Select a window of vertices around the vertex nearest the drain,
    # spanning at most 1/3 of the ring's coordinates on each side
    # note: lower numbers are to the right
    nearest_ix = pg.STRtree(vertices).nearest(drain_pt)[1][0]
    side_width = min(num_pts // 3, MAX_SIDE_PTS)
    left_ix = nearest_ix + side_width
    right_ix = nearest_ix - side_width

    # assemble the window as a left-to-right line, wrapping around either
    # end of the ring where the window runs past it
    window_start = max(right_ix, 0)
    window_stop = min(num_pts, left_ix)
    pts = vertices[window_start:window_stop][::-1]
    if left_ix >= num_pts:
        wrapped_left = vertices[0:left_ix - num_pts][::-1]
        pts = np.append(wrapped_left, pts)

    if right_ix < 0:
        wrapped_right = vertices[num_pts + right_ix:num_pts][::-1]
        pts = np.append(pts, wrapped_right)

    coords = pg.get_coordinates(pts)

    if len(coords) <= 2:
        # too few coordinates to simplify
        keep_coords = coords
        keep_ix = np.arange(len(coords))
    else:
        # simplify first to extract the major shape and bends,
        # then run the straight line algorithm on what remains
        max_area = min(MAX_SIMPLIFY_AREA, total_area / 100)
        simp_coords, simp_ix = simplify_vw(coords, max_area)

        if len(simp_coords) <= 2:
            keep_coords, keep_ix = simp_coords, simp_ix
        else:
            keep_coords, straight_ix = extract_straight_segments(
                simp_coords, max_angle=MAX_STRAIGHT_ANGLE, loops=5)
            keep_ix = simp_ix.take(straight_ix)

    ### Drop runs that fail the minimum width or maximum width ratio checks
    lengths = segment_length(keep_coords)
    wide_enough = lengths >= MIN_DAM_WIDTH
    within_ratio = lengths / total_length < MAX_WIDTH_RATIO
    keep_runs = wide_enough & within_ratio

    run_starts = keep_ix[:-1][keep_runs]
    run_ends = keep_ix[1:][keep_runs]
    pairs = np.dstack([run_starts, run_ends])[0]

    # runs are ragged, so each line is built individually
    segments = np.array(
        [pg.linestrings(coords[first:last + 1]) for first, last in pairs])

    # only keep the segments that are close to the drain
    near_drain = pg.intersects(segments, pg.buffer(drain_pt, MAX_DRAIN_DIST))
    segments = segments[near_drain]

    if len(segments) == 0:
        return segments

    # only keep those where the drain is interior to the line
    pos = pg.line_locate_point(segments, drain_pt)
    seg_lengths = pg.length(segments)
    past_start = pos >= MIN_INTERIOR_DIST
    before_end = pos <= (seg_lengths - MIN_INTERIOR_DIST)

    return segments[past_start & before_end]
Example #27
0
def create_voronoi(
        points: Sequence[pygeos.Geometry]) -> Sequence[pygeos.Geometry]:
    """Return Voronoi cells of ``points`` clipped to their buffered convex hull.

    The cells are intersected with the convex hull of the points after it
    has been buffered by 2.
    """
    multipoint = pygeos.multipoints(points)
    cells = pygeos.get_parts(pygeos.voronoi_polygons(multipoint))
    clip_area = pygeos.buffer(pygeos.convex_hull(multipoint), 2)
    return pygeos.intersection(clip_area, cells)
Example #28
0
def test_minimum_bounding_circle_all_types(geometry):
    """Check result shape and types for array input, and None passthrough."""
    circles = pygeos.minimum_bounding_circle([geometry, geometry])
    assert circles.shape == (2,)
    first = circles[0]
    assert first is None or isinstance(first, Geometry)

    # a None input yields None
    assert pygeos.minimum_bounding_circle(None) is None


@pytest.mark.skipif(pygeos.geos_version < (3, 8, 0), reason="GEOS < 3.8")
@pytest.mark.parametrize(
    "geometry, expected",
    [
        (
            pygeos.Geometry("POLYGON ((0 5, 5 10, 10 5, 5 0, 0 5))"),
            pygeos.buffer(pygeos.Geometry("POINT (5 5)"), 5),
        ),
        (
            pygeos.Geometry("LINESTRING (1 0, 1 10)"),
            pygeos.buffer(pygeos.Geometry("POINT (1 5)"), 5),
        ),
        (
            pygeos.Geometry("MULTIPOINT (2 2, 4 2)"),
            pygeos.buffer(pygeos.Geometry("POINT (3 2)"), 1),
        ),
        (
            pygeos.Geometry("POINT (2 2)"),
            pygeos.Geometry("POINT (2 2)"),
        ),
        (
            pygeos.Geometry("GEOMETRYCOLLECTION EMPTY"),
Example #29
0
def enclosures(primary_barriers,
               limit=None,
               additional_barriers=None,
               enclosure_id="eID"):
    """
    Generate enclosures based on passed barriers.

    Enclosures are areas enclosed from all sides by at least one type of
    a barrier. Barriers are typically roads, railways, natural features
    like rivers and other water bodies or coastline. Enclosures are a
    result of polygonization of the  ``primary_barrier`` and ``limit`` and its
    subdivision based on additional_barriers.

    Parameters
    ----------
    primary_barriers : GeoDataFrame, GeoSeries
        GeoDataFrame or GeoSeries containing primary barriers.
        (Multi)LineString geometry is expected.
    limit : GeoDataFrame, GeoSeries (default None)
        GeoDataFrame or GeoSeries containing external limit of enclosures,
        i.e. the area which gets partitioned. If None is passed,
        the internal area of ``primary_barriers`` will be used.
    additional_barriers : list of GeoDataFrame or GeoSeries (default None)
        List of GeoDataFrames or GeoSeries containing additional barriers.
        (Multi)LineString geometry is expected.
    enclosure_id : str (default 'eID')
        name of the enclosure_id (to be created).

    Returns
    -------
    enclosures : GeoDataFrame
       GeoDataFrame containing enclosure geometries and enclosure_id

    Raises
    ------
    TypeError
        If ``additional_barriers`` is given but is not a list.

    Examples
    --------
    >>> enclosures = mm.enclosures(streets, admin_boundary, [railway, rivers])

    """
    if limit is not None:
        # polygonal limits are reduced to their boundary lines
        if limit.geom_type.isin(["Polygon", "MultiPolygon"]).any():
            limit = limit.boundary
        barriers = pd.concat([primary_barriers.geometry, limit.geometry])
    else:
        barriers = primary_barriers
    unioned = barriers.unary_union
    polygons = polygonize(unioned)
    enclosures = gpd.GeoSeries(list(polygons), crs=primary_barriers.crs)

    if additional_barriers is not None:
        if not isinstance(additional_barriers, list):
            raise TypeError(
                "`additional_barriers` expects a list of GeoDataFrames or GeoSeries. "
                f"Got {type(additional_barriers)}.")
        additional = pd.concat([gdf.geometry for gdf in additional_barriers])

        inp, res = enclosures.sindex.query_bulk(additional.geometry,
                                                predicate="intersects")
        unique = np.unique(res)

        new = []

        for i in unique:
            poly = enclosures.values.data[i]  # get enclosure polygon
            crossing = inp[res == i]  # get relevant additional barriers
            buf = pygeos.buffer(poly, 0.01)  # to avoid floating point errors
            crossing_ins = pygeos.intersection(
                buf, additional.values.data[crossing]
            )  # keeping only parts of additional barriers within polygon
            union = pygeos.union_all(
                np.append(crossing_ins, pygeos.boundary(poly)))  # union
            polygons = np.array(list(polygonize(
                _pygeos_to_shapely(union))))  # polygonize
            within = pygeos.covered_by(
                pygeos.from_shapely(polygons),
                buf)  # keep only those within original polygon
            new += list(polygons[within])

        # replace the subdivided enclosures with their parts
        # (pd.concat instead of Series.append, which pandas 2.0 removed)
        final_enclosures = pd.concat(
            [gpd.GeoSeries(enclosures).drop(unique),
             gpd.GeoSeries(new)]).reset_index(drop=True).set_crs(
                 primary_barriers.crs)

        return gpd.GeoDataFrame({enclosure_id: range(len(final_enclosures))},
                                geometry=final_enclosures)

    return gpd.GeoDataFrame({enclosure_id: range(len(enclosures))},
                            geometry=enclosures)
Example #30
0
def _simplify_pygeos_geometry(geom, tolerance=0.005, buffer_distance=0.01):
    """Simplify a pygeos geometry, buffer it, then simplify the buffer again.

    Both simplification passes use ``preserve_topology=True`` and the same
    *tolerance*; the buffer in between uses *buffer_distance*.
    """
    simplified = pygeos.simplify(geom,
                                 tolerance=tolerance,
                                 preserve_topology=True)
    buffered = pygeos.buffer(simplified, buffer_distance)
    return pygeos.simplify(buffered,
                           tolerance=tolerance,
                           preserve_topology=True)


def _load_gadm_layer_as_pygeos(gadm_path, layer):
    """Read one GADM layer into a plain DataFrame holding pygeos geometries."""
    gadm_layer = pandas.DataFrame(geopandas.read_file(gadm_path, layer=layer))

    tqdm.pandas(desc='Convert geometries to pygeos')
    gadm_layer['geometry'] = gadm_layer.geometry.progress_apply(
        lambda geom: pygeos.from_shapely(geom))
    return gadm_layer


def _clean_gadm_geometries(gadm_layer):
    """Apply `remove_tiny_shapes` row-wise and simplify geometries in place."""
    # remove tiny shapes to reduce size substantially
    tqdm.pandas(desc='Remove tiny shapes')
    gadm_layer['geometry'] = gadm_layer.progress_apply(remove_tiny_shapes,
                                                       axis=1)

    tqdm.pandas(desc='Simplify geometry')
    gadm_layer['geometry'] = gadm_layer.geometry.progress_apply(
        _simplify_pygeos_geometry)


def _geometries_to_shapely(gadm_layer):
    """Convert the pygeos geometries of *gadm_layer* back to shapely in place."""
    tqdm.pandas(desc='Convert geometries back to shapely')
    gadm_layer['geometry'] = gadm_layer.geometry.progress_apply(
        lambda geom: loads(pygeos.to_wkb(geom)))


def global_shapefiles(data_path, regionalized=False, assigned_level=1):
    """
    This function will simplify shapes and add necessary columns, to make further processing quicker.

    For now, we will make use of the latest GADM data, split by level: https://gadm.org/download_world.html

    The input is read from ``<data_path>/GADM36/gadm36_levels.gpkg`` and the
    output is written to ``<data_path>/cleaned_shapes`` (created if missing).

    Arguments:
        *data_path* : Base directory holding the ``GADM36`` folder.

    Optional Arguments:
        *regionalized*  : Default is **False**, which writes ``global_countries.gpkg``
        (layer ``level0``, Antarctica excluded). Set to **True** to write
        ``global_regions.gpkg`` instead; this requires the country file to exist already.

        *assigned_level* : Default is **1**. GADM layer (``level<assigned_level>``) used for
        the regions file. Countries absent from that layer are appended from the
        country file; for levels 1 to 5 they get a ``GID_<assigned_level>`` of the form
        ``<GID_0>.0_1`` (one ``.0`` per level).

    Returns:
        None. If *regionalized* is set but the country file does not exist, an
        error message is printed and nothing is written.
    """

    gadm_path = os.path.join(data_path, 'GADM36', 'gadm36_levels.gpkg')
    cleaned_shapes_path = os.path.join(data_path, 'cleaned_shapes')
    glob_ctry_path = os.path.join(cleaned_shapes_path,
                                  'global_countries.gpkg')

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(cleaned_shapes_path, exist_ok=True)

    if not regionalized:
        # load country file
        gadm_level0 = _load_gadm_layer_as_pygeos(gadm_path, 'level0')

        # remove antarctica, no roads there anyways; copy so that the column
        # assignments below act on an independent frame rather than a slice
        gadm_level0 = gadm_level0.loc[~gadm_level0['NAME_0'].
                                      isin(['Antarctica'])].copy()

        _clean_gadm_geometries(gadm_level0)
        _geometries_to_shapely(gadm_level0)

        # save to new country file
        geopandas.GeoDataFrame(gadm_level0).to_file(glob_ctry_path,
                                                    layer='level0',
                                                    driver="GPKG")
        return None

    # the regions file is dependent on the country file, so check whether
    # that one is already created
    if not os.path.exists(glob_ctry_path):
        print('ERROR: You need to create the country file first')
        return None
    gadm_level0 = geopandas.read_file(glob_ctry_path, layer='level0')

    # load region file
    gadm_level_x = _load_gadm_layer_as_pygeos(
        gadm_path, 'level{}'.format(assigned_level))
    _clean_gadm_geometries(gadm_level_x)

    # add some missing geometries from countries with no subregions
    missing_gids = set(gadm_level0.GID_0.unique()).difference(
        gadm_level_x.GID_0.unique())

    # TODO: GID_2 and lower tiers should first be filled by a tier above,
    # rather than by the country file
    # copy so the new GID column is added to an independent frame
    mis_country = gadm_level0.loc[gadm_level0['GID_0'].isin(
        list(missing_gids))].copy()

    # placeholder region id: one ".0" per administrative level, then "_1"
    # (e.g. level 2 -> "<GID_0>.0.0_1"); other levels get no extra column
    if assigned_level in range(1, 6):
        mis_country['GID_{}'.format(assigned_level)] = (
            mis_country['GID_0'] + '.0' * assigned_level + '_1')

    _geometries_to_shapely(gadm_level_x)

    # concat missing country to gadm levels; ignore_index already yields a
    # fresh 0..n-1 index
    gadm_level_x = geopandas.GeoDataFrame(
        pandas.concat([gadm_level_x, mis_country], ignore_index=True))

    # save to new region file
    gadm_level_x.to_file(os.path.join(cleaned_shapes_path,
                                      'global_regions.gpkg'),
                         layer='level{}'.format(assigned_level),
                         driver="GPKG")