def test_distance_nan():
    actual = pygeos.distance(
        np.array([point, np.nan, np.nan, point, None, None, point]),
        np.array([np.nan, point, np.nan, None, point, None, point]),
    )
    assert actual[-1] == 0.0
    assert np.isnan(actual[:-1].astype(np.float64)).all()
Example #2
def boundary_distance(polygon, points):
    """
    Find the distance between a polygon's boundary and an
    array of points.

    Uses either `shapely` or `pygeos` (5-10x faster) as a backend.

    Parameters
    -------------
    polygon : shapely.geometry.Polygon
      Polygon to query
    points : (n, 2) float
      2D points

    Returns
    ------------
    distance : (n,) float
      Minimum distance from each point to polygon boundary
    """

    try:
        import pygeos
        # the pygeos way is 5-10x faster
        pg_points = pygeos.points(*points.T)
        pg_boundary = pygeos.boundary(pygeos.Geometry(polygon.wkt))
        distance = pygeos.distance(pg_boundary, pg_points)
    except ImportError:
        # in pure shapely we have to loop
        boundary = polygon.boundary
        distance = np.array([
            boundary.distance(p) for p in MultiPoint(points).geoms])

    return distance
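A minimal usage sketch (hypothetical inputs; assumes numpy and shapely are installed and that boundary_distance is in scope):

import numpy as np
from shapely.geometry import Polygon

# unit square; one point inside, one outside
square = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
pts = np.array([[0.5, 0.5], [2.0, 0.5]])
print(boundary_distance(square, pts))  # approximately [0.5, 1.0]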
Example #3
def calculate_sinuosity(geometries):
    """Calculate sinuosity of the line.

    This is the length of the line divided by the distance between the endpoints of the line.
    By definition, it is always >=1.

    Parameters
    ----------
    geometries : Series or ndarray of pygeos geometries

    Returns
    -------
    Series or ndarray
        sinuosity values
    """

    # By definition, sinuosity should not be less than 1
    first = pg.get_point(geometries, 0)
    last = pg.get_point(geometries, -1)
    straight_line_distance = pg.distance(first, last)

    sinuosity = np.ones((len(geometries), )).astype("float32")

    # if there is no straight line distance there can be no sinuosity
    ix = straight_line_distance > 0

    # by definition, all values must be at least 1, so clip lower bound
    sinuosity[ix] = (pg.length(geometries[ix]) /
                     straight_line_distance[ix]).clip(1)

    if isinstance(geometries, pd.Series):
        return pd.Series(sinuosity, index=geometries.index)

    return sinuosity
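A quick sanity check, assuming pygeos is imported as pg: a straight line has sinuosity 1, while an L-shaped line has length 2 over a straight-line distance of sqrt(2).

import numpy as np
import pandas as pd
import pygeos as pg

lines = pd.Series([
    pg.linestrings([(0, 0), (2, 0)]),          # straight: sinuosity 1.0
    pg.linestrings([(0, 0), (1, 0), (1, 1)]),  # L-shape: 2 / sqrt(2) ~ 1.414
])
print(calculate_sinuosity(lines))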
Example #4
def naive_compute_distance_similarity_matrix(sorted_detections, ground_truths):
    """Computes a similarity based on the Euclidean distance between all pairs of geometries in a naive fashion.

    Args:
        sorted_detections (ndarray, list) : A ndarray of detections stored as:

                * Bounding boxes for a given class where each row is a detection stored as:
                  ``[BoundingBox, confidence]``
                * Polygons for a given class where each row is a detection stored as:
                  ``[Polygon, confidence]``
                * Points for a given class where each row is a detection stored as:
                  ``[Point, confidence]``

        ground_truths (ndarray, list) : A ndarray of ground truths stored as:

            * Bounding boxes for a given class where each row is a ground truth stored as:
              ``[BoundingBox]``
            * Polygons for a given class where each row is a ground truth stored as:
              ``[Polygon]``
            * Points for a given class where each row is a ground truth stored as:
              ``[Point]``

    Returns:
        ndarray : A similarity matrix of shape (#detections, #ground truths)
    """

    # We prepare the distance matrix (#detection, #gt)
    distance_matrix = np.zeros((sorted_detections.shape[0], len(ground_truths)))

    # Naive iterative distance matrix construction (Note: we iterate over the sorted detections)
    for k, ground_truth in enumerate(ground_truths):
        for m, detection in enumerate(sorted_detections):
            distance_matrix[m, k] = distance(centroid(detection[0]), centroid(ground_truth[0]))
    return 1 - distance_matrix
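A toy call, hedged: the free centroid and distance helpers used above are assumed here to behave like pygeos.centroid and pygeos.distance.

import numpy as np
import pygeos
from pygeos import centroid, distance  # stand-ins for the module's helpers

# detections: [geometry, confidence]; ground truths: [geometry]
detections = np.array(
    [[pygeos.points(0, 0), 0.9], [pygeos.points(0, 1), 0.4]], dtype=object
)
ground_truths = np.array([[pygeos.points(0, 0)]], dtype=object)

print(naive_compute_distance_similarity_matrix(detections, ground_truths))
# distances of 0 and 1 give similarities of 1 and 0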
Example #5
    def time_tree_nearest_all_poly_python(self):
        # returns all input points

        # use an arbitrary search tolerance that seems appropriate for the density of
        # geometries
        tolerance = 200
        b = pygeos.buffer(self.points, tolerance, quadsegs=1)
        left, right = self.tree.query_bulk(b)
        dist = pygeos.distance(self.points.take(left),
                               self.polygons.take(right))

        # sort by left, distance
        ix = np.lexsort((right, dist, left))
        left = left[ix]
        right = right[ix]
        dist = dist[ix]

        run_start = np.r_[True, left[:-1] != left[1:]]
        run_counts = np.diff(np.r_[np.nonzero(run_start)[0], left.shape[0]])

        mins = dist[run_start]

        # spread to rest of array so we can extract out all within each group that match
        all_mins = np.repeat(mins, run_counts)
        ix = dist == all_mins
        left = left[ix]
        right = right[ix]
        dist = dist[ix]
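The run_start / run_counts block above is a generic numpy trick for extracting per-group minima (including ties) from arrays already sorted by (group, value); a standalone sketch:

import numpy as np

# already sorted by (left, dist)
left = np.array([0, 0, 0, 1, 1])
dist = np.array([1.0, 1.0, 3.0, 2.0, 5.0])

run_start = np.r_[True, left[:-1] != left[1:]]              # first row of each group
run_counts = np.diff(np.r_[np.nonzero(run_start)[0], left.shape[0]])
mins = dist[run_start]                                      # per-group minimum
all_mins = np.repeat(mins, run_counts)                      # spread back over rows
keep = dist == all_mins                                     # keep every tie for the min
print(left[keep], dist[keep])                               # [0 0 1] [1. 1. 2.]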
Example #6
    def _op(self, candidates, x, y):
        point_x = as_points(x)
        point_y = as_points(y)

        areas = np.inf * np.ones((len(x), len(y)))

        areas[candidates[:, 0], candidates[:, 1]] = distance(
            point_x[candidates[:, 0]], point_y[candidates[:, 1]]
        ).squeeze()

        return areas
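A standalone sketch of the candidate-pair fill pattern used by _op, substituting pygeos.points for the module's as_points helper:

import numpy as np
import pygeos
from pygeos import distance

x = np.array([[0.0, 0.0], [5.0, 5.0]])
y = np.array([[0.0, 1.0], [9.0, 9.0]])
candidates = np.array([[0, 0], [1, 1]])  # only these (i, j) pairs get evaluated

point_x = pygeos.points(x)
point_y = pygeos.points(y)
areas = np.inf * np.ones((len(x), len(y)))
areas[candidates[:, 0], candidates[:, 1]] = distance(
    point_x[candidates[:, 0]], point_y[candidates[:, 1]]
)
print(areas)  # finite distances at candidate pairs, inf everywhere else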
Example #7
    def time_tree_nearest_points_equidistant_manual_all(self):
        # This benchmark approximates nearest_all for equidistant results
        # starting from singular nearest neighbors and searching for more
        # within same distance.

        # try to find all equidistant neighbors ourselves given single nearest
        # result
        l, r = self.grid_point_tree.nearest(self.grid_points)
        # calculate distance to nearest neighbor
        dist = pygeos.distance(
            self.grid_points.take(l), self.grid_point_tree.geometries.take(r)
        )
        # include a slight epsilon to ensure nearest are within this radius
        b = pygeos.buffer(self.grid_points, dist + 1e-8)

        # query the tree for others in the same buffer distance
        left, right = self.grid_point_tree.query_bulk(b, predicate="intersects")
        dist = pygeos.distance(
            self.grid_points.take(left), self.grid_point_tree.geometries.take(right)
        )

        # sort by left, distance
        ix = np.lexsort((right, dist, left))
        left = left[ix]
        right = right[ix]
        dist = dist[ix]

        run_start = np.r_[True, left[:-1] != left[1:]]
        run_counts = np.diff(np.r_[np.nonzero(run_start)[0], left.shape[0]])

        mins = dist[run_start]

        # spread to rest of array so we can extract out all within each group that match
        all_mins = np.repeat(mins, run_counts)
        ix = dist == all_mins
        left = left[ix]
        right = right[ix]
        dist = dist[ix]
Example #8
    def dist_F_vectorized(self, road, i, points_array):
        ml = []
        for c in points_array:
            if LSDisplacer.DIST == 'MIN':
                m = LSDisplacer._multiline_from_points(c, self.talus_lengths)
            else:
                m = LSDisplacer._lines_from_points(c, self.talus_lengths)
            ml.append(m)
        ml = np.array(ml)
        dists = pygeos.distance(road, ml)
        if LSDisplacer.DIST != 'MIN':
            dists = dists.mean(axis=1)
        dists = np.where(dists > self.buffers[i], 0., self.buffers[i] - dists)
        return dists
Example #9
def nearest_network_node_list(gdf_admin, gdf_nodes, sg):
    """Find the nearest network node for each administrative area.

    Args:
        gdf_admin (GeoDataFrame): areas with ``name`` and pygeos ``centroid`` attributes
        gdf_nodes (GeoDataFrame): network nodes with ``id`` and ``geometry`` columns
        sg (igraph.Graph): network graph whose vertex ``name`` attribute holds node ids

    Returns:
        dict: mapping of each admin name to the id of its nearest network node
    """
    gdf_nodes = gdf_nodes.loc[gdf_nodes.id.isin(sg.vs['name'])]
    gdf_nodes.reset_index(drop=True, inplace=True)
    nodes = {}
    for admin_ in gdf_admin.itertuples():
        nodes[admin_.name] = gdf_nodes.iloc[pygeos.distance(
            (admin_.centroid), gdf_nodes.geometry).idxmin()].id
    return nodes
Example #10
def near(source, target, distance):
    """Return target geometries within distance of source geometries.

    Only returns records from source that intersected at least one feature in target.

    Parameters
    ----------
    source : Series
        contains pygeos geometries
    target : Series
        contains target pygeos geometries to search against
    distance : number or ndarray
        radius within which to find target geometries.
        If ndarray, must be equal length to source.

    Returns
    -------
    DataFrame
        indexed on original index of source
        includes distance
    """

    # Get all indices from target_values that intersect buffers of input geometry
    idx = sjoin_geometry(pg.buffer(source, distance), target)
    hits = (pd.DataFrame(idx).join(source.rename("geometry"),
                                   how="inner").join(
                                       target.rename("geometry_right"),
                                       on="index_right",
                                       how="inner"))
    # the join above resets the index name when hits is empty, causing downstream problems; restore it
    if not len(hits):
        hits.index.name = idx.index.name

    hits["distance"] = pg.distance(hits.geometry,
                                   hits.geometry_right).astype("float32")

    return (hits.drop(columns=["geometry", "geometry_right"]).rename(
        columns={
            "index_right": target.index.name or "index_right"
        }).sort_values(by="distance"))
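The core of near, sketched standalone with a brute-force intersects test standing in for sjoin_geometry:

import numpy as np
import pandas as pd
import pygeos as pg

source = pd.Series(pg.points([[0, 0], [10, 10]]))
target = pd.Series(pg.points([[0, 1], [50, 50]]))

# buffer sources by the search radius, test all pairs, then measure distance
buffers = pg.buffer(source.values, 2)
hit = pg.intersects(buffers[:, None], target.values[None, :])
left, right = np.nonzero(hit)
dist = pg.distance(source.values.take(left), target.values.take(right))
print(left, right, dist)  # [0] [0] [1.]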
Example #11
def _extend_line(coords, target, tolerance, snap=True):
    """
    Extends a line geometry to snap on the target within a tolerance.
    """
    if snap:
        extrapolation = _get_extrapolated_line(
            coords[-4:] if len(coords.shape) == 1 else coords[-2:].flatten(),
            tolerance,
        )
        int_idx = target.sindex.query(extrapolation, predicate="intersects")
        intersection = pygeos.intersection(
            target.iloc[int_idx].geometry.values.data, extrapolation)
        if intersection.size > 0:
            if len(intersection) > 1:
                distances = [
                    pygeos.distance(p, pygeos.points(coords[-1]))
                    for p in intersection
                ]
                minimal = int(np.argmin(distances))
                new_point_coords = pygeos.get_coordinates(
                    intersection[minimal])

            else:
                new_point_coords = pygeos.get_coordinates(intersection[0])
            coo = np.append(coords, new_point_coords)
            new = np.reshape(coo, (int(len(coo) / 2), 2))

            return new
        return coords

    extrapolation = _get_extrapolated_line(
        coords[-4:] if len(coords.shape) == 1 else coords[-2:].flatten(),
        tolerance,
        point=True,
    )
    return np.vstack([coords, extrapolation])
Example #12
    def time_distance(self):
        pygeos.distance(self.points, self.polygon)
Example #13
def street_profile(streets, buildings, distance=3, tick_length=50):

    pygeos_lines = streets.geometry.values.data

    list_points = np.empty((0, 2))
    ids = []

    lengths = pygeos.length(pygeos_lines)
    for ix, (line, length) in enumerate(zip(pygeos_lines, lengths)):

        pts = pygeos.line_interpolate_point(
            line, np.linspace(0, length, num=int(length // distance))
        )
        list_points = np.append(list_points, pygeos.get_coordinates(pts), axis=0)
        ids += [ix] * len(pts) * 2


    ticks = []
    for num, pt in enumerate(list_points, 1):
        # start chainage 0
        if num == 1:
            angle = _getAngle(pt, list_points[num])
            line_end_1 = _getPoint1(pt, angle, tick_length / 2)
            angle = _getAngle(line_end_1, pt)
            line_end_2 = _getPoint2(line_end_1, angle, tick_length)
            ticks.append([line_end_1, pt])
            ticks.append([line_end_2, pt])

        # everything in between
        if num < len(list_points) - 1:
            angle = _getAngle(pt, list_points[num])
            line_end_1 = _getPoint1(
                list_points[num], angle, tick_length / 2
            )
            angle = _getAngle(line_end_1, list_points[num])
            line_end_2 = _getPoint2(line_end_1, angle, tick_length)
            ticks.append([line_end_1, list_points[num]])
            ticks.append([line_end_2, list_points[num]])

        # end chainage
        if num == len(list_points):
            angle = _getAngle(list_points[num - 2], pt)
            line_end_1 = _getPoint1(pt, angle, tick_length / 2)
            angle = _getAngle(line_end_1, pt)
            line_end_2 = _getPoint2(line_end_1, angle, tick_length)
            ticks.append([line_end_1, pt])
            ticks.append([line_end_2, pt])

    ticks = pygeos.linestrings(ticks)
    inp, res = pygeos.STRtree(ticks).query_bulk(buildings.geometry.values.data, predicate='intersects')
    intersections = pygeos.intersection(ticks[res], buildings.geometry.values.data[inp])
    distances = pygeos.distance(intersections, pygeos.points(list_points[res // 2]))

    dists = np.zeros((len(ticks),))
    dists[:] = np.nan
    dists[res] = distances

    ids = np.array(ids)
    widths = []
    openness = []
    deviations = []
    for i in range(len(streets)):
        f = ids == i
        s = dists[f]
        lefts = s[::2]
        rights = s[1::2]
        left_mean = np.nanmean(lefts) if ~np.isnan(lefts).all() else tick_length / 2
        right_mean = np.nanmean(rights) if ~np.isnan(rights).all() else tick_length / 2
        widths.append(np.mean([left_mean, right_mean]) * 2)
        openness.append(np.isnan(s).sum() / f.sum())
        deviations.append(np.nanstd(s))
    
    return (widths, deviations, openness)
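The sampling step at the top of street_profile can be sanity-checked on its own; a small sketch of line_interpolate_point over a single street:

import numpy as np
import pygeos

line = pygeos.linestrings([(0, 0), (100, 0)])
length = pygeos.length(line)  # 100.0
pts = pygeos.line_interpolate_point(
    line, np.linspace(0, length, num=int(length // 3))
)
print(pygeos.get_coordinates(pts)[:3])  # evenly spaced sample points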
Example #14
def test_distance_missing():
    actual = pygeos.distance(point, None)
    assert np.isnan(actual)
Example #15
def test_distance_empty():
    actual = pygeos.distance(point, empty)
    assert np.isnan(actual)
Example #16
    def __init__(self,
                 left,
                 right,
                 heights=None,
                 distance=10,
                 tick_length=50,
                 verbose=True):
        self.left = left
        self.right = right
        self.distance = distance
        self.tick_length = tick_length

        pygeos_lines = left.geometry.values.data

        list_points = np.empty((0, 2))
        ids = []
        end_markers = []

        lengths = pygeos.length(pygeos_lines)
        for ix, (line, length) in enumerate(zip(pygeos_lines, lengths)):

            pts = pygeos.line_interpolate_point(
                line, np.linspace(0, length, num=int((length) // distance)))
            list_points = np.append(list_points,
                                    pygeos.get_coordinates(pts),
                                    axis=0)
            if len(pts) > 1:
                ids += [ix] * len(pts) * 2
                markers = [True] + ([False] * (len(pts) - 2)) + [True]
                end_markers += markers
            elif len(pts) == 1:
                end_markers += [True]
                ids += [ix] * 2

        ticks = []
        for num, (pt, end) in enumerate(zip(list_points, end_markers), 1):
            if end:
                ticks.append([pt, pt])
                ticks.append([pt, pt])

            else:
                angle = self._getAngle(pt, list_points[num])
                line_end_1 = self._getPoint1(pt, angle, tick_length / 2)
                angle = self._getAngle(line_end_1, pt)
                line_end_2 = self._getPoint2(line_end_1, angle, tick_length)
                ticks.append([line_end_1, pt])
                ticks.append([line_end_2, pt])

        ticks = pygeos.linestrings(ticks)

        inp, res = right.sindex.query_bulk(ticks, predicate="intersects")
        intersections = pygeos.intersection(ticks[inp],
                                            right.geometry.values.data[res])
        distances = pygeos.distance(intersections,
                                    pygeos.points(list_points[inp // 2]))
        inp_uni, inp_cts = np.unique(inp, return_counts=True)
        splitter = np.cumsum(inp_cts)[:-1]
        dist_per_res = np.split(distances, splitter)
        inp_per_res = np.split(res, splitter)

        min_distances = []
        min_inds = []
        for dis, ind in zip(dist_per_res, inp_per_res):
            min_distances.append(np.min(dis))
            min_inds.append(ind[np.argmin(dis)])

        dists = np.zeros((len(ticks), ))
        dists[:] = np.nan
        dists[inp_uni] = min_distances

        if heights is not None:
            if isinstance(heights, str):
                heights = self.heights = right[heights]
            elif not isinstance(heights, pd.Series):
                heights = self.heights = pd.Series(heights)

            blgs = np.zeros((len(ticks), ))
            blgs[:] = None
            blgs[inp_uni] = min_inds
            do_heights = True
        else:
            do_heights = False

        ids = np.array(ids)
        widths = []
        openness = []
        deviations = []
        heights_list = []
        heights_deviations_list = []

        for i in range(len(left)):
            f = ids == i
            s = dists[f]
            lefts = s[::2]
            rights = s[1::2]
            left_mean = np.nanmean(
                lefts) if ~np.isnan(lefts).all() else tick_length / 2
            right_mean = (np.nanmean(rights)
                          if ~np.isnan(rights).all() else tick_length / 2)
            widths.append(np.mean([left_mean, right_mean]) * 2)
            openness.append(np.isnan(s).sum() / (f).sum())
            deviations.append(np.nanstd(s))

            if do_heights:
                b = blgs[f]
                h = heights.iloc[b[~np.isnan(b)].astype(int)]
                heights_list.append(h.mean())
                heights_deviations_list.append(h.std())

        self.w = pd.Series(widths, index=left.index)
        self.wd = pd.Series(deviations, index=left.index).fillna(
            0)  # fill for empty intersections
        self.o = pd.Series(openness, index=left.index).fillna(1)

        if do_heights:
            self.h = pd.Series(heights_list, index=left.index).fillna(
                0)  # fill for empty intersections
            self.hd = pd.Series(heights_deviations_list,
                                index=left.index).fillna(
                                    0)  # fill for empty intersections
            self.p = self.h / self.w.replace(0,
                                             np.nan)  # replace to avoid np.inf
Example #17
def test_distance():
    actual = pygeos.distance(*point_polygon_testdata)
    expected = [2 * 2**0.5, 2**0.5, 0, 0, 0, 2**0.5]
    np.testing.assert_allclose(actual, expected)
Example #18
def create_drain_points(flowlines, joins, waterbodies, wb_joins):
    """Create drain points from furthest downstream point of flowlines that overlap with waterbodies.

    WARNING: If multiple flowlines intersect at the drain point, there will be multiple drain points at the same location

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
        flowline joins
    waterbodies : GeoDataFrame
    wb_joins : DataFrame
        waterbody / flowline joins

    Returns
    -------
    GeoDataFrame
        Drain points dataframe
    """
    start = time()

    wb_atts = waterbodies[["altered", "km2", "flowlineLength"]].copy()

    tmp_flowlines = flowlines[[
        "geometry",
        "FCode",
        "FType",
        "MaxElevSmo",
        "MinElevSmo",
        "Slope",
        "TotDASqKm",
        "StreamOrde",
        "sizeclass",
        "HUC4",
        "loop",
    ]].rename(columns={
        "FCode": "lineFCode",
        "FType": "lineFType"
    })

    ### Find the downstream most point(s) on the flowline for each waterbody
    # This is used for snapping barriers, if possible.
    # Drop any where there is no flowline below the drain point (often pipelines
    # that were removed)
    tmp = wb_joins[["lineID", "wbID"]].set_index("lineID")
    drains = (joins.loc[joins.upstream_id.isin(wb_joins.lineID.unique())
                        & (joins.downstream_id != 0)].join(
                            tmp.wbID.rename("upstream_wbID"),
                            on="upstream_id").join(
                                tmp.wbID.rename("downstream_wbID"),
                                on="downstream_id"))

    # Only keep those that terminate outside the same waterbody as the upstream end
    drains = drains.loc[drains.upstream_wbID != drains.downstream_wbID].copy()

    # Join in stats from waterbodies and geometries from flowlines
    drain_pts = (wb_joins.loc[wb_joins.lineID.isin(
        drains.upstream_id.unique())].join(
            wb_atts,
            on="wbID",
        ).join(
            tmp_flowlines[["geometry", "loop", "TotDASqKm"]],
            on="lineID",
        ).reset_index(drop=True))

    # create a point from the last coordinate, which is the furthest one downstream
    drain_pts.geometry = pg.get_point(drain_pts.geometry.values.data, -1)

    # drop any that are downstream terminals; these are most likely waterbodies
    # that do not have further downstream networks (e.g., flow to ocean)
    ix = joins.loc[joins.upstream_id.isin(drain_pts.lineID)
                   & (joins.downstream_id == 0)].upstream_id
    drain_pts = drain_pts.loc[~drain_pts.lineID.isin(ix)].copy()

    ### Find all drain points that share the same geometry.
    # These are most likely multiple segments that terminate in same drain point,
    # so we need to assign them their common downstream ID instead so that
    # snapping dams to these works properly later (otherwise snapped to only one of segments)
    drain_pts["hash"] = pd.util.hash_array(
        pg.to_wkb(drain_pts.geometry.values.data))
    s = drain_pts.groupby("hash").size()
    ix = drain_pts.hash.isin(s[s > 1].index)
    if ix.sum():
        print(f"Deduplicating {ix.sum():,} duplicate drain points")
        # find downstream_id for each of these, and deduplicate if there are multiple
        # downstreams, favoring the non-loops
        j = (joins.loc[joins.upstream_id.isin(drain_pts.loc[ix].lineID)
                       & (joins.downstream_id != 0),
                       ["upstream_id", "downstream_id", "loop"], ].sort_values(
                           by=["upstream_id", "loop"], ascending=True).groupby(
                               "upstream_id").first().downstream_id)

        drain_pts = drain_pts.join(j, on="lineID")

        # for those at same location that share the same downstream line, use that line instead
        s = (drain_pts.loc[drain_pts.downstream_id.notnull()].groupby(
            "downstream_id").size())
        ix = drain_pts.downstream_id.isin(s[s > 1].index.astype("uint32"))
        drain_pts.loc[ix, "lineID"] = drain_pts.loc[ix].downstream_id.astype(
            "uint32")
        # update the line properties to match that lineID
        lids = drain_pts.loc[ix].lineID.values
        drain_pts.loc[ix, "flowlineLength"] = flowlines.loc[lids,
                                                            "length"].values
        drain_pts.loc[ix, "loop"] = flowlines.loc[lids].loop.values
        drain_pts.loc[ix, "TotDASqKm"] = flowlines.loc[lids].TotDASqKm.values
        drain_pts = drain_pts.drop(columns=["downstream_id"])

    # keep the first unique drain point and sort the rest so they are oriented
    # from upstream to downstream
    drain_pts = (drain_pts.drop(columns=["hash"]).groupby(
        ["lineID", "wbID"]).first().sort_values(by="TotDASqKm",
                                                ascending=True).reset_index())

    drain_pts = gp.GeoDataFrame(drain_pts,
                                geometry="geometry",
                                crs=flowlines.crs)

    ### Deduplicate drains by network topology
    # Find the downstream-most drains for waterbodies when there are multiple distinct ones per waterbody.
    # These may result from flowlines that cross in and out of waterbodies multiple
    # times (not valid), or there may be drains on downstream loops
    # (esp. at dams) (valid).

    dups = drain_pts.groupby("wbID").size() > 1
    if dups.sum():
        print(
            f"Found {dups.sum():,} waterbodies with multiple drain points; cleaning up"
        )
        # find all waterbodies that have duplicate drains
        ix = drain_pts.wbID.isin(dups[dups].index)
        wb_ids = drain_pts.loc[ix].wbID.unique()
        # find all corresponding line IDs for these waterbodies
        line_ids = wb_joins.loc[wb_joins.wbID.isin(wb_ids)].lineID.unique()
        lines_per_wb = (drain_pts.loc[drain_pts.wbID.isin(wb_ids)].groupby(
            "wbID").lineID.unique())
        # search within 20 degrees removed from ids; this hopefully
        # picks up any gaps where lines exit waterbodies for a ways then re-enter
        # some floodplain areas have very big loops outside waterbody
        pairs = find_joins(
            joins,
            line_ids,
            downstream_col="downstream_id",
            upstream_col="upstream_id",
            expand=20,
        )[["upstream_id", "downstream_id"]]

        # remove any terminal points
        pairs = pairs.loc[(pairs.upstream_id != 0)
                          & (pairs.downstream_id != 0)]

        # create a directed graph facing DOWNSTREAM
        graph = DirectedGraph(pairs,
                              source="upstream_id",
                              target="downstream_id")
        # find all lines that are upstream of other lines
        # these are "parents" in the directed graph
        upstreams = graph.find_all_parents(lines_per_wb.values)
        ix = pd.Series(upstreams).explode().dropna().unique()
        print(
            f"Dropping {len(ix):,} drains that are upstream of other drains in the same waterbody"
        )
        drain_pts = drain_pts.loc[~drain_pts.lineID.isin(ix)]

    ### check if drain points are on a loop and very close to the junction
    # of the loop and nonloop (e.g., Hoover Dam, HUC2 == 15)
    drain_pts["snap_to_junction"] = False
    drain_pts["snap_dist"] = 0

    drains_by_wb = drain_pts.groupby("wbID").size()
    multiple_drain_wb = drains_by_wb[drains_by_wb > 1].index

    # limit this to drain points on loops where there are multiple drains per waterbody
    loop_pts = drain_pts.loc[drain_pts.loop &
                             (drain_pts.wbID.isin(multiple_drain_wb))].copy()

    # search within 3 degrees removed from ids; this hopefully
    # picks up any downstream junctions
    pairs = find_joins(
        joins,
        loop_pts.lineID.unique(),
        downstream_col="downstream_id",
        upstream_col="upstream_id",
        expand=3,
    )[["upstream_id", "downstream_id"]]

    # drop endpoints
    pairs = pairs.loc[(pairs.upstream_id != 0)
                      & (pairs.downstream_id != 0)].copy()

    # find all junctions that have > 1 flowline upstream of them
    grouped = pairs.groupby("downstream_id").size()
    downstream_junctions = grouped[grouped > 1].index
    # extract upstream endpoint for each junction line
    downstream_junction_pts = pd.Series(
        pg.get_point(flowlines.loc[downstream_junctions].geometry.values.data,
                     0),
        index=downstream_junctions,
    )
    # find the nearest junctions within 5m tolerance of drain points on loops
    tree = pg.STRtree(downstream_junction_pts.values.data)
    left, right = tree.nearest_all(loop_pts.geometry.values.data,
                                   max_distance=5)

    # make sure they are connected on the network
    g = DirectedGraph(pairs, source="upstream_id", target="downstream_id")
    ix = g.is_reachable(loop_pts.iloc[left].lineID.values,
                        downstream_junction_pts.iloc[right].index)
    left = left[ix]
    right = right[ix]

    if len(left):
        print(
            f"Found {len(left)} drains on loops within 5m upstream of a junction, updating them..."
        )
        # NOTE: these are attributed to the flowline that is DOWNSTREAM of the junction point
        # whereas other drains are attributed to the flowline upstream of themselves
        ix = loop_pts.index.take(left)
        drain_pts.loc[ix, "snap_to_junction"] = True
        drain_pts.loc[ix, "snap_dist"] = pg.distance(
            drain_pts.loc[ix].geometry.values.data,
            downstream_junction_pts.iloc[right].values,
        )
        drain_pts.loc[ix, "lineID"] = downstream_junction_pts.iloc[right].index
        drain_pts.loc[ix,
                      "geometry"] = downstream_junction_pts.iloc[right].values

    ### Extract the drain points of upstream headwaters waterbodies
    # these are flowlines that originate at a waterbody
    wb_geom = waterbodies.loc[waterbodies.flowlineLength == 0].geometry
    wb_geom = pd.Series(wb_geom.values.data, index=wb_geom.index)
    # take only the upstream most point
    tmp_flowline_pts = tmp_flowlines[["geometry", "loop", "TotDASqKm"]].copy()
    tmp_flowline_pts["geometry"] = pg.get_point(flowlines.geometry.values.data,
                                                0)
    fl_pt = pd.Series(tmp_flowline_pts.geometry.values.data,
                      index=tmp_flowline_pts.index)
    headwaters = (sjoin_geometry(
        wb_geom, fl_pt, predicate="intersects").rename("lineID").reset_index())
    headwaters = (headwaters.join(
        wb_atts,
        on="wbID",
    ).join(
        tmp_flowline_pts,
        on="lineID",
    ).reset_index(drop=True))
    headwaters["headwaters"] = True
    headwaters["snap_to_junction"] = False
    headwaters["snap_dist"] = 0
    print(
        f"Found {len(headwaters):,} headwaters waterbodies, adding drain points for these too"
    )

    drain_pts["headwaters"] = False
    drain_pts = pd.concat([drain_pts, headwaters], sort=False,
                          ignore_index=True).reset_index(drop=True)

    # join in line properties
    drain_pts = drain_pts.drop(columns=["loop", "TotDASqKm"]).join(
        tmp_flowlines.drop(columns=["geometry"]), on="lineID")

    # calculate unique index
    huc_id = drain_pts["HUC4"].astype("uint16") * 1000000
    drain_pts["drainID"] = drain_pts.index.values.astype("uint32") + huc_id

    # Convert back to GeoDataFrame; above steps make it into a DataFrame
    drain_pts = gp.GeoDataFrame(drain_pts,
                                geometry="geometry",
                                crs=flowlines.crs)
    drain_pts.wbID = drain_pts.wbID.astype("uint32")
    drain_pts.lineID = drain_pts.lineID.astype("uint32")
    drain_pts.flowlineLength = drain_pts.flowlineLength.astype("float32")

    print("Done extracting {:,} waterbody drain points in {:.2f}s".format(
        len(drain_pts),
        time() - start))

    return drain_pts
Example #19
    # some drains are at exact same point as extracted flowline crossing point
    tmp["same_subnet"] = tmp.lineID == tmp.drainLineID
    ix = ~tmp.same_subnet
    tmp.loc[ix, "same_subnet"] = g.is_reachable(
        tmp.loc[ix].lineID.values, tmp.loc[ix].drainLineID.values, 4
    )
    # try from other direction
    ix = ~tmp.same_subnet
    tmp.loc[ix, "same_subnet"] = g.is_reachable(
        tmp.loc[ix].drainLineID.values, tmp.loc[ix].lineID.values, 4
    )

    tmp = tmp.loc[tmp.same_subnet].copy()
    # take the closest drain to the crossing point if there are multiple on the
    # same flowline
    tmp["dist"] = pg.distance(tmp.geometry.values.data, tmp.pt.values.data)
    use_drains = (
        tmp.sort_values(by=["damPtID", "dist"], ascending=True)
        .drop(columns=["same_subnet", "dist", "pt", "lineID"])
        .groupby("damPtID")
        .first()
    )

    dams = dams.join(
        use_drains[["drainID", "wbID", "drainLineID", "geometry"]].rename(
            columns={"geometry": "drain"}
        )
    )
    ix = dams.drainID.notnull()
    print(
        f"Found {ix.sum():,} dams associated with waterbodies in {time() - join_start:,.2f}s"
    )
Example #20
def snap_to_large_waterbodies(df, to_snap):
    """Snap to nearest large waterbody.

    NOTE: only run this on dams that could not snap to flowlines, to avoid
    moving them far away.

    This captures large dam centerpoints that are not near enough to flowlines.

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """
    wb = from_geofeather(nhd_dir / "merged" / "large_waterbodies.feather").set_index(
        "wbID"
    )
    drains = (
        from_geofeather(nhd_dir / "merged" / "large_waterbody_drain_points.feather")
        .rename(columns={"id": "drainID"})
        .set_index("drainID")
    )

    near_wb = nearest(to_snap.geometry, pg.boundary(wb.geometry), NEAR_WB_TOLERANCE)
    near_wb = (
        pd.DataFrame(near_wb)
        .join(to_snap.geometry)
        .join(
            drains.reset_index()
            .set_index("wbID")[["geometry", "drainID", "lineID"]]
            .rename(columns={"geometry": "drain"}),
            on="wbID",
        )
        .dropna(subset=["drain"])
    )
    near_wb["snap_dist"] = pg.distance(near_wb.geometry, near_wb.drain)

    # drop any that are > 250 m away, these aren't useful
    near_wb = near_wb.loc[near_wb.snap_dist <= WB_DRAIN_MAX_TOLERANCE].copy()

    # take the closest drain point
    near_wb = near_wb.sort_values(by="snap_dist").groupby(level=0).first()

    ix = near_wb.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_wb.drain
    df.loc[ix, "snap_dist"] = near_wb.snap_dist
    df.loc[ix, "snap_ref_id"] = near_wb.drainID
    df.loc[ix, "lineID"] = near_wb.lineID
    df.loc[ix, "wbID"] = near_wb.wbID

    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        WB_DRAIN_MAX_TOLERANCE,
        "m tolerance of drain point of large waterbody that is within ",
        NEAR_WB_TOLERANCE,
        "m of dam",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within {}m of large waterbodies and within {}m of the drain point of those waterbodies".format(
            len(near_wb), NEAR_WB_TOLERANCE, WB_DRAIN_MAX_TOLERANCE
        )
    )

    return df, to_snap
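These snapping helpers are designed to be chained: each records results on df and returns the shrinking to_snap remainder. A hedged sketch of that loop, using the function names from this page (the surrounding pipeline wiring is an assumption):

def snap_dams(df, to_snap):
    # each stage marks what it could snap and passes the rest along
    df, to_snap = snap_to_nhd_dams(df, to_snap)
    df, to_snap = snap_to_waterbodies(df, to_snap)
    df, to_snap = snap_to_large_waterbodies(df, to_snap)
    return df, to_snap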
Example #21
def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):
    """
    Extract flowlines from the NHDPlusHR data product, join them to the VAA
    table, and filter out coastlines.
    Also extracts the joins between flowlines, with coastline joins removed.

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs: GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.
    extra_flowline_cols : list
        List of extra field names to extract from the NHDFlowline layer

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (flowlines, joins)
    """

    ### Read in flowline data and convert to data frame
    print("Reading flowlines")
    flowline_cols = FLOWLINE_COLS + extra_flowline_cols
    df = read_dataframe(
        gdb_path, layer="NHDFlowline", force_2d=True, columns=flowline_cols,
    )

    # Index on NHDPlusID for easy joins to other NHD data
    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df = df.set_index(["NHDPlusID"], drop=False)

    # convert MultiLineStrings to LineStrings (all have a single linestring)
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)

    print("making valid and projecting to target projection")
    df.geometry = make_valid(df.geometry.values.data)
    df = df.to_crs(target_crs)
    print(f"Read {len(df):,} flowlines")

    ### Read in VAA and convert to data frame
    # NOTE: not all records in Flowlines have corresponding records in VAA
    # we drop those that do not since we need these fields.
    print("Reading VAA table and joining...")
    vaa_df = read_dataframe(gdb_path, layer="NHDPlusFlowlineVAA", columns=VAA_COLS)

    vaa_df.NHDPlusID = vaa_df.NHDPlusID.astype("uint64")
    vaa_df = vaa_df.set_index(["NHDPlusID"])
    df = df.join(vaa_df, how="inner")
    print(f"{len(df):,} features after join to VAA")

    # Simplify data types for smaller files and faster IO
    df.FType = df.FType.astype("uint16")
    df.FCode = df.FCode.astype("uint16")
    df.StreamOrde = df.StreamOrde.astype("uint8")
    df.Slope = df.Slope.astype("float32")
    df.MinElevSmo = df.MinElevSmo.astype("float32")
    df.MaxElevSmo = df.MaxElevSmo.astype("float32")

    ### Read in flowline joins
    print("Reading flowline joins")
    join_df = read_dataframe(
        gdb_path,
        layer="NHDPlusFlow",
        read_geometry=False,
        columns=["FromNHDPID", "ToNHDPID"],
    ).rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
    join_df.upstream = join_df.upstream.astype("uint64")
    join_df.downstream = join_df.downstream.astype("uint64")

    ### Fix errors in NHD
    # some valid joins are marked as terminals (downstream==0) in NHD; we need
    # to backfill the missing join info.
    # To do this, we intersect all terminals back with flowlines dropping any
    # that are themselves terminals.  Then we calculate the distance to the upstream
    # point of the intersected line, and the upstream point of the next segment
    # downstream.  We use the ID of whichever one is closer (must be within 100m).
    ix = join_df.loc[join_df.downstream == 0].upstream.unique()
    # get last point, is furthest downstream
    tmp = df.loc[df.index.isin(ix), ["geometry"]].copy()
    tmp["geometry"] = pg.get_point(tmp.geometry.values.data, -1)

    target = df.loc[~df.index.isin(ix)]

    # only search against other flowlines
    tree = pg.STRtree(target.geometry.values.data)
    # search within a tolerance of 0.001, these are very very close
    left, right = tree.nearest_all(tmp.geometry.values.data, max_distance=0.001)

    pairs = pd.DataFrame(
        {
            "left": tmp.index.take(left),
            "right": target.index.take(right),
            "source": tmp.geometry.values.data.take(left),
            # take upstream / downstream points of matched lines
            "upstream_target": pg.get_point(df.geometry.values.data.take(right), 0),
        }
    )

    # drop any pairs where the other side is also a terminal (these appear as
    # V shaped tiny networks that need to be left as is)
    pairs = pairs.loc[~pairs.right.isin(ix)]

    # calculate the next segment downstream (only keep the first if multiple; possible logic issue)
    next_downstream = (
        join_df.loc[(join_df.upstream != 0) & (join_df.downstream != 0)]
        .groupby("upstream")
        .downstream.first()
    )
    pairs["next_downstream"] = pairs.right.map(next_downstream)
    pairs.loc[pairs.next_downstream.notnull(), "downstream_target"] = pg.get_point(
        df.loc[
            pairs.loc[pairs.next_downstream.notnull()].next_downstream
        ].geometry.values.data,
        0,
    )

    pairs["upstream_dist"] = pg.distance(pairs.source, pairs.upstream_target)
    ix = pairs.next_downstream.notnull()
    pairs.loc[ix, "downstream_dist"] = pg.distance(
        pairs.loc[ix].source, pairs.loc[ix].downstream_target
    )

    # this ignores any nan
    pairs["dist"] = pairs[["upstream_dist", "downstream_dist"]].min(axis=1)
    # discard any that are too far (>100m)
    pairs = pairs.loc[pairs.dist <= 100].copy()

    # sort by distance to upstream point of matched flowline; this allows us
    # to sort on those then dedup to calculate a new downstream ID for this source line
    pairs = pairs.sort_values(by=["left", "dist"])

    # set the right value to the next downstream if it is closer
    # this also ignores na
    ix = pairs.downstream_dist < pairs.upstream_dist
    pairs.loc[ix, "right"] = pairs.loc[ix].next_downstream.astype("uint64")

    ids = pairs.groupby("left").right.first()

    if len(ids):
        # save to send to NHD
        pd.DataFrame({"NHDPlusID": ids.index.unique()}).to_csv(
            f"/tmp/{gdb_path.stem}_bad_joins.csv", index=False
        )

        ix = join_df.upstream.isin(ids.index)
        join_df.loc[ix, "downstream"] = join_df.loc[ix].upstream.map(ids)

        print(
            f"Repaired {len(ids):,} joins marked by NHD as terminals but actually joined to flowlines"
        )

    # set join types to make it easier to track
    join_df["type"] = "internal"  # set default
    # upstream-most origin points
    join_df.loc[join_df.upstream == 0, "type"] = "origin"
    # downstream-most termination points
    join_df.loc[join_df.downstream == 0, "type"] = "terminal"

    ### Filter out coastlines and update joins
    # WARNING: we tried filtering out pipelines (FType == 428).  It doesn't work properly;
    # there are many that go through dams and are thus needed to calculate
    # network connectivity and gain of removing a dam.
    print("Filtering out coastlines...")
    coastline_idx = df.loc[df.FType == 566].index
    df = df.loc[~df.index.isin(coastline_idx)].copy()
    print(f"{len(df):,} features after removing coastlines")

    # remove any joins that have coastlines as upstream
    # these are themselves coastline segments
    join_df = join_df.loc[~join_df.upstream.isin(coastline_idx)].copy()

    # set the downstream to 0 for any that join coastlines
    # this will enable us to mark these as downstream terminals in
    # the network analysis later
    join_df["marine"] = join_df.downstream.isin(coastline_idx)
    join_df.loc[join_df.marine, "downstream"] = 0
    join_df.loc[join_df.marine, "type"] = "terminal"

    # drop any duplicates (above operation sets some joins to upstream and downstream of 0)
    join_df = join_df.drop_duplicates(subset=["upstream", "downstream"])

    ### Filter out underground connectors
    ix = df.loc[df.FType == 420].index
    print("Removing {:,} underground conduits".format(len(ix)))
    df = df.loc[~df.index.isin(ix)].copy()
    join_df = remove_joins(
        join_df, ix, downstream_col="downstream", upstream_col="upstream"
    )

    ### Label loops for easier removal later
    # WARNING: loops may be very problematic from a network processing standpoint.
    # Include with caution.
    print("Identifying loops")
    df["loop"] = (df.StreamOrde != df.StreamCalc) | (df.FlowDir.isnull())

    idx = df.loc[df.loop].index
    join_df["loop"] = join_df.upstream.isin(idx) | join_df.downstream.isin(idx)

    ### Add calculated fields
    # Set our internal master IDs to the original index of the file we start from
    # Assume that we can always fit into a uint32, which is ~400 million records
    # and probably bigger than anything we could ever read in
    df["lineID"] = df.index.values.astype("uint32") + 1
    join_df = (
        join_df.join(df.lineID.rename("upstream_id"), on="upstream")
        .join(df.lineID.rename("downstream_id"), on="downstream")
        .fillna(0)
    )

    for col in ("upstream", "downstream"):
        join_df[col] = join_df[col].astype("uint64")

    for col in ("upstream_id", "downstream_id"):
        join_df[col] = join_df[col].astype("uint32")

    ### Calculate size classes
    print("Calculating size class")
    drainage = df.TotDASqKm
    df.loc[drainage < 10, "sizeclass"] = "1a"
    df.loc[(drainage >= 10) & (drainage < 100), "sizeclass"] = "1b"
    df.loc[(drainage >= 100) & (drainage < 518), "sizeclass"] = "2"
    df.loc[(drainage >= 518) & (drainage < 2590), "sizeclass"] = "3a"
    df.loc[(drainage >= 2590) & (drainage < 10000), "sizeclass"] = "3b"
    df.loc[(drainage >= 10000) & (drainage < 25000), "sizeclass"] = "4"
    df.loc[drainage >= 25000, "sizeclass"] = "5"

    # Calculate length and sinuosity
    print("Calculating length and sinuosity")
    df["length"] = df.geometry.length.astype("float32")
    df["sinuosity"] = calculate_sinuosity(df.geometry.values.data).astype("float32")

    # drop columns not useful for later processing steps
    df = df.drop(columns=["FlowDir", "StreamCalc"])

    # calculate incoming joins (have valid upstream, but not in this HUC4)
    join_df.loc[(join_df.upstream != 0) & (join_df.upstream_id == 0), "type"] = "huc_in"

    return df, join_df
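A hypothetical invocation (the geodatabase path is a placeholder; EPSG:5070 is just one example of a planar CRS, per the docstring):

from pathlib import Path

flowlines, joins = extract_flowlines(
    Path("data/NHDPLUS_H_0101_HU4_GDB.gdb"),  # placeholder HUC4 geodatabase
    target_crs="EPSG:5070",
)
print(len(flowlines), len(joins))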
Example #22
def snap_to_nhd_dams(df, to_snap):
    """Attempt to snap points from to_snap to NHD dams.

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """

    print("Snapping to NHD dams...")
    # NOTE: id is not unique for points
    nhd_dams_poly = (
        from_geofeather(nhd_dir / "merged" / "nhd_dams_poly.feather")
        .rename(columns={"id": "damID"})
        .set_index("damID")
        .drop(columns=["index"], errors="ignore")
    )
    nhd_dams = (
        from_geofeather(nhd_dir / "merged" / "nhd_dams_pt.feather")
        .rename(columns={"id": "damID"})
        .set_index("damID")
        .drop(columns=["index"], errors="ignore")
    )
    # set nulls back to na
    nhd_dams.wbID = nhd_dams.wbID.replace(-1, np.nan)

    ### Find dams that are really close (50m) to NHD dam polygons
    # Those that have multiple dams nearby are usually part of a dam complex
    snap_start = time()
    near_nhd = nearest(
        to_snap.geometry, nhd_dams_poly.geometry, distance=NHD_DAM_TOLERANCE
    )[["damID"]]

    # snap to nearest dam point for that dam (some are > 1 km away)
    # NOTE: this will create multiple entries for some dams
    near_nhd = near_nhd.join(to_snap.geometry.rename("source_pt")).join(
        nhd_dams, on="damID"
    )
    near_nhd["snap_dist"] = pg.distance(near_nhd.geometry, near_nhd.source_pt)
    near_nhd = (
        near_nhd.reset_index().sort_values(by=["id", "snap_dist"]).groupby("id").first()
    )

    ix = near_nhd.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_nhd.geometry
    df.loc[ix, "snap_dist"] = near_nhd.snap_dist
    df.loc[ix, "snap_ref_id"] = near_nhd.damID
    df.loc[ix, "lineID"] = near_nhd.lineID
    df.loc[ix, "wbID"] = near_nhd.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ", NHD_DAM_TOLERANCE, "m of NHD dam polygon"
    )
    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()
    print(
        "Snapped {:,} dams to NHD dam polygons in {:.2f}s".format(
            len(ix), time() - snap_start
        )
    )

    ### Find dams that are close (within snapping tolerance) of NHD dam points
    snap_start = time()
    tmp = nhd_dams.reset_index()  # reset index so we have unique index to join on
    near_nhd = nearest(
        to_snap.geometry, tmp.geometry, distance=to_snap.snap_tolerance
    ).rename(columns={"distance": "snap_dist"})

    near_nhd = near_nhd.join(to_snap.geometry.rename("source_pt")).join(
        tmp, on="index_right"
    )
    near_nhd = (
        near_nhd.reset_index().sort_values(by=["id", "snap_dist"]).groupby("id").first()
    )

    ix = near_nhd.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_nhd.geometry
    df.loc[ix, "snap_dist"] = near_nhd.snap_dist
    df.loc[ix, "snap_ref_id"] = near_nhd.damID
    df.loc[ix, "lineID"] = near_nhd.lineID
    df.loc[ix, "wbID"] = near_nhd.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of NHD dam point but >",
        NHD_DAM_TOLERANCE,
        "m from NHD dam polygon",
    )
    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()
    print(
        "Snapped {:,} dams to NHD dam points in {:.2f}s".format(
            len(ix), time() - snap_start
        )
    )

    ### TODO: identify any NHD dam points that didn't get claimed  (need to do this after snapping others)

    return df, to_snap
Example #23
def snap_to_waterbodies(df, to_snap):
    """Attempt to snap points from to_snap to waterbody drain points.

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """

    ### Attempt to snap to waterbody drain points for major waterbodies
    # Use larger tolerance for larger waterbodies
    print("Snapping to waterbodies and drain points...")
    wb = from_geofeather(nhd_dir / "merged" / "waterbodies.feather").set_index("wbID")
    drains = (
        from_geofeather(nhd_dir / "merged" / "waterbody_drain_points.feather")
        .rename(columns={"id": "drainID"})
        .set_index("drainID")
    )

    ### First pass - find the dams that are contained by waterbodies
    contained_start = time()

    in_wb = sjoin(to_snap, wb, how="inner").index_right.rename("wbID")

    # update wbID in dataset, but this doesn't mean it is snapped
    ix = in_wb.index
    df.loc[ix, "wbID"] = in_wb

    print(
        "Found {:,} dams in waterbodies in {:.2f}s".format(
            len(in_wb), time() - contained_start
        )
    )

    print("Finding nearest drain points...")
    snap_start = time()
    # join back to pygeos geoms and join to drains
    # NOTE: this may produce multiple drains for some waterbodies
    in_wb = (
        pd.DataFrame(in_wb)
        .join(to_snap[["geometry", "snap_tolerance"]])
        .join(
            drains.reset_index()
            .set_index("wbID")[["geometry", "drainID", "lineID"]]
            .rename(columns={"geometry": "drain"}),
            on="wbID",
        )
        .dropna(subset=["drain"])
    )
    in_wb["snap_dist"] = pg.distance(in_wb.geometry, in_wb.drain)

    # drop any that are > 500 m away, these aren't useful
    in_wb = in_wb.loc[in_wb.snap_dist <= 500].copy()

    # take the closest drain point
    in_wb.index.name = "index"
    in_wb = (
        in_wb.reset_index()
        .sort_values(by=["index", "snap_dist"])
        .groupby("index")
        .first()
    )

    # Any that are within the snap tolerance just snap to that drain
    close_enough = in_wb.loc[in_wb.snap_dist <= in_wb.snap_tolerance]
    ix = close_enough.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = close_enough.drain
    df.loc[ix, "snap_dist"] = close_enough.snap_dist
    df.loc[ix, "snap_ref_id"] = close_enough.drainID
    df.loc[ix, "lineID"] = close_enough.lineID
    df.loc[ix, "wbID"] = close_enough.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of drain point for waterbody that contains this dam",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within tolerance of the drain points for their waterbody in {:.2f}s".format(
            len(ix), time() - snap_start
        )
    )

    # Any that are > tolerance away from their own drain, but within tolerance of another drain
    # should snap to the other drain; these are in chains of multiple waterbodies.
    # Visually confirmed this by looking at several.
    snap_start = time()
    further = in_wb.loc[in_wb.snap_dist > in_wb.snap_tolerance].copy()
    nearest_drains = nearest(further.geometry, drains.geometry, further.snap_tolerance)

    maybe_near_neighbor = further.join(nearest_drains, rsuffix="_nearest")

    ix = maybe_near_neighbor.loc[
        maybe_near_neighbor.distance < maybe_near_neighbor.snap_dist
    ].index
    near_neighbor = (
        (
            maybe_near_neighbor.loc[ix]
            .drop(columns=["drain", "drainID", "wbID", "lineID", "snap_dist"])
            .rename(columns={"drainID_nearest": "drainID", "distance": "snap_dist"})
            .join(
                drains[["geometry", "lineID", "wbID"]].rename(
                    columns={"geometry": "drain"}
                ),
                on="drainID",
            )
        )
        .sort_values(by="snap_dist")
        .groupby(level=0)
        .first()
    )

    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_neighbor.drain
    df.loc[ix, "snap_dist"] = near_neighbor.snap_dist
    df.loc[ix, "snap_ref_id"] = near_neighbor.drainID
    df.loc[ix, "lineID"] = near_neighbor.lineID
    df.loc[ix, "wbID"] = near_neighbor.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of drain point for adjacent waterbody",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams close to drain points for an adjacent waterbody in {:.2f}s".format(
            len(ix), time() - snap_start
        )
    )

    # Any that remain in their waterbody and are < 250 m (WB_DRAIN_MAX_TOLERANCE) from its drain snap to that drain
    further = further.loc[
        ~further.index.isin(ix) & (further.snap_dist <= WB_DRAIN_MAX_TOLERANCE)
    ].copy()

    ix = further.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = further.drain
    df.loc[ix, "snap_dist"] = further.snap_dist
    df.loc[ix, "snap_ref_id"] = further.drainID
    df.loc[ix, "lineID"] = further.lineID
    df.loc[ix, "wbID"] = further.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "-",
        WB_DRAIN_MAX_TOLERANCE,
        "m tolerance of drain point of waterbody that contains this dam",
    )
    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within <{}m of the drain points for their waterbody".format(
            len(ix), WB_DRAIN_MAX_TOLERANCE
        )
    )

    ### Find the ones that are not in a waterbody but within tolerance of a drain
    # Visually inspected several that had multiple waterbodies nearby
    # in all cases, the nearest one was sufficient
    print("Finding nearest waterbody drains for unsnapped dams...")
    snap_start = time()
    nearest_drains = nearest(to_snap.geometry, drains.geometry, to_snap.snap_tolerance)

    nearest_drains = nearest_drains.join(to_snap.geometry).join(
        drains[["geometry", "wbID", "lineID"]].rename(columns={"geometry": "drain"}),
        on="drainID",
    )

    ix = nearest_drains.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = nearest_drains.drain
    df.loc[ix, "snap_dist"] = nearest_drains.distance
    df.loc[ix, "snap_ref_id"] = nearest_drains.drainID
    df.loc[ix, "lineID"] = nearest_drains.lineID
    df.loc[ix, "wbID"] = nearest_drains.wbID

    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of drain point of waterbody (dam not in waterbody)",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within {}m of waterbody drain points".format(
            len(ix), to_snap.snap_tolerance.max()
        )
    )

    # TODO: need to track which waterbodies were claimed by dams

    return df, to_snap
Example #24
def snap_to_nhd_dams(df, to_snap):
    """Attempt to snap points from to_snap to NHD dams.

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """

    snap_start = time()

    print("=================\nSnapping to NHD dams...")

    nhd_dams_poly = gp.read_feather(
        nhd_dir / "merged" / "nhd_dams_poly.feather",
        columns=["damID", "geometry"],
    ).set_index("damID")

    # NOTE: there may be multiple points per damID
    nhd_dams = gp.read_feather(
        nhd_dir / "merged" / "nhd_dams_pt.feather",
        columns=["damID", "wbID", "lineID", "loop", "sizeclass", "geometry"],
    ).set_index("damID")
    # restore null wbID values (stored as -1) to NaN
    nhd_dams.wbID = nhd_dams.wbID.replace(-1, np.nan)

    ### Find dams that are really close (within NHD_DAM_TOLERANCE, 50m) to NHD dam points
    near_nhd_pt = nearest(
        pd.Series(to_snap.geometry.values.data, index=to_snap.index),
        pd.Series(nhd_dams.geometry.values.data, index=nhd_dams.index),
        max_distance=NHD_DAM_TOLERANCE,
    )[["damID"]]
    near_nhd_pt = near_nhd_pt.join(to_snap.geometry.rename("source_pt")).join(
        nhd_dams, on="damID")
    near_nhd_pt = near_nhd_pt.reset_index().drop_duplicates(
        subset=["id", "damID", "lineID", "geometry"]).set_index("id")
    near_nhd_pt["snap_dist"] = pg.distance(near_nhd_pt.geometry.values.data,
                                           near_nhd_pt.source_pt.values.data)
    # keep the largest size class, non-loop, then closest match per dam
    near_nhd_pt = (near_nhd_pt.reset_index().sort_values(
        by=["id", "sizeclass", "loop", "snap_dist"],
        ascending=[True, False, True, True],
    ).groupby("id").first())

    ix = near_nhd_pt.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_nhd_pt.geometry
    df.loc[ix, "snap_dist"] = near_nhd_pt.snap_dist
    df.loc[ix, "snap_ref_id"] = near_nhd_pt.damID
    df.loc[ix, "lineID"] = near_nhd_pt.lineID
    df.loc[ix, "wbID"] = near_nhd_pt.wbID
    df.loc[
        ix,
        "snap_log"] = f"snapped: within {NHD_DAM_TOLERANCE}m tolerance of NHD dam point"
    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()
    print(
        f"Snapped {len(ix):,} dams within {NHD_DAM_TOLERANCE} to NHD dam points in {time() - snap_start:.2f}s"
    )

    ### Find dams that are really close (within NHD_DAM_TOLERANCE, 50m) to NHD dam polygons
    # Those that have multiple dams nearby are usually part of a dam complex
    near_nhd = near(
        pd.Series(to_snap.geometry.values.data, index=to_snap.index),
        pd.Series(nhd_dams_poly.geometry.values.data,
                  index=nhd_dams_poly.index),
        distance=NHD_DAM_TOLERANCE,
    )[["damID"]]

    # snap to nearest dam point for that dam (some are > 1 km away)
    # NOTE: this will create multiple entries for some dams; the closest is used
    near_nhd = near_nhd.join(to_snap.geometry.rename("source_pt")).join(
        nhd_dams, on="damID")
    near_nhd = near_nhd.reset_index().drop_duplicates(
        subset=["id", "damID", "lineID", "geometry"]).set_index("id")
    near_nhd["snap_dist"] = pg.distance(near_nhd.geometry.values.data,
                                        near_nhd.source_pt.values.data)
    # Sort to prioritize larger size classes and non-loops, then distance
    # this also drops duplicates
    near_nhd = (near_nhd.reset_index().sort_values(
        by=["id", "sizeclass", "loop", "snap_dist"],
        ascending=[True, False, True, True],
    ).groupby("id").first())

    ix = near_nhd.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_nhd.geometry
    df.loc[ix, "snap_dist"] = near_nhd.snap_dist
    df.loc[ix, "snap_ref_id"] = near_nhd.damID
    df.loc[ix, "lineID"] = near_nhd.lineID
    df.loc[ix, "wbID"] = near_nhd.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings("snapped: within ",
                                                    NHD_DAM_TOLERANCE,
                                                    "m of NHD dam polygon")
    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()
    print("Snapped {:,} dams to NHD dam polygons in {:.2f}s".format(
        len(ix),
        time() - snap_start))

    ### Find dams that are close (within snapping tolerance) of NHD dam points
    # most of these should have been picked up above, but this picks up ones that are
    # greater than NHD_DAM_TOLERANCE away due to bad locations
    snap_start = time()
    tmp = nhd_dams.reset_index()  # reset index so we have a unique index to join on
    near_nhd = nearest(
        pd.Series(to_snap.geometry.values.data, index=to_snap.index),
        pd.Series(tmp.geometry.values.data, index=tmp.index),
        max_distance=np.clip(to_snap.snap_tolerance.values, 0,
                             NHD_DAM_PT_TOLERANCE),
    ).rename(columns={"distance": "snap_dist"})

    near_nhd = near_nhd.join(to_snap.geometry.rename("source_pt")).join(
        tmp, on="index_right")
    near_nhd = (near_nhd.reset_index().sort_values(
        by=["id", "sizeclass", "loop", "snap_dist"],
        ascending=[True, False, True, True],
    ).groupby("id").first())

    ix = near_nhd.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_nhd.geometry
    df.loc[ix, "snap_dist"] = near_nhd.snap_dist
    df.loc[ix, "snap_ref_id"] = near_nhd.damID
    df.loc[ix, "lineID"] = near_nhd.lineID
    df.loc[ix, "wbID"] = near_nhd.wbID
    df.loc[
        ix,
        "snap_log"] = f"snapped: within {NHD_DAM_PT_TOLERANCE}m tolerance of NHD dam point but >{NHD_DAM_TOLERANCE}m from NHD dam polygon"
    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()
    print(
        f"Snapped {len(ix):,} dams to NHD dam points in {time() - snap_start:.2f}s"
    )

    return df, to_snap
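

# Minimal sketches (assumptions, not this project's real implementations) of
# the `nearest` and `near` helpers called above: `nearest` keeps only the
# closest right-hand geometry within max_distance of each left geometry,
# using the same STRtree.nearest_all() pattern seen elsewhere in this file,
# while `near` keeps every right-hand geometry within the given distance.
# Both expect pd.Series of pygeos geometries; unlike the real helpers, these
# handle only a scalar max_distance, not per-row tolerances.
import pandas as pd
import pygeos as pg


def nearest_sketch(left, right, max_distance):
    tree = pg.STRtree(right.values)
    l, r = tree.nearest_all(left.values, max_distance=max_distance)
    pairs = pd.DataFrame(
        {
            right.index.name or "index_right": right.index.take(r),
            "distance": pg.distance(left.values.take(l), right.values.take(r)),
        },
        index=left.index.take(l),
    )
    # nearest_all returns all ties; keep a single match per left geometry
    return pairs.sort_values(by="distance").groupby(level=0).first()


def near_sketch(left, right, distance):
    # buffering the left geometries and intersecting is equivalent to
    # "within distance" and avoids needing a dwithin tree predicate
    tree = pg.STRtree(right.values)
    l, r = tree.query_bulk(pg.buffer(left.values, distance), predicate="intersects")
    return pd.DataFrame(
        {
            right.index.name or "index_right": right.index.take(r),
            "distance": pg.distance(left.values.take(l), right.values.take(r)),
        },
        index=left.index.take(l),
    )
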
def snap_to_waterbodies(df, to_snap):
    """Attempt to snap points from to_snap to waterbody drain points.

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """

    ### Attempt to snap to waterbody drain points for major waterbodies
    # Use larger tolerance for larger waterbodies
    # NOTE: this specifically excludes known lowhead dams from snapping to waterbodies
    print("=================\nSnapping to waterbodies and drain points..")

    for huc2 in sorted(to_snap.HUC2.unique()):
        print(f"\n----- {huc2} ------")
        in_huc2 = to_snap.loc[(to_snap.HUC2 == huc2)
                              & (to_snap.LowheadDam != 1)].copy()

        wb = gp.read_feather(
            nhd_dir / "clean" / huc2 / "waterbodies.feather",
            columns=["wbID", "geometry"],
        ).set_index("wbID")
        drains = gp.read_feather(
            nhd_dir / "clean" / huc2 / "waterbody_drain_points.feather",
            columns=[
                "drainID", "wbID", "lineID", "loop", "sizeclass", "geometry"
            ],
        ).set_index("drainID")

        print(
            f"HUC {huc2} selected {len(in_huc2):,} barriers in region to snap against {len(wb):,} waterbodies"
        )

        ### First pass - find the dams that are contained by waterbodies
        contained_start = time()

        # Join to nearest waterbodies within 1m (basically inside)
        # and keep only the first match
        tree = pg.STRtree(wb.geometry.values.data)
        left, right = tree.nearest_all(in_huc2.geometry.values.data,
                                       max_distance=1)
        in_wb = (pd.DataFrame({
            "id": in_huc2.index.values.take(left),
            "wbID": wb.index.values.take(right),
        }).groupby("id").first())

        # remember every dam matched to a waterbody before in_wb is reduced
        # to within-tolerance matches below; the full set is used later to
        # exclude in-waterbody dams from the nearest-drain pass
        in_wb_index = in_wb.index

        # update wbID in dataset, but this doesn't mean it is snapped
        df.loc[in_wb.index, "wbID"] = in_wb.wbID

        print(
            f"Found {len(in_wb):,} dams in waterbodies in {time() - contained_start:.2f}s"
        )

        print("Finding nearest drain points...")
        snap_start = time()

        # join back to pygeos geoms and join to drains
        # NOTE: this may bring in multiple drains for some waterbodies, we take the
        # closest drain below
        in_wb = (in_wb.join(to_snap[["geometry", "snap_tolerance"]]).join(
            drains.reset_index().set_index("wbID")[[
                "drainID", "lineID", "loop", "sizeclass", "geometry"
            ]].rename(columns={"geometry": "drain"}),
            on="wbID",
        ).dropna(subset=["drain"]))
        in_wb["snap_dist"] = pg.distance(in_wb.geometry.values.data,
                                         in_wb.drain.values.data)

        # sort drains by largest size class, non-loop, then ascending distance
        in_wb = (in_wb.loc[
            in_wb.snap_dist <= in_wb.snap_tolerance].reset_index().sort_values(
                by=["sizeclass", "loop", "snap_dist"],
                ascending=[False, True, True],
            ).groupby("id").first())

        # Any that are within the snap tolerance just snap to that drain
        ix = in_wb.index
        df.loc[ix, "snapped"] = True
        df.loc[ix, "geometry"] = in_wb.drain
        df.loc[ix, "snap_dist"] = in_wb.snap_dist
        df.loc[ix, "snap_ref_id"] = in_wb.drainID
        df.loc[ix, "lineID"] = in_wb.lineID
        df.loc[ix, "wbID"] = in_wb.wbID
        df.loc[ix, "snap_log"] = ndarray_append_strings(
            "snapped: within ",
            to_snap.loc[ix].snap_tolerance,
            "m tolerance of drain point for waterbody that contains this dam",
        )

        to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

        print(
            f"Found {len(ix):,} dams within tolerance of the drain points for their waterbody in {time() - snap_start:.2f}s"
        )

        ### Find the ones that are not in a waterbody but within tolerance of a drain
        # Visually inspected several that had multiple waterbodies nearby;
        # in all cases, the nearest waterbody was sufficient
        print("Finding nearest waterbody drains for unsnapped dams...")
        snap_start = time()

        # only snap those that are not in waterbodies
        not_in_wb = in_huc2.loc[~in_huc2.index.isin(in_wb_index.unique())].copy()

        nearest_drains = nearest(
            pd.Series(not_in_wb.geometry.values.data, index=not_in_wb.index),
            pd.Series(drains.geometry.values.data, index=drains.index),
            max_distance=np.clip(not_in_wb.snap_tolerance.values, 0,
                                 WB_DRAIN_MAX_TOLERANCE),
        )

        # join in all drains for waterbody of nearest drain point
        nearest_drains = (nearest_drains.drop(columns=["distance"]).join(
            not_in_wb[["geometry", "snap_tolerance"]]).join(
                drains.wbID,
                on="drainID",
            ).drop(columns=["drainID"]).join(
                drains.reset_index().set_index("wbID")[[
                    "geometry", "drainID", "lineID", "loop", "sizeclass"
                ]].rename(columns={"geometry": "drain"}),
                on="wbID",
            ))

        nearest_drains["snap_dist"] = pg.distance(
            nearest_drains.geometry.values.data,
            nearest_drains.drain.values.data)
        # keep the largest size class, non-loop, then nearest drain point within tolerance
        nearest_drains = (
            nearest_drains.loc[nearest_drains.snap_dist <
                               nearest_drains.snap_tolerance].sort_values(
                                   by=["sizeclass", "loop", "snap_dist"],
                                   ascending=[False, True, True],
                               ).groupby(level=0).first())

        ix = nearest_drains.index
        df.loc[ix, "snapped"] = True
        df.loc[ix, "geometry"] = nearest_drains.drain
        df.loc[ix, "snap_dist"] = nearest_drains.snap_dist
        df.loc[ix, "snap_ref_id"] = nearest_drains.drainID
        df.loc[ix, "lineID"] = nearest_drains.lineID
        df.loc[ix, "wbID"] = nearest_drains.wbID

        df.loc[
            ix,
            "snap_log"] = f"snapped: within {WB_DRAIN_MAX_TOLERANCE}m of drain point of waterbody (dam not in waterbody)"

        to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

        print(
            f"Found {len(ix):,} dams within {WB_DRAIN_MAX_TOLERANCE}m of waterbody drain points"
        )

    return df, to_snap
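

# A toy illustration (synthetic data, not from the source project) of the
# prioritized de-duplication pattern used repeatedly above: when a dam
# matches several candidate drain points, sort so the largest size class
# and non-loop candidates come first, then the closest, and keep one row
# per dam with groupby().first().
import pandas as pd

candidates = pd.DataFrame({
    "id": [1, 1, 1, 2],
    "sizeclass": ["A", "B", "B", "A"],
    "loop": [False, True, False, False],
    "snap_dist": [2.0, 1.0, 5.0, 3.0],
})
best = (candidates.sort_values(
    by=["id", "sizeclass", "loop", "snap_dist"],
    ascending=[True, False, True, True],
).groupby("id").first())
# dam 1 keeps the non-loop sizeclass "B" drain at 5.0m, preferring size
# class and loop status over the closer candidates; dam 2 keeps its only one
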
def snap_estimated_dams_to_drains(df, to_snap):
    """Snap estimated dams to waterbody drain points.

    Dams that were estimated from waterbodies are snapped to the nearest drain
    points (should be very small snap_dist).

    Other estimated dams often occur inside / immediately adjacent to waterbodies
    and are snapped to the nearest drain point of those waterbodies if < 2km.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """
    snap_start = time()

    # estimated dams that were not manually reviewed (moved or verified at
    # the correct location)
    ix = to_snap.snap_group.isin([1, 3]) & (~to_snap.ManualReview.isin([4, 13]))
    estimated = to_snap.loc[ix].copy()
    print(f"=================\nSnapping {len(estimated):,} estimated dams...")

    for huc2 in sorted(estimated.HUC2.unique()):
        wb = gp.read_feather(
            nhd_dir / "clean" / huc2 / "waterbodies.feather",
            columns=["wbID", "geometry"],
        ).set_index("wbID")

        drains = gp.read_feather(
            nhd_dir / "clean" / huc2 / "waterbody_drain_points.feather",
            columns=["drainID", "wbID", "lineID", "geometry"],
        ).set_index("drainID")

        in_huc2 = estimated.loc[estimated.HUC2 == huc2].copy()

        # most estimated dams were originally derived from waterbody drain points,
        # so process those first
        tmp = in_huc2.loc[in_huc2.snap_group == 3]
        if len(tmp):
            # assumes all snap_group 3 dams share a single snap tolerance
            max_drain_dist = tmp.snap_tolerance.unique()[0]
            tree = pg.STRtree(drains.geometry.values.data)
            left, right = tree.nearest_all(tmp.geometry.values.data,
                                           max_distance=max_drain_dist)
            drain_joins = (pd.DataFrame({
                "id": tmp.index.values.take(left),
                "geometry": tmp.geometry.values.take(left),
                "drainID": drains.index.values.take(right),
                "drain": drains.geometry.values.take(right),
                "wbID": drains.wbID.values.take(right),
                "lineID": drains.lineID.values.take(right),
            }).groupby("id").first())

            drain_joins["snap_dist"] = pg.distance(
                drain_joins.geometry.values.data,
                drain_joins.drain.values.data)

            ix = drain_joins.index
            df.loc[ix, "snapped"] = True
            df.loc[ix, "geometry"] = drain_joins.drain
            df.loc[ix, "snap_dist"] = drain_joins.snap_dist
            df.loc[ix, "snap_ref_id"] = drain_joins.drainID
            df.loc[ix, "lineID"] = drain_joins.lineID
            df.loc[ix, "wbID"] = drain_joins.wbID
            df.loc[
                ix,
                "snap_log"] = "snapped: estimated dam derived from waterbody snapped to nearest drain point"

            to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

            print(
                f"HUC {huc2}: snapped {len(drain_joins):,} of {len(tmp):,} dams estimated from waterbodies in region to waterbody drain points"
            )

        in_huc2 = in_huc2.loc[in_huc2.snap_group == 1]

        # Some estimated dams are just barely outside their waterbodies
        # so we take the nearest waterbody for each, within a tolerance of 1m
        tree = pg.STRtree(wb.geometry.values.data)
        left, right = tree.nearest_all(in_huc2.geometry.values.data,
                                       max_distance=1)
        # take the first in case of duplicates
        in_wb = (pd.DataFrame({
            "id": in_huc2.index.values.take(left),
            "wbID": wb.index.values.take(right),
        }).groupby("id").first().join(in_huc2.geometry).join(
            drains[["wbID", "lineID",
                    "geometry"]].reset_index().set_index("wbID").rename(
                        columns={"geometry": "drain"}),
            on="wbID",
        ))

        in_wb["snap_dist"] = pg.distance(in_wb.geometry.values.data,
                                         in_wb.drain.values.data)
        grouped = in_wb.sort_values(by="snap_dist").groupby(level=0)
        in_wb = grouped.first()

        # drop dams whose waterbody has more than 2 drains; those are dubious fits
        s = grouped.size()
        ix = s[s > 2].index
        in_wb = in_wb.loc[~in_wb.index.isin(ix)].copy()

        # dams more than 2,000m from their drain are likely incorrect; those
        # near that distance are generally OK, so keep everything within 2,000m
        in_wb = in_wb.loc[in_wb.snap_dist <= 2000].copy()

        ix = in_wb.index
        df.loc[ix, "snapped"] = True
        df.loc[ix, "geometry"] = in_wb.drain
        df.loc[ix, "snap_dist"] = in_wb.snap_dist
        df.loc[ix, "snap_ref_id"] = in_wb.drainID
        df.loc[ix, "lineID"] = in_wb.lineID
        df.loc[ix, "wbID"] = in_wb.wbID
        df.loc[
            ix,
            "snap_log"] = "snapped: estimated dam in waterbody snapped to nearest drain point"

        to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

        print(
            f"HUC {huc2}: snapped {len(in_wb):,} of {len(in_huc2):,} estimated dams in region to waterbody drain points"
        )

    print(
        f"Snapped {len(df.loc[df.snap_log.str.startswith('snapped: estimated dam')]):,} estimated dams to waterbody drain points in {time() - snap_start:.2f}s"
    )

    return df, to_snap
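

# A hypothetical driver (the call order here is an assumption; the real
# pipeline lives elsewhere in this project) showing how the snapping passes
# compose: each pass records results on df and returns the still-unsnapped
# subset, so to_snap shrinks after every call.
def run_snapping_sketch(df, to_snap):
    df, to_snap = snap_estimated_dams_to_drains(df, to_snap)
    df, to_snap = snap_to_nhd_dams(df, to_snap)
    df, to_snap = snap_to_waterbodies(df, to_snap)
    print(f"{len(to_snap):,} dams remain unsnapped after these passes")
    return df, to_snap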