Example #1
def calculate_sinuosity(geometries):
    """Calculate sinuosity of the line.

    This is the length of the line divided by the distance between the endpoints of the line.
    By definition, it is always >=1.

    Parameters
    ----------
    geometries : Series or ndarray of pygeos geometries

    Returns
    -------
    Series or ndarray
        sinuosity values
    """

    # By definition, sinuosity should not be less than 1
    first = pg.get_point(geometries, 0)
    last = pg.get_point(geometries, -1)
    straight_line_distance = pg.distance(first, last)

    sinuosity = np.ones(len(geometries), dtype="float32")

    # if there is no straight line distance there can be no sinuosity
    ix = straight_line_distance > 0

    # by definition, all values must be at least 1, so clip lower bound
    sinuosity[ix] = (pg.length(geometries[ix]) /
                     straight_line_distance[ix]).clip(1)

    if isinstance(geometries, pd.Series):
        return pd.Series(sinuosity, index=geometries.index)

    return sinuosity
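A quick usage sketch with hypothetical geometries (assumes numpy, pandas, and pygeos are imported as np, pd, and pg, as in the snippet above):

import numpy as np
import pandas as pd
import pygeos as pg

# a straight line has sinuosity 1.0; the detour through (5, 5) gives ~1.414
lines = np.array([
    pg.linestrings([(0, 0), (10, 0)]),
    pg.linestrings([(0, 0), (5, 5), (10, 0)]),
])
print(calculate_sinuosity(lines))             # [1.0, ~1.414]
print(calculate_sinuosity(pd.Series(lines)))  # same values, returned as a Series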
def cut_line_at_points(line, cut_points, tolerance=1e-6):
    """Cut a pygeos line geometry at points.
    If there are no interior points, the original line will be returned.

    Parameters
    ----------
    line : pygeos Linestring
    cut_points : list-like of pygeos Points
        will be projected onto the line; those interior to the line will be
        used to cut the line into new segments.
    tolerance : float, optional (default: 1e-6)
        minimum distance from endpoints to consider the points interior
        to the line.

    Returns
    -------
    MultiLineString (or LineString, if unchanged)
    """
    if pg.get_type_id(line) != 1:
        raise ValueError("line is not a single linestring")

    vertices = pg.get_point(line, range(pg.get_num_points(line)))
    offsets = pg.line_locate_point(line, vertices)
    cut_offsets = pg.line_locate_point(line, cut_points)
    # only keep those that are interior to the line and ignore those very close
    # to endpoints or beyond endpoints
    cut_offsets = cut_offsets[(cut_offsets > tolerance)
                              & (cut_offsets < offsets[-1] - tolerance)]

    if len(cut_offsets) == 0:
        # nothing to cut, return original
        return line

    # get coordinates of new vertices from the cut points (interpolated onto the line)
    cut_offsets.sort()

    # add in the last coordinate of the line
    cut_offsets = np.append(cut_offsets, offsets[-1])

    # TODO: convert this to a pygeos ufunc
    coords = pg.get_coordinates(line)
    cut_coords = pg.get_coordinates(
        pg.line_interpolate_point(line, cut_offsets))
    lines = []
    orig_ix = 0
    for cut_ix in range(len(cut_offsets)):
        offset = cut_offsets[cut_ix]

        segment = []
        if cut_ix > 0:
            segment = [cut_coords[cut_ix - 1]]
        while offsets[orig_ix] < offset:
            segment.append(coords[orig_ix])
            orig_ix += 1

        segment.append(cut_coords[cut_ix])
        lines.append(pg.linestrings(segment))

    return pg.multilinestrings(lines)
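A minimal usage sketch (hypothetical line and cut points; the points are projected onto the line before cutting):

line = pg.linestrings([(0, 0), (10, 0)])
cut = cut_line_at_points(line, [pg.points(4, 1), pg.points(7, -2)])
# both cut points are interior, so the result is a MultiLineString:
# MULTILINESTRING ((0 0, 4 0), (4 0, 7 0), (7 0, 10 0))
print(cut)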
    # NOTE: downstreams is indexed on damID, not dams.index
    downstreams = (
        lines_by_dam.apply(find_downstreams)
        .reset_index()
        .explode("lineID")
        .drop_duplicates()
        .set_index("damID")
        .lineID
    )

    # Now can just reduce dams back to these lineIDs
    dams = (
        dams[["damID", "geometry"]]
        .join(downstreams, on="damID", how="inner")
        .drop_duplicates(subset=["damID", "lineID"])
        .join(flowlines.geometry.rename("flowline"), on="lineID")
        .reset_index(drop=True)
    )
    print(f"Found {len(dams):,} joins between NHD dams and flowlines")

    ### Extract representative point
    # Look at either end of overlapping line and use that as representative point.
    # Otherwise intersect and extract first coordinate of overlapping line
    last_pt = pg.get_point(dams.flowline.values.data, -1)
    ix = pg.intersects(dams.geometry.values.data, last_pt)
    dams.loc[ix, "pt"] = last_pt[ix]

    # override with upstream most point when both intersect
    first_pt = pg.get_point(dams.flowline.values.data, 0)
    ix = pg.intersects(dams.geometry.values.data, first_pt)
    dams.loc[ix, "pt"] = first_pt[ix]

    ix = dams.pt.isnull()
    # WARNING: this might fail for odd intersection geoms; we always take the first line
    # below
    pt = pd.Series(
        pg.get_point(
            pg.get_geometry(
                pg.intersection(
                    dams.loc[ix].geometry.values.data,
                    dams.loc[ix].flowline.values.data,
                ),
                0,
            ),
            0,
        ),
        index=dams.loc[ix].index,
    ).dropna()
    dams.loc[pt.index, "pt"] = pt
def find_dam_face_from_waterbody(waterbody, drain_pt):
    total_area = pg.area(waterbody)
    ring = pg.get_exterior_ring(pg.normalize(waterbody))
    total_length = pg.length(ring)
    num_pts = pg.get_num_points(ring) - 1  # drop closing coordinate
    vertices = pg.get_point(ring, range(num_pts))

    ### Extract line segments that span no more than 1/3 of the polygon's coordinates
    # starting from the vertex nearest the drain
    # note: lower numbers are to the right
    tree = pg.STRtree(vertices)
    ix = tree.nearest(drain_pt)[1][0]
    side_width = min(num_pts // 3, MAX_SIDE_PTS)
    left_ix = ix + side_width
    right_ix = ix - side_width

    # extract these as a left-to-right line;
    pts = vertices[max(right_ix, 0):min(num_pts, left_ix)][::-1]
    if left_ix >= num_pts:
        pts = np.append(vertices[0:left_ix - num_pts][::-1], pts)

    if right_ix < 0:
        pts = np.append(pts, vertices[num_pts + right_ix:num_pts][::-1])

    coords = pg.get_coordinates(pts)

    if len(coords) > 2:
        # first run a simplification process to extract the major shape and bends
        # then run the straight line algorithm
        simp_coords, simp_ix = simplify_vw(
            coords, min(MAX_SIMPLIFY_AREA, total_area / 100))

        if len(simp_coords) > 2:
            keep_coords, ix = extract_straight_segments(
                simp_coords, max_angle=MAX_STRAIGHT_ANGLE, loops=5)
            keep_ix = simp_ix.take(ix)

        else:
            keep_coords = simp_coords
            keep_ix = simp_ix

    else:
        keep_coords = coords
        keep_ix = np.arange(len(coords))

    ### Calculate the length of each run and drop any that are not sufficiently long
    lengths = segment_length(keep_coords)
    ix = (lengths >= MIN_DAM_WIDTH) & (lengths / total_length <
                                       MAX_WIDTH_RATIO)

    pairs = np.dstack([keep_ix[:-1][ix], keep_ix[1:][ix]])[0]

    # since ranges are ragged, we have to do this in a loop instead of vectorized
    segments = []
    for start, end in pairs:
        segments.append(pg.linestrings(coords[start:end + 1]))

    segments = np.array(segments)

    # only keep the segments that are close to the drain
    segments = segments[pg.intersects(segments,
                                      pg.buffer(drain_pt, MAX_DRAIN_DIST))]

    if not len(segments):
        return segments

    # only keep those where the drain is interior to the line
    pos = pg.line_locate_point(segments, drain_pt)
    lengths = pg.length(segments)

    ix = (pos >= MIN_INTERIOR_DIST) & (pos <= (lengths - MIN_INTERIOR_DIST))

    return segments[ix]
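The side-extraction setup above reduces to ring vertices plus an STRtree nearest lookup; a self-contained sketch of just that step, using a hypothetical square waterbody (assumes pygeos >= 0.10 for STRtree.nearest):

import pygeos as pg

waterbody = pg.polygons([(0, 0), (10, 0), (10, 10), (0, 10)])
drain_pt = pg.points(10.2, 1)

ring = pg.get_exterior_ring(pg.normalize(waterbody))
num_pts = pg.get_num_points(ring) - 1  # drop the closing coordinate
vertices = pg.get_point(ring, range(num_pts))

tree = pg.STRtree(vertices)
ix = tree.nearest(drain_pt)[1][0]  # index of the ring vertex nearest the drain
print(ix, vertices[ix])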
Example #5
def test_get_point_non_linestring(geom):
    actual = pygeos.get_point(geom, [0, 2, -1])
    assert pygeos.is_missing(actual).all()
Example #6
def test_get_point(geom):
    n = pygeos.get_num_points(geom)
    actual = pygeos.get_point(geom, [0, -n, n, -(n + 1)])
    assert pygeos.equals(actual[0], actual[1]).all()
    assert pygeos.is_missing(actual[2:4]).all()
Example #7
def test_get_point():
    actual = pygeos.get_point(line_string, 1)
    assert pygeos.equals(actual, pygeos.points(1, 0))
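The tests above pin down the index semantics of pygeos.get_point: negative indices count from the end, and out-of-range indices or non-linestring inputs yield missing geometries. A quick interactive check (assumes pygeos is importable):

import pygeos

line = pygeos.linestrings([(0, 0), (1, 0), (2, 0)])
print(pygeos.get_point(line, 1))    # POINT (1 0)
print(pygeos.get_point(line, -1))   # POINT (2 0): negative indices count from the end
print(pygeos.get_point(line, 3))    # None: index out of range
print(pygeos.get_point(pygeos.points(0, 0), 0))  # None: not a linestring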
def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):
    """
    Extracts flowlines data from NHDPlusHR data product.
    Extract flowlines from NHDPlusHR data product, joins to VAA table,
    and filters out coastlines.
    Extracts joins between flowlines, and filters out coastlines.

    Parameters
    ----------
    gdb_path : pathlib.Path
        path to the NHD HUC4 Geodatabase
    target_crs: GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.
    extra_flowline_cols : list, optional (default: [])
        List of extra field names to extract from the NHDFlowline layer

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (flowlines, joins)
    """

    ### Read in flowline data and convert to data frame
    print("Reading flowlines")
    flowline_cols = FLOWLINE_COLS + extra_flowline_cols
    df = read_dataframe(
        gdb_path, layer="NHDFlowline", force_2d=True, columns=flowline_cols
    )

    # Index on NHDPlusID for easy joins to other NHD data
    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df = df.set_index(["NHDPlusID"], drop=False)

    # convert MultiLineStrings to LineStrings (all have a single linestring)
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)

    print("making valid and projecting to target projection")
    df.geometry = make_valid(df.geometry.values.data)
    df = df.to_crs(target_crs)
    print(f"Read {len(df):,} flowlines")

    ### Read in VAA and convert to data frame
    # NOTE: not all records in Flowlines have corresponding records in VAA
    # we drop those that do not since we need these fields.
    print("Reading VAA table and joining...")
    vaa_df = read_dataframe(gdb_path, layer="NHDPlusFlowlineVAA", columns=VAA_COLS)

    vaa_df.NHDPlusID = vaa_df.NHDPlusID.astype("uint64")
    vaa_df = vaa_df.set_index(["NHDPlusID"])
    df = df.join(vaa_df, how="inner")
    print(f"{len(df):,} features after join to VAA")

    # Simplify data types for smaller files and faster IO
    df.FType = df.FType.astype("uint16")
    df.FCode = df.FCode.astype("uint16")
    df.StreamOrde = df.StreamOrde.astype("uint8")
    df.Slope = df.Slope.astype("float32")
    df.MinElevSmo = df.MinElevSmo.astype("float32")
    df.MaxElevSmo = df.MaxElevSmo.astype("float32")

    ### Read in flowline joins
    print("Reading flowline joins")
    join_df = read_dataframe(
        gdb_path,
        layer="NHDPlusFlow",
        read_geometry=False,
        columns=["FromNHDPID", "ToNHDPID"],
    ).rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
    join_df.upstream = join_df.upstream.astype("uint64")
    join_df.downstream = join_df.downstream.astype("uint64")

    ### Fix errors in NHD
    # some valid joins are marked as terminals (downstream==0) in NHD; we need
    # to backfill the missing join info.
    # To do this, we intersect all terminals back with flowlines dropping any
    # that are themselves terminals.  Then we calculate the distance to the upstream
    # point of the intersected line, and the upstream point of the next segment
    # downstream.  We use the ID of whichever one is closer (must be within 100m).
    ix = join_df.loc[join_df.downstream == 0].upstream.unique()
    # get the last point, which is furthest downstream
    tmp = df.loc[df.index.isin(ix), ["geometry"]].copy()
    tmp["geometry"] = pg.get_point(tmp.geometry.values.data, -1)

    target = df.loc[~df.index.isin(ix)]

    # only search against other flowlines
    tree = pg.STRtree(target.geometry.values.data)
    # search within a tolerance of 0.001, these are very very close
    left, right = tree.nearest_all(tmp.geometry.values.data, max_distance=0.001)

    pairs = pd.DataFrame(
        {
            "left": tmp.index.take(left),
            "right": target.index.take(right),
            "source": tmp.geometry.values.data.take(left),
            # take upstream / downstream points of matched lines
            "upstream_target": pg.get_point(df.geometry.values.data.take(right), 0),
        }
    )

    # drop any pairs where the other side is also a terminal (these appear as
    # V shaped tiny networks that need to be left as is)
    pairs = pairs.loc[~pairs.right.isin(ix)]

    # calculate the next segment downstream (only keep the first if multiple; possible logic issue)
    next_downstream = (
        join_df.loc[(join_df.upstream != 0) & (join_df.downstream != 0)]
        .groupby("upstream")
        .downstream.first()
    )
    pairs["next_downstream"] = pairs.right.map(next_downstream)
    pairs.loc[pairs.next_downstream.notnull(), "downstream_target"] = pg.get_point(
        df.loc[
            pairs.loc[pairs.next_downstream.notnull()].next_downstream
        ].geometry.values.data,
        0,
    )

    pairs["upstream_dist"] = pg.distance(pairs.source, pairs.upstream_target)
    ix = pairs.next_downstream.notnull()
    pairs.loc[ix, "downstream_dist"] = pg.distance(
        pairs.loc[ix].source, pairs.loc[ix].downstream_target
    )

    # this ignores any nan
    pairs["dist"] = pairs[["upstream_dist", "downstream_dist"]].min(axis=1)
    # discard any that are too far (>100m)
    pairs = pairs.loc[pairs.dist <= 100].copy()

    # sort by distance to upstream point of matched flowline; this allows us
    # to sort on those then dedup to calculate a new downstream ID for this source line
    pairs = pairs.sort_values(by=["left", "dist"])

    # set the right value to the next downstream if it is closer
    # this also ignores na
    ix = pairs.downstream_dist < pairs.upstream_dist
    pairs.loc[ix, "right"] = pairs.loc[ix].next_downstream.astype("uint64")

    ids = pairs.groupby("left").right.first()

    if len(ids):
        # save to send to NHD
        pd.DataFrame({"NHDPlusID": ids.index.unique()}).to_csv(
            f"/tmp/{gdb_path.stem}_bad_joins.csv", index=False
        )

        ix = join_df.upstream.isin(ids.index)
        join_df.loc[ix, "downstream"] = join_df.loc[ix].upstream.map(ids)

        print(
            f"Repaired {len(ids):,} joins marked by NHD as terminals but actually joined to flowlines"
        )

    # set join types to make it easier to track
    join_df["type"] = "internal"  # set default
    # upstream-most origin points
    join_df.loc[join_df.upstream == 0, "type"] = "origin"
    # downstream-most termination points
    join_df.loc[join_df.downstream == 0, "type"] = "terminal"

    ### Filter out coastlines and update joins
    # WARNING: we tried filtering out pipelines (FType == 428).  It doesn't work properly;
    # there are many that go through dams and are thus needed to calculate
    # network connectivity and gain of removing a dam.
    print("Filtering out coastlines...")
    coastline_idx = df.loc[df.FType == 566].index
    df = df.loc[~df.index.isin(coastline_idx)].copy()
    print(f"{len(df):,} features after removing coastlines")

    # remove any joins that have coastlines as upstream
    # these are themselves coastline segments
    join_df = join_df.loc[~join_df.upstream.isin(coastline_idx)].copy()

    # set the downstream to 0 for any that join coastlines
    # this will enable us to mark these as downstream terminals in
    # the network analysis later
    join_df["marine"] = join_df.downstream.isin(coastline_idx)
    join_df.loc[join_df.marine, "downstream"] = 0
    join_df.loc[join_df.marine, "type"] = "terminal"

    # drop any duplicates (above operation sets some joins to upstream and downstream of 0)
    join_df = join_df.drop_duplicates(subset=["upstream", "downstream"])

    ### Filter out underground connectors
    ix = df.loc[df.FType == 420].index
    print("Removing {:,} underground conduits".format(len(ix)))
    df = df.loc[~df.index.isin(ix)].copy()
    join_df = remove_joins(
        join_df, ix, downstream_col="downstream", upstream_col="upstream"
    )

    ### Label loops for easier removal later
    # WARNING: loops may be very problematic from a network processing standpoint.
    # Include with caution.
    print("Identifying loops")
    df["loop"] = (df.StreamOrde != df.StreamCalc) | (df.FlowDir.isnull())

    idx = df.loc[df.loop].index
    join_df["loop"] = join_df.upstream.isin(idx) | join_df.downstream.isin(idx)

    ### Add calculated fields
    # Set our internal master IDs to the original index of the file we start from
    # Assume that we can always fit into a uint32, which allows ~4 billion records
    # and is probably bigger than anything we could ever read in
    df["lineID"] = df.index.values.astype("uint32") + 1
    join_df = (
        join_df.join(df.lineID.rename("upstream_id"), on="upstream")
        .join(df.lineID.rename("downstream_id"), on="downstream")
        .fillna(0)
    )

    for col in ("upstream", "downstream"):
        join_df[col] = join_df[col].astype("uint64")

    for col in ("upstream_id", "downstream_id"):
        join_df[col] = join_df[col].astype("uint32")

    ### Calculate size classes
    print("Calculating size class")
    drainage = df.TotDASqKm
    df.loc[drainage < 10, "sizeclass"] = "1a"
    df.loc[(drainage >= 10) & (drainage < 100), "sizeclass"] = "1b"
    df.loc[(drainage >= 100) & (drainage < 518), "sizeclass"] = "2"
    df.loc[(drainage >= 518) & (drainage < 2590), "sizeclass"] = "3a"
    df.loc[(drainage >= 2590) & (drainage < 10000), "sizeclass"] = "3b"
    df.loc[(drainage >= 10000) & (drainage < 25000), "sizeclass"] = "4"
    df.loc[drainage >= 25000, "sizeclass"] = "5"

    # Calculate length and sinuosity
    print("Calculating length and sinuosity")
    df["length"] = df.geometry.length.astype("float32")
    df["sinuosity"] = calculate_sinuosity(df.geometry.values.data).astype("float32")

    # drop columns not useful for later processing steps
    df = df.drop(columns=["FlowDir", "StreamCalc"])

    # calculate incoming joins (have valid upstream, but not in this HUC4)
    join_df.loc[(join_df.upstream != 0) & (join_df.upstream_id == 0), "type"] = "huc_in"

    return df, join_df
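A hedged usage sketch (the geodatabase path is hypothetical, and the FLOWLINE_COLS / VAA_COLS constants must be defined at module level as in the snippets here):

from pathlib import Path

# EPSG:5070 (CONUS Albers) is planar, as required for length calculations
flowlines, joins = extract_flowlines(
    Path("data/nhd/NHDPLUS_H_0204_HU4_GDB.gdb"), target_crs="EPSG:5070"
)
print(flowlines.sizeclass.value_counts())
print(joins["type"].value_counts())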
def remove_marine_flowlines(flowlines, joins, marine):
    """Remove flowlines that originate within or are mostly within marine areas
    for coastal HUC2s.  Marks any that have endpoints in marine areas or are
    upstream of those removed here as terminating in marine.

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
    marine : GeoDataFrame

    Returns
    -------
    (GeoDataFrame, DataFrame)
        flowlines, joins
    """

    # Remove those that start in marine areas
    points = pg.get_point(flowlines.geometry.values.data, 0)
    tree = pg.STRtree(points)
    left, right = tree.query_bulk(marine.geometry.values.data,
                                  predicate="intersects")
    ix = flowlines.index.take(np.unique(right))

    print(f"Removing {len(ix):,} flowlines that originate in marine areas")
    # mark any that terminated in those as marine
    joins.loc[joins.downstream_id.isin(ix), "marine"] = True
    flowlines = flowlines.loc[~flowlines.index.isin(ix)].copy()
    joins = remove_joins(joins,
                         ix,
                         downstream_col="downstream_id",
                         upstream_col="upstream_id")

    # Mark those that end in marine areas as marine
    endpoints = pg.get_point(flowlines.geometry.values.data, -1)
    tree = pg.STRtree(endpoints)
    left, right = tree.query_bulk(marine.geometry.values.data,
                                  predicate="intersects")
    ix = flowlines.index.take(np.unique(right))
    joins.loc[joins.upstream_id.isin(ix), "marine"] = True

    # For any that end in marine but didn't originate there, check the amount of overlap;
    # any that are >= 90% in marine should get cut
    print("Calculating overlap of remaining lines with marine areas")
    tmp = pd.DataFrame({
        "lineID": flowlines.iloc[right].index,
        "geometry": flowlines.iloc[right].geometry.values.data,
        "marine": marine.iloc[left].geometry.values.data,
    })
    tmp["overlap"] = pg.intersection(tmp.geometry, tmp.marine)
    tmp["pct_overlap"] = 100 * pg.length(tmp.overlap) / pg.length(tmp.geometry)

    ix = tmp.loc[tmp.pct_overlap >= 90].lineID.unique()

    print(f"Removing {len(ix):,} flowlines that mostly overlap marine areas")
    # mark any that terminated in those as marine
    joins.loc[joins.downstream_id.isin(ix), "marine"] = True
    flowlines = flowlines.loc[~flowlines.index.isin(ix)].copy()
    joins = remove_joins(joins,
                         ix,
                         downstream_col="downstream_id",
                         upstream_col="upstream_id")

    return flowlines, joins
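The endpoint-in-marine test above is a bulk STRtree query against line endpoints; a self-contained sketch of that pattern with hypothetical geometries (assumes a pygeos version with STRtree.query_bulk):

import numpy as np
import pygeos as pg

marine = np.array([pg.polygons([(0, 0), (4, 0), (4, 4), (0, 4)])])
lines = np.array([
    pg.linestrings([(1, 1), (9, 1)]),  # starts inside the polygon
    pg.linestrings([(8, 8), (9, 9)]),  # starts outside
])
points = pg.get_point(lines, 0)  # upstream endpoints
tree = pg.STRtree(points)
left, right = tree.query_bulk(marine, predicate="intersects")
print(right)  # [0]: only the first line originates in a marine area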
def create_drain_points(flowlines, joins, waterbodies, wb_joins):
    """Create drain points from furthest downstream point of flowlines that overlap with waterbodies.

    WARNING: If multiple flowlines intersect at the drain point, there will be multiple drain points at the same location

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
        flowline joins
    waterbodies : GeoDataFrame
    wb_joins : DataFrame
        waterbody / flowline joins

    Returns
    -------
    GeoDataFrame
        Drain points dataframe
    """
    start = time()

    wb_atts = waterbodies[["altered", "km2", "flowlineLength"]].copy()

    tmp_flowlines = flowlines[[
        "geometry",
        "FCode",
        "FType",
        "MaxElevSmo",
        "MinElevSmo",
        "Slope",
        "TotDASqKm",
        "StreamOrde",
        "sizeclass",
        "HUC4",
        "loop",
    ]].rename(columns={
        "FCode": "lineFCode",
        "FType": "lineFType"
    })

    ### Find the downstream most point(s) on the flowline for each waterbody
    # This is used for snapping barriers, if possible.
    # Drop any where there is no flowline below the drain point (often pipelines
    # that were removed)
    tmp = wb_joins[["lineID", "wbID"]].set_index("lineID")
    drains = (joins.loc[joins.upstream_id.isin(wb_joins.lineID.unique())
                        & (joins.downstream_id != 0)].join(
                            tmp.wbID.rename("upstream_wbID"),
                            on="upstream_id").join(
                                tmp.wbID.rename("downstream_wbID"),
                                on="downstream_id"))

    # Only keep those that terminate outside the same waterbody as the upstream end
    drains = drains.loc[drains.upstream_wbID != drains.downstream_wbID].copy()

    # Join in stats from waterbodies and geometries from flowlines
    drain_pts = (wb_joins.loc[wb_joins.lineID.isin(
        drains.upstream_id.unique())].join(
            wb_atts,
            on="wbID",
        ).join(
            tmp_flowlines[["geometry", "loop", "TotDASqKm"]],
            on="lineID",
        ).reset_index(drop=True))

    # create a point from the last coordinate, which is the furthest one downstream
    drain_pts.geometry = pg.get_point(drain_pts.geometry.values.data, -1)

    # drop any that are downstream terminals; these are most likely waterbodies
    # that do not have further downstream networks (e.g., flow to ocean)
    ix = joins.loc[joins.upstream_id.isin(drain_pts.lineID)
                   & (joins.downstream_id == 0)].upstream_id
    drain_pts = drain_pts.loc[~drain_pts.lineID.isin(ix)].copy()

    ### Find all drain points that share the same geometry.
    # These are most likely multiple segments that terminate in same drain point,
    # so we need to assign them their common downstream ID instead so that
    # snapping dams to these works properly later (otherwise snapped to only one of segments)
    drain_pts["hash"] = pd.util.hash_array(
        pg.to_wkb(drain_pts.geometry.values.data))
    s = drain_pts.groupby("hash").size()
    ix = drain_pts.hash.isin(s[s > 1].index)
    if ix.sum():
        print(f"Deduplicating {ix.sum():,} duplicate drain points")
        # find downstream_id for each of these, and deduplicate if there are multiple
        # downstreams, favoring the non-loops
        j = (joins.loc[joins.upstream_id.isin(drain_pts.loc[ix].lineID)
                       & (joins.downstream_id != 0),
                       ["upstream_id", "downstream_id", "loop"], ].sort_values(
                           by=["upstream_id", "loop"], ascending=True).groupby(
                               "upstream_id").first().downstream_id)

        drain_pts = drain_pts.join(j, on="lineID")

        # for those at same location that share the same downstream line, use that line instead
        s = (drain_pts.loc[drain_pts.downstream_id.notnull()].groupby(
            "downstream_id").size())
        ix = drain_pts.downstream_id.isin(s[s > 1].index.astype("uint32"))
        drain_pts.loc[ix, "lineID"] = drain_pts.loc[ix].downstream_id.astype(
            "uint32")
        # update the line properties to match that lineID
        lids = drain_pts.loc[ix].lineID.values
        drain_pts.loc[ix, "flowlineLength"] = flowlines.loc[lids,
                                                            "length"].values
        drain_pts.loc[ix, "loop"] = flowlines.loc[lids].loop.values
        drain_pts.loc[ix, "TotDASqKm"] = flowlines.loc[lids].TotDASqKm.values
        drain_pts = drain_pts.drop(columns=["downstream_id"])

    # keep the first unique drain point and sort the rest so they are oriented
    # from upstream to downstream
    drain_pts = (drain_pts.drop(columns=["hash"]).groupby(
        ["lineID", "wbID"]).first().sort_values(by="TotDASqKm",
                                                ascending=True).reset_index())

    drain_pts = gp.GeoDataFrame(drain_pts,
                                geometry="geometry",
                                crs=flowlines.crs)

    ### Deduplicate drains by network topology
    # Find the downstream-most drains for waterbodies when there are multiple distinct ones per waterbody.
    # These may result from flowlines that cross in and out of waterbodies multiple
    # times (not valid), or there may be drains on downstream loops
    # (esp. at dams) (valid).

    dups = drain_pts.groupby("wbID").size() > 1
    if dups.sum():
        print(
            f"Found {dups.sum():,} waterbodies with multiple drain points; cleaing up"
        )
        # find all waterbodies that have duplicate drains
        ix = drain_pts.wbID.isin(dups[dups].index)
        wb_ids = drain_pts.loc[ix].wbID.unique()
        # find all corresponding line IDs for these waterbodies
        line_ids = wb_joins.loc[wb_joins.wbID.isin(wb_ids)].lineID.unique()
        lines_per_wb = (drain_pts.loc[drain_pts.wbID.isin(wb_ids)].groupby(
            "wbID").lineID.unique())
        # search within 20 degrees removed from ids; this hopefully
        # picks up any gaps where lines exit waterbodies for a ways then re-enter
        # some floodplain areas have very big loops outside waterbody
        pairs = find_joins(
            joins,
            line_ids,
            downstream_col="downstream_id",
            upstream_col="upstream_id",
            expand=20,
        )[["upstream_id", "downstream_id"]]

        # remove any terminal points
        pairs = pairs.loc[(pairs.upstream_id != 0)
                          & (pairs.downstream_id != 0)]

        # create a directed graph facing DOWNSTREAM
        graph = DirectedGraph(pairs,
                              source="upstream_id",
                              target="downstream_id")
        # find all lines that are upstream of other lines
        # these are "parents" in the directed graph
        upstreams = graph.find_all_parents(lines_per_wb.values)
        ix = pd.Series(upstreams).explode().dropna().unique()
        print(
            f"Dropping {len(ix):,} drains that are upstream of other drains in the same waterbody"
        )
        drain_pts = drain_pts.loc[~drain_pts.lineID.isin(ix)]

    ### check if drain points are on a loop and very close to the junction
    # of the loop and nonloop (e.g., Hoover Dam, HUC2 == 15)
    drain_pts["snap_to_junction"] = False
    drain_pts["snap_dist"] = 0

    drains_by_wb = drain_pts.groupby("wbID").size()
    multiple_drain_wb = drains_by_wb[drains_by_wb > 1].index

    # limit this to drain points on loops where there are multiple drains per waterbody
    loop_pts = drain_pts.loc[drain_pts.loop &
                             (drain_pts.wbID.isin(multiple_drain_wb))].copy()

    # search within 3 degrees removed from ids; this hopefully
    # picks up any downstream junctions
    pairs = find_joins(
        joins,
        loop_pts.lineID.unique(),
        downstream_col="downstream_id",
        upstream_col="upstream_id",
        expand=3,
    )[["upstream_id", "downstream_id"]]

    # drop endpoints
    pairs = pairs.loc[(pairs.upstream_id != 0)
                      & (pairs.downstream_id != 0)].copy()

    # find all junctions that have > 1 flowline upstream of them
    grouped = pairs.groupby("downstream_id").size()
    downstream_junctions = grouped[grouped > 1].index
    # extract the upstream endpoint for each junction line
    downstream_junction_pts = pd.Series(
        pg.get_point(flowlines.loc[downstream_junctions].geometry.values.data,
                     0),
        index=downstream_junctions,
    )
    # find the nearest junctions within 5m tolerance of drain points on loops
    tree = pg.STRtree(downstream_junction_pts.values.data)
    left, right = tree.nearest_all(loop_pts.geometry.values.data,
                                   max_distance=5)

    # make sure they are connected on the network
    g = DirectedGraph(pairs, source="upstream_id", target="downstream_id")
    ix = g.is_reachable(loop_pts.iloc[left].lineID.values,
                        downstream_junction_pts.iloc[right].index)
    left = left[ix]
    right = right[ix]

    if len(left):
        print(
            f"Found {len(left)} drains on loops within 5m upstream of a junction, updating them..."
        )
        # NOTE: these are attributed to the flowline that is DOWNSTREAM of the junction point
        # whereas other drains are attributed to the flowline upstream of themselves
        ix = loop_pts.index.take(left)
        drain_pts.loc[ix, "snap_to_junction"] = True
        drain_pts.loc[ix, "snap_dist"] = pg.distance(
            drain_pts.loc[ix].geometry.values.data,
            downstream_junction_pts.iloc[right].values,
        )
        drain_pts.loc[ix, "lineID"] = downstream_junction_pts.iloc[right].index
        drain_pts.loc[ix,
                      "geometry"] = downstream_junction_pts.iloc[right].values

    ### Extract the drain points of upstream headwaters waterbodies
    # these are flowlines that originate at a waterbody
    wb_geom = waterbodies.loc[waterbodies.flowlineLength == 0].geometry
    wb_geom = pd.Series(wb_geom.values.data, index=wb_geom.index)
    # take only the upstream most point
    tmp_flowline_pts = tmp_flowlines[["geometry", "loop", "TotDASqKm"]].copy()
    tmp_flowline_pts["geometry"] = pg.get_point(flowlines.geometry.values.data,
                                                0)
    fl_pt = pd.Series(tmp_flowline_pts.geometry.values.data,
                      index=tmp_flowline_pts.index)
    headwaters = (sjoin_geometry(
        wb_geom, fl_pt, predicate="intersects").rename("lineID").reset_index())
    headwaters = (headwaters.join(
        wb_atts,
        on="wbID",
    ).join(
        tmp_flowline_pts,
        on="lineID",
    ).reset_index(drop=True))
    headwaters["headwaters"] = True
    headwaters["snap_to_junction"] = False
    headwaters["snap_dist"] = 0
    print(
        f"Found {len(headwaters):,} headwaters waterbodies, adding drain points for these too"
    )

    drain_pts["headwaters"] = False
    drain_pts = drain_pts.append(headwaters, sort=False,
                                 ignore_index=True).reset_index(drop=True)

    # join in line properties
    drain_pts = drain_pts.drop(columns=["loop", "TotDASqKm"]).join(
        tmp_flowlines.drop(columns=["geometry"]), on="lineID")

    # calculate unique index
    huc_id = drain_pts["HUC4"].astype("uint16") * 1000000
    drain_pts["drainID"] = drain_pts.index.values.astype("uint32") + huc_id

    # Convert back to GeoDataFrame; above steps make it into a DataFrame
    drain_pts = gp.GeoDataFrame(drain_pts,
                                geometry="geometry",
                                crs=flowlines.crs)
    drain_pts.wbID = drain_pts.wbID.astype("uint32")
    drain_pts.lineID = drain_pts.lineID.astype("uint32")
    drain_pts.flowlineLength = drain_pts.flowlineLength.astype("float32")

    print("Done extracting {:,} waterbody drain points in {:.2f}s".format(
        len(drain_pts),
        time() - start))

    return drain_pts
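The duplicate-drain detection above hashes WKB to find coincident geometries; a minimal standalone sketch of that trick with hypothetical points:

import numpy as np
import pandas as pd
import pygeos as pg

pts = np.array([pg.points(0, 0), pg.points(1, 1), pg.points(0, 0)])
df = pd.DataFrame({"geometry": pts})
df["hash"] = pd.util.hash_array(pg.to_wkb(df.geometry.values))
s = df.groupby("hash").size()
print(df.hash.isin(s[s > 1].index))  # True for the two coincident points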
    # NOTE: downstreams is indexed on id, not dams.index
    downstreams = (lines_by_dam.apply(find_downstreams).reset_index().explode(
        "lineID").drop_duplicates().set_index("id").lineID)

    # Now can just reduce dams back to these lineIDs
    dams = (dams[["id", "GNIS_Name", "geometry"]].join(
        downstreams, on="id",
        how="inner").drop_duplicates(subset=["id", "lineID"]).join(
            flowlines.geometry.rename("line"),
            on="lineID").reset_index(drop=True))
    print("Found {:,} joins between NHD dams and flowlines".format(len(dams)))

    ### Extract representative point
    # Look at either end of overlapping line and use that as representative point.
    # Otherwise intersect and extract first coordinate of overlapping line
    first = pg.get_point(dams.line, 0)
    intersects_first = pg.intersects(dams.geometry, first)
    ix = intersects_first
    dams.loc[ix, "pt"] = first.loc[ix]

    ix = ~intersects_first
    last = pg.get_point(dams.loc[ix].line, -1)
    intersects_last = pg.intersects(dams.loc[ix].geometry, last)
    last = last.loc[intersects_last]
    dams.loc[last.index, "pt"] = last

    ix = dams.pt.isnull()
    # WARNING: this might fail for odd intersection geoms
    pt = pg.get_point(
        pg.intersection(dams.loc[ix].geometry, dams.loc[ix].line), 0).dropna()
    dams.loc[pt.index, "pt"] = pt
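The fallback above takes the first coordinate of the dam/flowline overlap; a tiny sketch of that geometry operation, using a hypothetical dam footprint:

import pygeos as pg

dam = pg.buffer(pg.points(5, 0), 1)       # stand-in dam polygon
line = pg.linestrings([(0, 0), (10, 0)])  # flowline crossing it
overlap = pg.intersection(dam, line)      # LINESTRING (4 0, 6 0)
print(pg.get_point(overlap, 0))           # POINT (4 0), the upstream end here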