Esempio n. 1
0
def extract_marine(gdb_path, target_crs):
    """Extract areas from NHDWaterbody and NHDArea that are marine connected.

    Features are selected by FType (``AREA_FTYPES`` for NHDArea,
    ``WB_FTYPES`` for NHDWaterbody), combined into a single data frame,
    projected to ``target_crs`` and exploded.

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs: GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.

    Returns
    -------
    GeoDataFrame
        Combined NHDArea and NHDWaterbody records.  If no records were read,
        the combined (empty) frame is returned without projecting or
        exploding it.
    """
    # Local import: DataFrame.append was removed in pandas 2.0, and this
    # avoids relying on a module-level pandas alias.
    import pandas as pd

    def ftype_filter(ftypes):
        # Build the list explicitly instead of formatting a tuple: the repr of
        # a 1-element tuple is "(x,)", which is not a valid SQL IN list.
        return f"FType in ({','.join([str(t) for t in ftypes])})"

    print("Reading marine areas...")
    area = read_dataframe(
        gdb_path,
        layer="NHDArea",
        columns=COLS,
        force_2d=True,
        where=ftype_filter(AREA_FTYPES),
    )

    wb = read_dataframe(
        gdb_path,
        layer="NHDWaterbody",
        columns=COLS,
        force_2d=True,
        where=ftype_filter(WB_FTYPES),
    )

    # same result as area.append(wb): original index labels are retained
    df = pd.concat([area, wb])

    if len(df):
        df = explode(df.to_crs(target_crs))

    return df
    nhd_pts.append(nhd_lines, ignore_index=True, sort=False)
    .append(nhd_areas, ignore_index=True, sort=False)
    .reset_index(drop=True)
)

# find contiguous groups for dissolve
nhd_dams = nhd_dams.join(find_contiguous_groups(nhd_dams.geometry.values.data))
# fill in the isolated dams: each dam without a group gets its own new group id
ix = nhd_dams.group.isnull()
# If no dam was assigned to a group, group.max() is NaN and every new id would
# be NaN too (breaking the uint cast below); start numbering at 0 instead.
next_group = 0 if ix.all() else nhd_dams.group.max() + 1
nhd_dams.loc[ix, "group"] = next_group + np.arange(ix.sum())
nhd_dams.group = nhd_dams.group.astype("uint")

print("Dissolving overlapping dams")
nhd_dams = dissolve(
    explode(nhd_dams),
    by=["HUC2", "source", "group"],
    agg={
        # Unique values within each dissolved group are combined into one
        # comma-delimited string.  Values are sorted because iteration order
        # of a set of strings is not stable between runs.
        "GNIS_Name": lambda n: ", ".join(sorted({s for s in n if s})),
        # NHD attribute fields are converted to strings before combining
        "FType": lambda n: ", ".join(sorted({str(s) for s in n})),
        "FCode": lambda n: ", ".join(sorted({str(s) for s in n})),
        "NHDPlusID": lambda n: ", ".join(sorted({str(s) for s in n})),
    },
).reset_index(drop=True)

# fill in missing values
nhd_dams.GNIS_Name = nhd_dams.GNIS_Name.fillna("")

# dissolved geometries are passed through make_valid before further use
nhd_dams.geometry = pg.make_valid(nhd_dams.geometry.values.data)
Esempio n. 3
0
    nwi = gp.read_feather(nwi_dir / huc2 / "waterbodies.feather")

    # combine NHD and NWI waterbodies; only geometry and altered status are used
    df = nhd[["geometry", "altered"]].append(nwi[["geometry", "altered"]])

    # keep a copy of the altered polygons so that the altered status can be
    # reassigned after the dissolve below discards attributes
    altered = df.loc[df.altered].copy()

    if huc2 == "03":
        sc = gp.read_feather("data/states/sc/sc_waterbodies.feather", columns=[])
        sc["altered"] = False  # unknown
        df = df.append(sc[["geometry", "altered"]])

    print(f"Dissolving {len(df):,} waterbodies...")
    dissolve_start = time()
    # dissolve everything into one geometry using a constant key, then explode
    # back into individual polygons
    df["tmp"] = 1
    df = dissolve(df, by="tmp").drop(columns=["tmp"])
    df = explode(df).reset_index(drop=True)
    print(f"Now have {len(df):,} waterbodies ({time() - dissolve_start:,.2f}s)")

    # assign altered if any resulting polygons intersect altered polygons
    tree = pg.STRtree(df.geometry.values.data)
    # Without a predicate, query_bulk only compares bounding boxes, which would
    # flag polygons that merely have overlapping extents; require a true
    # intersection.  `right` holds positions in the tree (i.e., rows of df,
    # which has a fresh 0..n-1 index after reset_index above).
    left, right = tree.query_bulk(
        altered.geometry.values.data, predicate="intersects"
    )
    df["altered"] = False
    df.loc[np.unique(right), "altered"] = True

    # cut at breaks from NHD
    nhd_lines_filename = nhd_dir / huc2 / "nhd_lines.feather"
    if nhd_lines_filename.exists():
        print("Checking for breaks between adjacent waterbodies")
        nhd_lines = gp.read_feather(nhd_lines_filename).geometry.values.data
        breaks = find_nhd_waterbody_breaks(nhd.geometry.values.data, nhd_lines)
Esempio n. 4
0
        rivers = append(
            rivers,
            df.loc[(df.nwi_type == "Riverine")
                   & (df.altered)].drop(columns=["nwi_type"]),
        )

    ### Process waterbodies
    # retain only the waterbodies that intersect flowlines
    print(f"Extracted {len(waterbodies):,} NWI lakes and ponds")
    left, right = tree.query_bulk(
        waterbodies.geometry.values.data, predicate="intersects"
    )
    # `left` indexes the queried waterbodies; each may match several tree
    # geometries, so deduplicate before selecting
    waterbodies = waterbodies.iloc[np.unique(left)].reset_index(drop=True)
    print(f"Kept {len(waterbodies):,} that intersect flowlines")

    # TODO: explode, repair, dissolve, explode, reset index
    waterbodies = explode(waterbodies)

    # repair any geometries that are not valid
    ix = ~pg.is_valid(waterbodies.geometry.values.data)
    if ix.any():
        print(f"Repairing {ix.sum():,} invalid waterbodies")
        invalid_geoms = waterbodies.loc[ix].geometry.values.data
        waterbodies.loc[ix, "geometry"] = pg.make_valid(invalid_geoms)

    # note: nwi_code, nwi_type are discarded here since they aren't used later
    print("Dissolving adjacent waterbodies")
    dissolved = dissolve(waterbodies, by=["altered"])
    waterbodies = explode(dissolved).reset_index(drop=True)

    # area divided by 1e6 (m2 to km2, assuming CRS units are meters)
    waterbodies["km2"] = pg.area(waterbodies.geometry.values.data) / 1e6

    waterbodies.to_feather(huc2_dir / "waterbodies.feather")
def cut_lines_by_waterbodies(flowlines, joins, waterbodies, next_lineID):
    """Cut flowlines at waterbody edges and find flowlines within waterbodies.

    1. Finds all intersections between waterbodies and flowlines.
    2. For those that cross but are not completely contained by waterbodies,
       cut them.
    3. Evaluate the cuts, only those that have substantive cuts inside and
       outside are retained as cuts.
    4. Any flowlines that are not contained or crossing waterbodies are
       dropped from the waterbody joins.

    Parameters
    ----------
    flowlines : GeoDataFrame
        flowlines; the index values are used as lineID
    joins : DataFrame
        flowline joins
    waterbodies : GeoDataFrame
        waterbodies; the index values are used as wbID
    next_lineID : int
        next lineID; must be greater than all prior lines in region

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame, DataFrame)
        (flowlines, joins, waterbody joins).  Flowlines have a boolean
        "waterbody" column that is True for lines present in the waterbody
        joins.  Waterbody joins contain unique "lineID" / "wbID" pairs for
        lines that are contained by, or at least 50% within, a waterbody.

    Raises
    ------
    ValueError
        if any flowline is contained by more than one waterbody
    """

    start = time()

    ### Find flowlines that intersect waterbodies

    join_start = time()
    tree = pg.STRtree(flowlines.geometry.values.data)
    # left: positions of the queried waterbodies, right: positions of the
    # flowlines in the tree
    left, right = tree.query_bulk(waterbodies.geometry.values.data,
                                  predicate="intersects")
    df = pd.DataFrame({
        "lineID": flowlines.index.take(right),
        "flowline": flowlines.geometry.values.data.take(right),
        "wbID": waterbodies.index.take(left),
        "waterbody": waterbodies.geometry.values.data.take(left),
    })
    print(
        f"Found {len(df):,} waterbody / flowline joins in {time() - join_start:.2f}s"
    )

    ### Find those that are completely contained; these don't need further processing
    pg.prepare(df.waterbody.values)

    # find those that are fully contained and do not touch the edge of the waterbody (contains_properly predicate)
    # contains_properly is very fast
    contained_start = time()
    df["contains"] = pg.contains_properly(df.waterbody.values,
                                          df.flowline.values)
    print(
        f"Identified {df.contains.sum():,} flowlines fully within waterbodies in {time() - contained_start:.2f}s"
    )

    # of the remainder, find those that are contained but touch the edge of
    # the waterbody (contains predicate)
    contained_start = time()
    ix = ~df.contains
    tmp = df.loc[ix]
    df.loc[ix, "contains"] = pg.contains(tmp.waterbody, tmp.flowline)
    print(
        f"Identified {df.loc[ix].contains.sum():,} more flowlines contained by waterbodies in {time() - contained_start:.2f}s"
    )

    # Sanity check: flowlines should only ever be contained by one waterbody
    if df.loc[df.contains].groupby("lineID").size().max() > 1:
        raise ValueError(
            "ERROR: one or more lines contained by multiple waterbodies")

    # for any that are not completely contained, find the ones that overlap
    crosses_start = time()
    df["crosses"] = False
    ix = ~df.contains
    tmp = df.loc[ix]
    df.loc[ix, "crosses"] = pg.crosses(tmp.waterbody, tmp.flowline)
    print(
        f"Identified {df.crosses.sum():,} flowlines that cross edge of waterbodies in {time() - crosses_start:.2f}s"
    )

    # discard any that only touch (ones that don't cross or are contained)
    # note that we only cut the ones that cross below; contained ones are left intact
    df = df.loc[df.contains | df.crosses].copy()

    print("Intersecting flowlines and waterbodies...")
    cut_start = time()
    ix = df.crosses
    tmp = df.loc[ix]
    df["geometry"] = df.flowline
    # use intersection to cut flowlines by waterbodies.  Note: this may produce
    # nonlinear (e.g., geom collection) results
    df.loc[ix, "geometry"] = pg.intersection(tmp.flowline, tmp.waterbody)
    # length: portion of the flowline within the waterbody (the full line for
    # contained lines); flength: length of the full flowline
    df["length"] = pg.length(df.geometry)
    df["flength"] = pg.length(df.flowline)

    # Cut lines that are long enough and different enough from the original lines
    df["to_cut"] = False
    tmp = df.loc[df.crosses]
    keep = (tmp.length >= CUT_TOLERANCE) & (
        (tmp.flength - tmp.length).abs() >= CUT_TOLERANCE)
    df.loc[keep[keep].index, "to_cut"] = True
    # fraction of each flowline that is inside the waterbody
    df["inside"] = (df.length / df.flength).clip(0, 1)
    print(
        f"Found {df.to_cut.sum():,} segments that need to be cut by flowlines in {time() - cut_start:.2f}s"
    )

    # save all that are completely contained or mostly contained.
    # They must be at least 50% in waterbody to be considered mostly contained.
    # Note: there are some that are mostly outside and we exclude those here.
    # We then update this after cutting
    contained = df.loc[df.inside >= 0.5, ["wbID", "lineID"]].copy()

    ### Cut lines
    if df.to_cut.any():
        # only work with those to cut from here on out
        df = df.loc[df.to_cut,
                    ["lineID", "flowline", "wbID", "waterbody"]].reset_index(
                        drop=True)

        # save waterbody ids to re-evaluate intersection after cutting
        wbID = df.wbID.unique()

        # extract all intersecting interior rings for these waterbodies
        print("Extracting interior rings for intersected waterbodies")
        wb = waterbodies.loc[waterbodies.index.isin(wbID)]
        outer_index, inner_index, rings = get_interior_rings(
            wb.geometry.values.data)
        if len(outer_index):
            # find the pairs of waterbody rings and lines to add
            rings = np.asarray(rings)
            wb_with_rings = wb.index.values.take(outer_index)
            lines_in_wb = df.loc[df.wbID.isin(wb_with_rings)].lineID.unique()
            lines_in_wb = flowlines.loc[flowlines.index.isin(
                lines_in_wb)].geometry
            tree = pg.STRtree(rings)
            left, right = tree.query_bulk(lines_in_wb.values.data,
                                          predicate="intersects")

            tmp = pd.DataFrame({
                "lineID": lines_in_wb.index.values.take(left),
                "flowline": lines_in_wb.values.data.take(left),
                "wbID": wb_with_rings.take(right),
                "waterbody": rings.take(right),
            })
            # pd.concat is the supported replacement for DataFrame.append
            # (removed in pandas 2.0)
            df = pd.concat([df, tmp], ignore_index=True, sort=False)

        # extract the outer ring for original waterbodies (type id 3 is
        # Polygon; interior rings added above are already rings)
        ix = pg.get_type_id(df.waterbody.values) == 3
        df.loc[ix, "waterbody"] = pg.get_exterior_ring(
            df.loc[ix].waterbody.values)

        # Calculate all geometric intersections between the flowlines and
        # waterbody rings and drop any that are not points
        # Note: these may be multipoints where line crosses the ring of waterbody
        # multiple times.
        # We ignore any shared edges, etc that result from the intersection; those
        # aren't helpful for cutting the lines
        print("Finding cut points...")
        df["geometry"] = pg.intersection(df.flowline.values,
                                         df.waterbody.values)
        # explode is applied twice before selecting single points (type id 0)
        df = explode(
            explode(
                gp.GeoDataFrame(df[["geometry", "lineID", "flowline"]],
                                crs=flowlines.crs))).reset_index()
        points = (df.loc[pg.get_type_id(df.geometry.values.data) ==
                         0].set_index("lineID").geometry)

        print("cutting flowlines")
        cut_start = time()
        flowlines, joins = cut_flowlines_at_points(flowlines,
                                                   joins,
                                                   points,
                                                   next_lineID=next_lineID)
        # the "new" column returned with the cut flowlines flags lines created
        # by cutting
        new_flowlines = flowlines.loc[flowlines.new]

        print(
            f"{len(new_flowlines):,} new flowlines created in {time() - cut_start:,.2f}s"
        )

        if len(new_flowlines):
            # remove any flowlines no longer present (they were replaced by cut lines)
            contained = contained.loc[contained.lineID.isin(
                flowlines.loc[~flowlines.new].index.unique())].copy()

            contained_start = time()
            # recalculate overlaps with waterbodies
            print("Recalculating overlaps with waterbodies")
            wb = waterbodies.loc[wbID]
            tree = pg.STRtree(new_flowlines.geometry.values.data)
            left, right = tree.query_bulk(wb.geometry.values.data,
                                          predicate="intersects")

            df = pd.DataFrame({
                "lineID":
                new_flowlines.index.take(right),
                "flowline":
                new_flowlines.geometry.values.data.take(right),
                "wbID":
                wb.index.take(left),
                "waterbody":
                wb.geometry.values.data.take(left),
            })

            pg.prepare(df.waterbody.values)
            df["contains"] = pg.contains(df.waterbody.values,
                                         df.flowline.values)
            print(
                f"Identified {df.contains.sum():,} more flowlines contained by waterbodies in {time() - contained_start:.2f}s"
            )

            # some aren't perfectly contained, add those that are mostly in
            df["crosses"] = False
            ix = ~df.contains
            tmp = df.loc[ix]
            df.loc[ix, "crosses"] = pg.crosses(tmp.waterbody, tmp.flowline)

            # discard any that only touch (don't cross or are contained)
            df = df.loc[df.contains | df.crosses].copy()

            # Recompute the mask against the filtered frame rather than
            # reusing the one created before rows were discarded.
            ix = df.crosses
            tmp = df.loc[ix]
            df["geometry"] = df.flowline
            # use intersection to cut flowlines by waterbodies.  Note: this may produce
            # nonlinear (e.g., geom collection) results
            df.loc[ix, "geometry"] = pg.intersection(tmp.flowline,
                                                     tmp.waterbody)
            df["length"] = pg.length(df.geometry)
            df["flength"] = pg.length(df.flowline)

            # keep any that are contained or >= 50% in waterbody
            mostly_inside = df.contains | ((df.length / df.flength) >= 0.5)
            contained = pd.concat(
                [contained, df.loc[mostly_inside, ["wbID", "lineID"]]],
                ignore_index=True,
            )

        flowlines = flowlines.drop(columns=["new"])

    # make sure that updated joins are unique
    joins = joins.drop_duplicates()

    # make sure that wb_joins is unique
    contained = contained.groupby(by=["lineID", "wbID"]).first().reset_index()

    # set flag for flowlines in waterbodies
    flowlines["waterbody"] = flowlines.index.isin(contained.lineID.unique())

    print("Done evaluating waterbody / flowline overlap in {:.2f}s".format(
        time() - start))

    return flowlines, joins, contained
# Clip HUC4 areas outside state boundaries; these are remainder
state_merged = pg.coverage_union_all(state_df.geometry.values.data)

# select HUC4s that intersect the merged states but are not contained
tree = pg.STRtree(huc4_df.geometry.values.data)
intersects_ix = tree.query(state_merged, predicate="intersects")
contains_ix = tree.query(state_merged, predicate="contains")
ix = np.setdiff1d(intersects_ix, contains_ix)

outer_huc4 = huc4_df.iloc[ix].copy()
# area of each full HUC4, recorded before clipping
outer_huc4["km2"] = pg.area(outer_huc4.geometry.values.data) / 1e6

# remove the part covered by states, then split into individual parts
outer_huc4["geometry"] = pg.difference(
    outer_huc4.geometry.values.data, state_merged
)
outer_huc4 = explode(outer_huc4)
outer_huc4["clip_km2"] = pg.area(outer_huc4.geometry.values.data) / 1e6
outer_huc4["percent"] = 100 * outer_huc4.clip_km2 / outer_huc4.km2

# keep HUC4s that have at least one part >= 100 km2 outside the states, and
# within those drop the slivers (parts < 2.5 km2)
keep_huc4 = outer_huc4.loc[outer_huc4.clip_km2 >= 100, "HUC4"].unique()
in_kept_huc4 = outer_huc4.HUC4.isin(keep_huc4)
not_sliver = outer_huc4.clip_km2 >= 2.5
outer_huc4 = outer_huc4.loc[in_kept_huc4 & not_sliver].copy()

# merge the remaining parts back into one record per HUC4
outer_huc4 = dissolve(
    outer_huc4, by="HUC4", agg={"HUC2": "first"}
).reset_index(drop=True)
outer_huc4.to_feather(out_dir / "outer_huc4.feather")
write_dataframe(outer_huc4, out_dir / "outer_huc4.gpkg")

### Counties - within HUC4 bounds
print("Processing counties")
fips = sorted(state_df.STATEFIPS.unique())