def extract_marine(gdb_path, target_crs):
    """Extract areas from NHDWaterbody and NHDArea that are marine connected.

    Features are read from both layers, filtered by FType (``AREA_FTYPES`` for
    NHDArea, ``WB_FTYPES`` for NHDWaterbody), combined, projected to
    ``target_crs`` and exploded.

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs: GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.

    Returns
    -------
    GeoDataFrame
        Combined features from both layers.  When no features match the
        filters, the empty frame is returned as read (not projected or
        exploded).
    """
    # Local import: keeps this function self-contained regardless of which
    # aliases the surrounding module imports.
    import pandas as pd

    print("Reading marine areas...")

    # Build the IN (...) list by joining element reprs rather than formatting
    # a tuple: a 1-element tuple renders as "(x,)", which is not valid SQL.
    # For 0 or 2+ elements this produces exactly the same text as the tuple.
    area_ftypes = ", ".join(repr(t) for t in AREA_FTYPES)
    area = read_dataframe(
        gdb_path,
        layer="NHDArea",
        columns=COLS,
        force_2d=True,
        where=f"FType in ({area_ftypes})",
    )

    wb = read_dataframe(
        gdb_path,
        layer="NHDWaterbody",
        columns=COLS,
        force_2d=True,
        # more complex expression when list is size 1
        where=f"FType in ({','.join([str(t) for t in WB_FTYPES])})",
    )

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent
    # (original index labels are kept, as with append's defaults).
    df = pd.concat([area, wb])

    # only project / explode when there is something to process
    if len(df):
        df = explode(df.to_crs(target_crs))

    return df
nhd_pts.append(nhd_lines, ignore_index=True, sort=False)
    .append(nhd_areas, ignore_index=True, sort=False)
    .reset_index(drop=True)
)
# NOTE(review): the statement above begins before this excerpt; it stacks the
# point, line and area dam records into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0 -- consider pd.concat if the
# pinned pandas version is upgraded.

# find contiguous groups for dissolve
# presumably find_contiguous_groups returns a frame / series with a "group"
# column indexed like nhd_dams (the code below reads nhd_dams.group) -- verify
nhd_dams = nhd_dams.join(find_contiguous_groups(nhd_dams.geometry.values.data))

# fill in the isolated dams: rows with no group after the join each get their
# own new group id, numbered sequentially after the current maximum
ix = nhd_dams.group.isnull()
# NOTE(review): if no groups were found at all, max() is NaN and the ids
# assigned below are NaN too -- confirm this cannot occur
next_group = nhd_dams.group.max() + 1
nhd_dams.loc[ix, "group"] = next_group + np.arange(ix.sum())
nhd_dams.group = nhd_dams.group.astype("uint")

print("Dissolving overlapping dams")
# explode to single-part geometries, then dissolve per (HUC2, source, group);
# the attribute columns are combined into comma-separated strings of the
# distinct values in each group
nhd_dams = dissolve(
    explode(nhd_dams),
    by=["HUC2", "source", "group"],
    agg={
        # distinct non-empty names only (falsy values are skipped)
        "GNIS_Name": lambda n: ", ".join({s for s in n if s}),
        # NOTE(review): the original comment here read "set missing NHD fields
        # as 0", but these aggregators only join the string form of every
        # distinct value; nothing is set to 0 in this excerpt
        "FType": lambda n: ", ".join({str(s) for s in n}),
        "FCode": lambda n: ", ".join({str(s) for s in n}),
        "NHDPlusID": lambda n: ", ".join({str(s) for s in n}),
    },
).reset_index(drop=True)

# fill in missing values
nhd_dams.GNIS_Name = nhd_dams.GNIS_Name.fillna("")

# repair geometries after the dissolve (make_valid is applied to every row)
nhd_dams.geometry = pg.make_valid(nhd_dams.geometry.values.data)
# Combine NHD and NWI waterbodies for this HUC2, keeping only the geometry and
# the "altered" flag.
# NOTE(review): DataFrame.append was removed in pandas 2.0 -- consider
# pd.concat if the pinned pandas version is upgraded.
nwi = gp.read_feather(nwi_dir / huc2 / "waterbodies.feather")
df = nhd[["geometry", "altered"]].append(nwi[["geometry", "altered"]])

# keep a copy of the altered polygons; the flag is lost in the dissolve below
# and is re-derived from these afterwards
altered = df.loc[df.altered].copy()

# region 03 additionally includes the South Carolina waterbodies dataset
if huc2 == "03":
    sc = gp.read_feather("data/states/sc/sc_waterbodies.feather", columns=[])
    sc["altered"] = False  # unknown
    df = df.append(sc[["geometry", "altered"]])

print(f"Dissolving {len(df):,} waterbodies...")
dissolve_start = time()
# dissolve everything together via a constant key, then split the result back
# into individual polygons with a fresh 0..n-1 index
df["tmp"] = 1
df = dissolve(df, by="tmp").drop(columns=["tmp"])
df = explode(df).reset_index(drop=True)
print(f"Now have {len(df):,} waterbodies ({time() - dissolve_start:,.2f}s)")

# assign altered if any resulting polygons intersect altered polygons
# `right` holds positions into the tree (i.e. rows of df); these are usable as
# .loc labels because df was just reset to a 0..n-1 index.
# NOTE(review): no predicate is passed to query_bulk, so this is presumably a
# bounding-box overlap test rather than a true geometric intersection --
# confirm that is intended
tree = pg.STRtree(df.geometry.values.data)
left, right = tree.query_bulk(altered.geometry.values.data)
df["altered"] = False
df.loc[np.unique(right), "altered"] = True

# cut at breaks from NHD
# only attempted when an nhd_lines file exists for this HUC2
nhd_lines_filename = nhd_dir / huc2 / "nhd_lines.feather"
if nhd_lines_filename.exists():
    print("Checking for breaks between adjacent waterbodies")
    nhd_lines = gp.read_feather(nhd_lines_filename).geometry.values.data
    breaks = find_nhd_waterbody_breaks(nhd.geometry.values.data, nhd_lines)
rivers = append( rivers, df.loc[(df.nwi_type == "Riverine") & (df.altered)].drop(columns=["nwi_type"]), ) ### Process waterbodies # only keep that intersect flowlines print(f"Extracted {len(waterbodies):,} NWI lakes and ponds") left, right = tree.query_bulk(waterbodies.geometry.values.data, predicate="intersects") waterbodies = waterbodies.iloc[np.unique(left)].reset_index(drop=True) print(f"Kept {len(waterbodies):,} that intersect flowlines") # TODO: explode, repair, dissolve, explode, reset index waterbodies = explode(waterbodies) # make valid ix = ~pg.is_valid(waterbodies.geometry.values.data) if ix.sum(): print(f"Repairing {ix.sum():,} invalid waterbodies") waterbodies.loc[ix, "geometry"] = pg.make_valid( waterbodies.loc[ix].geometry.values.data) # note: nwi_code, nwi_type are discarded here since they aren't used later print("Dissolving adjacent waterbodies") waterbodies = dissolve(waterbodies, by=["altered"]) waterbodies = explode(waterbodies).reset_index(drop=True) waterbodies["km2"] = pg.area(waterbodies.geometry.values.data) / 1e6 waterbodies.to_feather(huc2_dir / "waterbodies.feather")
def cut_lines_by_waterbodies(flowlines, joins, waterbodies, next_lineID):
    """Cut flowlines at waterbody edges and flag flowlines inside waterbodies.

    1. Finds all intersections between waterbodies and flowlines.
    2. For those that cross but are not completely contained by waterbodies,
       cut them.
    3. Evaluate the cuts, only those that have substantive cuts inside and
       outside are retained as cuts.
    4. Any flowlines that are not contained or crossing waterbodies are
       dropped from the returned waterbody joins.

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
        flowline joins
    waterbodies : GeoDataFrame
    next_lineID : int
        next lineID; must be greater than all prior lines in region

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame, DataFrame)
        (flowlines, joins, waterbody joins).  ``flowlines`` gains a boolean
        ``waterbody`` column that is True for lines present in the waterbody
        joins; the waterbody joins have one row per unique (lineID, wbID).

    Raises
    ------
    ValueError
        if any flowline is contained by more than one waterbody
    """
    start = time()

    ### Find flowlines that intersect waterbodies
    join_start = time()
    tree = pg.STRtree(flowlines.geometry.values.data)
    # left: positions into waterbodies (query input); right: positions into
    # flowlines (tree members)
    left, right = tree.query_bulk(
        waterbodies.geometry.values.data, predicate="intersects"
    )
    df = pd.DataFrame(
        {
            "lineID": flowlines.index.take(right),
            "flowline": flowlines.geometry.values.data.take(right),
            "wbID": waterbodies.index.take(left),
            "waterbody": waterbodies.geometry.values.data.take(left),
        }
    )
    print(
        f"Found {len(df):,} waterbody / flowline joins in {time() - join_start:.2f}s"
    )

    ### Find those that are completely contained; these don't need further processing
    pg.prepare(df.waterbody.values)

    # find those that are fully contained and do not touch the edge of the
    # waterbody (contains_properly predicate)
    # contains_properly is very fast
    contained_start = time()
    df["contains"] = pg.contains_properly(df.waterbody.values, df.flowline.values)
    print(
        f"Identified {df.contains.sum():,} flowlines fully within waterbodies in {time() - contained_start:.2f}s"
    )

    # of those not properly contained, find the ones that are still contained
    # but touch the edge of the waterbody (contains predicate)
    contained_start = time()
    ix = ~df.contains
    tmp = df.loc[ix]
    df.loc[ix, "contains"] = pg.contains(tmp.waterbody, tmp.flowline)
    print(
        f"Identified {df.loc[ix].contains.sum():,} more flowlines contained by waterbodies in {time() - contained_start:.2f}s"
    )

    # Sanity check: flowlines should only ever be contained by one waterbody
    if df.loc[df.contains].groupby("lineID").size().max() > 1:
        raise ValueError(
            "ERROR: one or more lines contained by multiple waterbodies")

    # for any that are not completely contained, find the ones that overlap
    crosses_start = time()
    df["crosses"] = False
    ix = ~df.contains
    tmp = df.loc[ix]
    df.loc[ix, "crosses"] = pg.crosses(tmp.waterbody, tmp.flowline)
    print(
        f"Identified {df.crosses.sum():,} flowlines that cross edge of waterbodies in {time() - crosses_start:.2f}s"
    )

    # discard any that only touch (ones that don't cross or are contained)
    # note that we only cut the ones that cross below; contained ones are left intact
    df = df.loc[df.contains | df.crosses].copy()

    print("Intersecting flowlines and waterbodies...")
    cut_start = time()
    ix = df.crosses
    tmp = df.loc[ix]
    df["geometry"] = df.flowline
    # use intersection to cut flowlines by waterbodies.  Note: this may produce
    # nonlinear (e.g., geom collection) results
    df.loc[ix, "geometry"] = pg.intersection(tmp.flowline, tmp.waterbody)
    df["length"] = pg.length(df.geometry)
    df["flength"] = pg.length(df.flowline)

    # Cut lines that are long enough and different enough from the original lines
    df["to_cut"] = False
    tmp = df.loc[df.crosses]
    keep = (
        tmp.crosses
        & (tmp.length >= CUT_TOLERANCE)
        & ((tmp.flength - tmp.length).abs() >= CUT_TOLERANCE)
    )
    df.loc[keep[keep].index, "to_cut"] = True

    # fraction of each flowline that is inside its waterbody
    df["inside"] = (df.length / df.flength).clip(0, 1)
    print(
        f"Found {df.to_cut.sum():,} segments that need to be cut by flowlines in {time() - cut_start:.2f}s"
    )

    # save all that are completely contained or mostly contained.
    # They must be at least 50% in waterbody to be considered mostly contained.
    # Note: there are some that are mostly outside and we exclude those here.
    # We then update this after cutting
    contained = df.loc[df.inside >= 0.5, ["wbID", "lineID"]].copy()

    ### Cut lines
    if df.to_cut.sum():
        # only work with those to cut from here on out
        df = df.loc[
            df.to_cut, ["lineID", "flowline", "wbID", "waterbody"]
        ].reset_index(drop=True)

        # save waterbody ids to re-evaluate intersection after cutting
        wbID = df.wbID.unique()

        # extract all intersecting interior rings for these waterbodies
        print("Extracting interior rings for intersected waterbodies")
        wb = waterbodies.loc[waterbodies.index.isin(wbID)]
        outer_index, _, rings = get_interior_rings(wb.geometry.values.data)
        if len(outer_index):
            # find the pairs of waterbody rings and lines to add
            rings = np.asarray(rings)
            wb_with_rings = wb.index.values.take(outer_index)
            lines_in_wb = df.loc[df.wbID.isin(wb_with_rings)].lineID.unique()
            lines_in_wb = flowlines.loc[
                flowlines.index.isin(lines_in_wb)
            ].geometry
            tree = pg.STRtree(rings)
            # here left indexes the lines (query input) and right the rings
            left, right = tree.query_bulk(
                lines_in_wb.values.data, predicate="intersects"
            )
            tmp = pd.DataFrame(
                {
                    "lineID": lines_in_wb.index.values.take(left),
                    "flowline": lines_in_wb.values.data.take(left),
                    "wbID": wb_with_rings.take(right),
                    "waterbody": rings.take(right),
                }
            )
            # DataFrame.append was removed in pandas 2.0; concat is equivalent
            df = pd.concat([df, tmp], ignore_index=True, sort=False)

        # extract the outer ring for original waterbodies
        # (type id 3 == polygon; the interior rings added above are already rings)
        ix = pg.get_type_id(df.waterbody.values.data) == 3
        df.loc[ix, "waterbody"] = pg.get_exterior_ring(
            df.loc[ix].waterbody.values.data)

        # Calculate all geometric intersections between the flowlines and
        # waterbody rings and drop any that are not points
        # Note: these may be multipoints where line crosses the ring of waterbody
        # multiple times.
        # We ignore any shared edges, etc that result from the intersection; those
        # aren't helpful for cutting the lines
        print("Finding cut points...")
        df["geometry"] = pg.intersection(df.flowline.values, df.waterbody.values)
        df = explode(
            explode(
                gp.GeoDataFrame(
                    df[["geometry", "lineID", "flowline"]], crs=flowlines.crs
                )
            )
        ).reset_index()
        # type id 0 == point
        points = (
            df.loc[pg.get_type_id(df.geometry.values.data) == 0]
            .set_index("lineID")
            .geometry
        )

        print("cutting flowlines")
        cut_start = time()
        flowlines, joins = cut_flowlines_at_points(
            flowlines, joins, points, next_lineID=next_lineID
        )
        new_flowlines = flowlines.loc[flowlines.new]

        print(
            f"{len(new_flowlines):,} new flowlines created in {time() - cut_start:,.2f}s"
        )

        if len(new_flowlines):
            # remove any flowlines no longer present (they were replaced by cut lines)
            contained = contained.loc[
                contained.lineID.isin(flowlines.loc[~flowlines.new].index.unique())
            ].copy()

            contained_start = time()
            # recalculate overlaps with waterbodies
            print("Recalculating overlaps with waterbodies")
            wb = waterbodies.loc[wbID]
            tree = pg.STRtree(new_flowlines.geometry.values.data)
            left, right = tree.query_bulk(
                wb.geometry.values.data, predicate="intersects"
            )

            df = pd.DataFrame(
                {
                    "lineID": new_flowlines.index.take(right),
                    "flowline": new_flowlines.geometry.values.data.take(right),
                    "wbID": wb.index.take(left),
                    "waterbody": wb.geometry.values.data.take(left),
                }
            )

            pg.prepare(df.waterbody.values)
            df["contains"] = pg.contains(df.waterbody.values, df.flowline.values)
            print(
                f"Identified {df.contains.sum():,} more flowlines contained by waterbodies in {time() - contained_start:.2f}s"
            )

            # some aren't perfectly contained, add those that are mostly in
            df["crosses"] = False
            ix = ~df.contains
            tmp = df.loc[ix]
            df.loc[ix, "crosses"] = pg.crosses(tmp.waterbody, tmp.flowline)

            # discard any that only touch (don't cross or are contained)
            df = df.loc[df.contains | df.crosses].copy()

            # Recompute the mask against the filtered frame (same rows as
            # before: the non-contained rows that survive are exactly the
            # crossing ones) instead of reusing the pre-filter mask.
            ix = df.crosses
            tmp = df.loc[ix]
            df["geometry"] = df.flowline
            # use intersection to cut flowlines by waterbodies.  Note: this may
            # produce nonlinear (e.g., geom collection) results
            df.loc[ix, "geometry"] = pg.intersection(tmp.flowline, tmp.waterbody)
            df["length"] = pg.length(df.geometry)
            df["flength"] = pg.length(df.flowline)

            # keep any that are contained or >= 50% in waterbody
            contained = pd.concat(
                [
                    contained,
                    df.loc[
                        df.contains | ((df.length / df.flength) >= 0.5),
                        ["wbID", "lineID"],
                    ],
                ],
                ignore_index=True,
            )

        # "new" is a working column used above to pick out the cut lines;
        # drop it before returning
        flowlines = flowlines.drop(columns=["new"])

    # make sure that updated joins are unique
    joins = joins.drop_duplicates()

    # make sure that wb_joins is unique
    contained = contained.groupby(by=["lineID", "wbID"]).first().reset_index()

    # set flag for flowlines in waterbodies
    flowlines["waterbody"] = flowlines.index.isin(contained.lineID.unique())

    print("Done evaluating waterbody / flowline overlap in {:.2f}s".format(
        time() - start))

    return flowlines, joins, contained
# Clip HUC4 areas outside state boundaries; these are remainder state_merged = pg.coverage_union_all(state_df.geometry.values.data) # find all that intersect but are not contained tree = pg.STRtree(huc4_df.geometry.values.data) intersects_ix = tree.query(state_merged, predicate="intersects") contains_ix = tree.query(state_merged, predicate="contains") ix = np.setdiff1d(intersects_ix, contains_ix) outer_huc4 = huc4_df.iloc[ix].copy() outer_huc4["km2"] = pg.area(outer_huc4.geometry.values.data) / 1e6 # calculate geometric difference, explode, and keep non-slivers outer_huc4["geometry"] = pg.difference(outer_huc4.geometry.values.data, state_merged) outer_huc4 = explode(outer_huc4) outer_huc4["clip_km2"] = pg.area(outer_huc4.geometry.values.data) / 1e6 outer_huc4["percent"] = 100 * outer_huc4.clip_km2 / outer_huc4.km2 keep_huc4 = outer_huc4.loc[outer_huc4.clip_km2 >= 100].HUC4.unique() outer_huc4 = outer_huc4.loc[outer_huc4.HUC4.isin(keep_huc4) & (outer_huc4.clip_km2 >= 2.5)].copy() outer_huc4 = dissolve(outer_huc4, by="HUC4", agg={ "HUC2": "first" }).reset_index(drop=True) outer_huc4.to_feather(out_dir / "outer_huc4.feather") write_dataframe(outer_huc4, out_dir / "outer_huc4.gpkg") ### Counties - within HUC4 bounds print("Processing counties") fips = sorted(state_df.STATEFIPS.unique())