if not out_dir.exists():
    os.makedirs(out_dir)

start = time()

huc2s = sorted(
    pd.read_feather(
        data_dir / "boundaries/huc4.feather", columns=["HUC2"]
    ).HUC2.unique()
)


### Merge NHD lines and areas that represent dams and dam-related features
print("Reading NHD points, lines, and areas, and merging...")

nhd_pts = read_feathers(
    [raw_dir / huc2 / "nhd_points.feather" for huc2 in huc2s],
    geo=True,
    new_fields={"HUC2": huc2s},
)

# keep only dam-related point features (NHD FType 343: Dam / Weir)
nhd_pts = nhd_pts.loc[nhd_pts.FType.isin([343])].copy()

# write original points for SARP
write_dataframe(nhd_pts, out_dir / "nhd_dam_pts_nhdpoint.fgb")

nhd_pts["source"] = "NHDPoint"

# create circular buffers to merge with others
nhd_pts["geometry"] = pg.buffer(nhd_pts.geometry.values.data, 5)

nhd_lines = read_feathers(
    [raw_dir / huc2 / "nhd_lines.feather" for huc2 in huc2s],
# "14", # "15", # "16", # "17", # "21", # ] start = time() ### Aggregate barriers kinds = ["waterfall", "dam", "small_barrier"] kind_ids = [WATERFALLS_ID, DAMS_ID, SB_ID] barriers = read_feathers( [barriers_dir / f"{kind}s.feather" for kind in kinds], geo=True, new_fields={"kind": kinds}, ) for kind, init_id in zip(kinds, kind_ids): ix = barriers.kind == kind barriers.loc[ix, "barrierID"] = barriers.loc[ix].id + init_id barriers.barrierID = barriers.barrierID.astype("uint64") barriers.to_feather(out_dir / "all_barriers.feather") if DEBUG: write_dataframe(barriers, out_dir / "all_barriers.fgb") ### Cut flowlines in each HUC2 for huc2 in huc2s:
dups.loc[ix, "dup_tolerance"] = DUPLICATE_TOLERANCE["likely duplicate"]

export_duplicate_areas(dups, qa_dir / "dams_duplicate_areas.fgb")


### Join to line atts
flowlines = (
    read_feathers(
        [
            nhd_dir / "clean" / huc2 / "flowlines.feather"
            for huc2 in df.HUC2.unique()
            if huc2
        ],
        columns=[
            "lineID",
            "NHDPlusID",
            "GNIS_Name",
            "sizeclass",
            "StreamOrde",
            "FCode",
            "loop",
        ],
    )
    .rename(columns={"StreamOrde": "StreamOrder"})
    .set_index("lineID")
)

df = df.join(flowlines, on="lineID")
df.StreamOrder = df.StreamOrder.fillna(-1).astype("int8")

# Add name from snapped flowline if not already present
    )
    .set_index("NHDPlusID")
    .rename(columns={"nat_floodplain_km2": "natfldkm2", "floodplain_km2": "fldkm2"})
)

floodplains["natfldpln"] = (100 * floodplains.natfldkm2 / floodplains.fldkm2).astype(
    "float32"
)

# HUC2s that specifically overlap SECAS states (SARP states + WV)

for group in huc2_groups:
    segments = (
        read_feathers(
            [src_dir / "clean" / huc2 / "network_segments.feather" for huc2 in group],
            columns=["lineID", barrier_type],
        )
        .rename(columns={barrier_type: "networkID"})
        .set_index("lineID")
    )

    stats = (
        read_feathers(
            [
                src_dir / "clean" / huc2 / f"{barrier_type}_network_stats.feather"
                for huc2 in group
            ],
            columns=[
                "networkID",
                "total_miles",
                "perennial_miles",
                "intermittent_miles",
# "12", # "13", # "14", # "15", # "16", # "17", # "21", # ] print("Finding connected HUC2s") joins = read_feathers( [src_dir / huc2 / "flowline_joins.feather" for huc2 in huc2s], columns=[ "upstream", "downstream", "type", "marine", "upstream_id", "downstream_id", ], new_fields={"HUC2": huc2s}, ) groups, joins = connect_huc2s(joins) print(f"Found {len(groups)} HUC2 groups in {time() - start:,.2f}s") # persist table of connected HUC2s connected_huc2s = pd.DataFrame({"HUC2": groups}).explode(column="HUC2") connected_huc2s["group"] = connected_huc2s.index.astype("uint8") connected_huc2s.reset_index(drop=True).to_feather( data_dir / "networks/connected_huc2s.feather")
start = time()

huc2_df = pd.read_feather(data_dir / "boundaries/huc2.feather", columns=["HUC2"])
huc2s = huc2_df.HUC2.sort_values().values

drains = (
    read_feathers(
        [
            data_dir / "nhd/clean" / huc2 / "waterbody_drain_points.feather"
            for huc2 in huc2s
        ],
        new_fields={"HUC2": huc2s},
        geo=True,
    )
    .drop(columns=["snap_to_junction", "snap_dist"])
    .rename(
        columns={
            "MaxElevSmo": "maxelev",
            "MinElevSmo": "minelev",
            "Slope": "slope",
            "StreamOrde": "fsorder",
            "km2": "wb_km2",
            "flowlineLength": "flength",
        }
    )
    .set_index("drainID")
)

# flag non-perennial flowlines (NHD FCode 46003: intermittent, 46007: ephemeral)
drains["intermittent"] = drains.lineFCode.isin([46003, 46007])

merged = None
for huc2 in huc2s:
    huc2_start = time()
    print(f"Extracting dams from waterbodies in {huc2}")
def get_network_results(df, network_type, barrier_type=None, rank=True):
    """Read network results, calculate derived metric classes, and calculate tiers.

    Only barriers that are not unranked (invasive spp. barriers) have tiers calculated.

    Parameters
    ----------
    df : DataFrame
        barriers data; must contain State and unranked
    network_type : {"dams", "small_barriers"}
        network scenario
    barrier_type : {"dams", "small_barriers", "waterfalls"}, optional (default: None)
        if present, used to filter barrier kind from network results
    rank : bool, optional (default: True)
        if True, results will include tiers for the Southeast and state level

    Returns
    -------
    DataFrame
        Contains network metrics and tiers
    """
    barrier_type = barrier_type or network_type

    huc2s = [huc2 for huc2 in df.HUC2.unique() if huc2]

    networks = (
        read_feathers(
            [
                Path("data/networks/clean") / huc2 / f"{network_type}_network.feather"
                for huc2 in huc2s
            ],
            columns=NETWORK_COLUMNS,
        )
        .rename(columns=NETWORK_COLUMN_NAMES)
        .set_index("id")
    )

    # FIXME: temporary fix
    networks.PercentPerennialUnaltered = networks.PercentPerennialUnaltered.fillna(0)

    # select barrier type
    networks = networks.loc[networks.kind == barrier_type[:-1]].drop(columns=["kind"])

    # Convert dtypes to allow missing data when joined to barriers later
    # NOTE: upNetID or downNetID may be 0 if there aren't networks on that side, but
    # we set to int dtype instead of uint to allow -1 for missing data later
    for col in ["upNetID", "downNetID"]:
        networks[col] = networks[col].astype("int")

    for col in ["NumBarriersDownstream"]:
        networks[col] = networks[col].astype("int16")

    for column in ("Landcover", "SizeClasses", "FlowsToOcean"):
        networks[column] = networks[column].astype("int8")

    # sanity check to make sure there are no duplicate networks
    if networks.groupby(level=0).size().max() > 1:
        raise Exception(
            f"ERROR: multiple networks found for some {barrier_type} barriers"
        )

    networks = networks.join(df[df.columns.intersection(["unranked", "State"])])

    # update data types and calculate total fields
    # calculate size classes GAINED instead of total;
    # doesn't apply to those that don't have upstream networks
    networks.loc[networks.SizeClasses > 0, "SizeClasses"] -= 1

    # Calculate miles GAINED if the barrier is removed:
    # this is the lesser of the upstream or free downstream lengths.
    # Non-free miles downstream (downstream waterbodies) are omitted from this analysis.
    networks["GainMiles"] = networks[["TotalUpstreamMiles", "FreeDownstreamMiles"]].min(
        axis=1
    )
    networks["PerennialGainMiles"] = networks[
        ["PerennialUpstreamMiles", "FreePerennialDownstreamMiles"]
    ].min(axis=1)

    # TotalNetworkMiles is the sum of upstream and free downstream miles
    networks["TotalNetworkMiles"] = networks[
        ["TotalUpstreamMiles", "FreeDownstreamMiles"]
    ].sum(axis=1)
    networks["TotalPerennialNetworkMiles"] = networks[
        ["PerennialUpstreamMiles", "FreePerennialDownstreamMiles"]
    ].sum(axis=1)

    # Round floating point columns to 3 decimals
    for column in [c for c in networks.columns if c.endswith("Miles")]:
        networks[column] = networks[column].round(3).fillna(-1).astype("float32")

    # Calculate network metric classes
    networks["GainMilesClass"] = classify_gainmiles(networks.GainMiles)
    networks["PerennialGainMilesClass"] = classify_gainmiles(
        networks.PerennialGainMiles
    )

    if not rank:
        return networks.drop(columns=["unranked", "State"], errors="ignore")

    # only calculate ranks / tiers for ranked barriers
    # (exclude unranked invasive spp. barriers)
    to_rank = networks.loc[~networks.unranked]

    ### Calculate regional tiers for the SARP (Southeast) region
    # NOTE: this is limited to the SARP region; other regions are not ranked at the
    # regional level
    # TODO: consider deprecating this
    ix = to_rank.State.isin(SARP_STATE_NAMES)
    sarp_tiers = calculate_tiers(to_rank.loc[ix])
    sarp_tiers = sarp_tiers.rename(
        columns={col: f"SE_{col}" for col in sarp_tiers.columns}
    )

    ### Calculate state tiers for each of total and perennial
    state_tiers = None
    for state in to_rank.State.unique():
        state_tiers = append(
            state_tiers,
            calculate_tiers(to_rank.loc[to_rank.State == state]).reset_index(),
        )

    state_tiers = state_tiers.set_index("id").rename(
        columns={col: f"State_{col}" for col in state_tiers.columns}
    )

    networks = networks.join(sarp_tiers).join(state_tiers)
    for col in [col for col in networks.columns if col.endswith("_tier")]:
        networks[col] = networks[col].fillna(-1).astype("int8")

    return networks.drop(columns=["unranked", "State"])
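# Example usage (illustrative sketch only; the barriers file path and join step below
# are assumptions based on the parameters documented in the docstring, not a fixed API):
#
#   dams = pd.read_feather("data/barriers/master/dams.feather").set_index("id")
#   # df must include HUC2, State, and unranked columns
#   networks = get_network_results(dams, network_type="dams", rank=True)
#   dams = dams.join(networks)
#   # barriers without networks will have missing metrics after the join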
if not out_dir.exists():
    os.makedirs(out_dir)

barrier_type = "small_barriers"
ext = "fgb"

# groups_df = pd.read_feather(src_dir / "connected_huc2s.feather")
# for group in groups_df.groupby("group").HUC2.apply(set).values:
for group in [{"02"}]:
    segments = (
        read_feathers(
            [src_dir / "clean" / huc2 / "network_segments.feather" for huc2 in group],
            columns=["lineID", barrier_type],
        )
        .rename(columns={barrier_type: "networkID"})
        .set_index("lineID")
    )

    # FIXME: remove, debug only
    s = segments.groupby(level=0).size()
    print("dups", s[s > 1])

    stats = read_feathers(
        [
            src_dir / "clean" / huc2 / f"{barrier_type}_network_stats.feather"
            for huc2 in group
        ]
    ).set_index("networkID")

    # use smaller data types for smaller output files
start = time()

groups_df = pd.read_feather(src_dir / "connected_huc2s.feather")

region_tiles = []
for group in groups_df.groupby("group").HUC2.apply(set).values:
    group = sorted(group)
    print(f"\n\n===========================\nProcessing group {group}")

    segments = read_feathers(
        [src_dir / "clean" / huc2 / "network_segments.feather" for huc2 in group],
        columns=[
            "lineID",
            "dams",
            "small_barriers",
        ],
    ).set_index("lineID")

    # create output files by HUC2 based on where the segments occur
    for huc2 in group:
        print(f"----------------------\nProcessing {huc2}")

        huc2_start = time()

        huc2_mbtiles_filename = intermediate_dir / f"region{huc2}_networks.mbtiles"
        region_tiles.append(huc2_mbtiles_filename)
data_dir = Path("data") out_dir = Path("/tmp/sarp") if not out_dir.exists(): os.makedirs(out_dir) huc4_df = pd.read_feather( data_dir / "boundaries/huc4.feather", columns=["HUC2"], ) huc2s = huc4_df.HUC2.unique() df = read_feathers( [ data_dir / "nhd/raw" / huc2 / "nhd_altered_rivers.feather" for huc2 in huc2s ], geo=True, new_fields={"HUC2": huc2s}, ) write_dataframe(df, out_dir / "nhd_altered_rivers.shp") df = read_feathers( [data_dir / "nwi/raw" / huc2 / "altered_rivers.feather" for huc2 in huc2s], geo=True, new_fields={"HUC2": huc2s}, ) write_dataframe(df, out_dir / "nwi_altered_rivers.shp")