def export_duplicate_areas(dups, path):
    """Export duplicate barriers for QA.

    Parameters
    ----------
    dups : GeoDataFrame
        contains "geometry", "dup_group" (group identifier), "dup_tolerance",
        "id", and "SARPID" columns
    path : str or Path
        output path
    """
    print("Exporting duplicate areas")

    dups = dups.copy()
    dups["geometry"] = pg.buffer(dups.geometry.values.data, dups.dup_tolerance)
    dissolved = dissolve(dups[["geometry", "dup_group"]], by="dup_group")
    groups = gp.GeoDataFrame(
        dups[["id", "SARPID", "dup_group"]]
        .groupby("dup_group")
        .agg({"SARPID": "unique", "id": "unique"})
        .join(dissolved.geometry, on="dup_group"),
        crs=dups.crs,
    )
    groups["id"] = groups.id.apply(lambda x: ", ".join([str(s) for s in x]))
    groups["SARPID"] = groups.SARPID.apply(lambda x: ", ".join([str(s) for s in x]))
    write_dataframe(groups, path)
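# Hypothetical usage (illustrative only, not part of the pipeline): "barriers" and
# "qa_dir" are assumed names; the input frame must already carry the "id", "SARPID",
# "dup_group", and "dup_tolerance" columns set by the upstream duplicate-marking step.
#
#   export_duplicate_areas(barriers.loc[barriers.duplicate], qa_dir / "duplicate_areas.fgb")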
def export_snap_dist_lines(df, original_locations, out_dir, prefix=""): """Creates lines from the original coordinate to the snapped coordinate to help QA/QC snapping operation. Creates geopackages in out_dir: - pre_snap_to_post_snap: line between snapped and unsnapped coordinate - pre_snap: unsnapped points - post_snap: snapped points Parameters ---------- df : DataFrame contains pygeos geometries in "geometry" column original_locations : DataFrame contains pygeos geometries in "geometry" column out_dir : Path prefix : str prefix to add to filename """ print("Exporting snap review datasets...") tmp = df.loc[ df.snapped, ["geometry", "Name", "SARPID", "snapped", "snap_dist", "snap_log" ]].join(original_locations.geometry.rename("orig_pt")) tmp["new_pt"] = tmp.geometry.copy() tmp["geometry"] = connect_points(tmp.new_pt.values.data, tmp.orig_pt.values.data) write_dataframe( tmp.drop(columns=["new_pt", "orig_pt"]).reset_index(drop=True), out_dir / f"{prefix}pre_snap_to_post_snap.fgb", ) write_dataframe( tmp.drop(columns=["geometry", "new_pt"]).rename(columns={ "orig_pt": "geometry" }).reset_index(drop=True), out_dir / f"{prefix}pre_snap.fgb", ) write_dataframe( tmp.drop(columns=["geometry", "orig_pt"]).rename(columns={ "new_pt": "geometry" }).reset_index(drop=True), out_dir / f"{prefix}post_snap.fgb", )
outer_huc4["km2"] = pg.area(outer_huc4.geometry.values.data) / 1e6 # calculate geometric difference, explode, and keep non-slivers outer_huc4["geometry"] = pg.difference(outer_huc4.geometry.values.data, state_merged) outer_huc4 = explode(outer_huc4) outer_huc4["clip_km2"] = pg.area(outer_huc4.geometry.values.data) / 1e6 outer_huc4["percent"] = 100 * outer_huc4.clip_km2 / outer_huc4.km2 keep_huc4 = outer_huc4.loc[outer_huc4.clip_km2 >= 100].HUC4.unique() outer_huc4 = outer_huc4.loc[outer_huc4.HUC4.isin(keep_huc4) & (outer_huc4.clip_km2 >= 2.5)].copy() outer_huc4 = dissolve(outer_huc4, by="HUC4", agg={ "HUC2": "first" }).reset_index(drop=True) outer_huc4.to_feather(out_dir / "outer_huc4.feather") write_dataframe(outer_huc4, out_dir / "outer_huc4.gpkg") ### Counties - within HUC4 bounds print("Processing counties") fips = sorted(state_df.STATEFIPS.unique()) county_df = (read_dataframe( county_filename, columns=["NAME", "GEOID", "STATEFP"], ).to_crs(CRS).rename(columns={ "NAME": "County", "GEOID": "COUNTYFIPS", "STATEFP": "STATEFIPS" })) # keep only those within the region HUC4 outer boundary
### Add lat / lon (must be done after snapping!)
print("Adding lat / lon fields")
geo = df[["geometry"]].to_crs(GEO_CRS)
geo["lat"] = pg.get_y(geo.geometry.values.data).astype("float32")
geo["lon"] = pg.get_x(geo.geometry.values.data).astype("float32")
df = df.join(geo[["lat", "lon"]])

### All done processing!
print("\n--------------\n")
df = df.reset_index(drop=True)

print("Serializing {:,} dams to master file".format(len(df)))
df.to_feather(master_dir / "dams.feather")
write_dataframe(df, qa_dir / "dams.fgb")

# Extract out only the snapped ones
df = df.loc[df.snapped & (~(df.duplicate | df.dropped | df.excluded))].reset_index(
    drop=True
)
df.lineID = df.lineID.astype("uint32")
df.NHDPlusID = df.NHDPlusID.astype("uint64")

print("Serializing {:,} snapped dams".format(len(df)))
df[
    ["geometry", "id", "HUC2", "lineID", "NHDPlusID", "loop", "intermittent"]
].to_feather(snapped_dir / "dams.feather")
write_dataframe(df, qa_dir / "snapped_dams.fgb")
        regional_connectors_df,
    ]
):
    tree = pg.STRtree(df.geometry.values.data)
    ix = tree.query(bnd, predicate="intersects")
    df = df.iloc[ix].copy()
    df["value"] = i + 1

    if i == 0:
        merged = df
    else:
        merged = merged.append(df, ignore_index=True, sort=False)

df = merged

if DEBUG:
    write_dataframe(df, "/tmp/naturescape.gpkg", driver="GPKG")

### Create cores raster
# cores and important areas don't overlap; just rasterize them
print("Rasterizing cores and other important areas...")

cores = np.ones(shape=data.shape, dtype="uint8") * 255
# set all areas inside the mask as 0
cores = np.where(mask == 1, 0, 255)

for value in [1, 2, 3]:
    print(f"Processing value {value}...")
    shapes = to_dict_all(df.loc[df.value == value].geometry.values.data)
    data = rasterize(
        shapes,
        data.shape,
        transform=transform,
# set the CRS; it is the same as EPSG:5070 but not recognized properly
df = df.set_crs(DATA_CRS)

# drop BOEM lease block groups
df = df.loc[df.Agg_Src != "USGS_PADUS2_0Marine_BOEM_Block_Dissolve"].drop(
    columns=["Agg_Src"]
)

tree = pg.STRtree(df.geometry.values.data)
ix = tree.query(bnd_df.geometry.values.data[0], predicate="intersects")
df = df.iloc[ix].copy()

print("making valid...")
df["geometry"] = pg.make_valid(df.geometry.values.data)

df = explode(df).reset_index()

# there are some geometry errors after the cleanup above; keep only polygons
df = df.loc[pg.get_type_id(df.geometry.values.data) == 3].copy()

print("Writing files")
df.to_feather(out_dir / "ownership.feather")
write_dataframe(df, data_dir / "boundaries/ownership.gpkg", driver="GPKG")

# Write for tiles
print("Writing GeoJSON for tiles")
write_dataframe(
    df[["geometry", "Own_Type", "GAP_Sts"]].to_crs(GEO_CRS),
    tile_dir / "ownership.geojson",
    driver="GeoJSONSeq",
)
{"break_geometry": breaks.take(left)}, index=df.index.take(right) ) grouped = pairs.groupby(level=0).break_geometry.apply( lambda g: pg.multipolygons(g.values.data) ) df.loc[grouped.index, "geometry"] = pg.difference( df.loc[grouped.index].geometry.values.data, grouped.values ) df = explode(df).reset_index(drop=True) # make sure all polygons are valid ix = ~pg.is_valid(df.geometry.values.data) if ix.sum(): print(f"Repairing {ix.sum()} invalid waterbodies") df.loc[ix, "geometry"] = pg.make_valid(df.loc[ix].geometry.values.data) df = explode(explode(df)) df = df.loc[pg.get_type_id(df.geometry.values.data) == 3].reset_index() # assign a new unique wbID df["wbID"] = df.index.values.astype("uint32") + 1 + int(huc2) * 1000000 df["km2"] = pg.area(df.geometry.values.data) / 1e6 df.to_feather(huc2_dir / "waterbodies.feather") write_dataframe(df, huc2_dir / "waterbodies.gpkg") print("--------------------") print(f"HUC2: {huc2} done in {time() - huc2_start:.0f}s\n\n") print(f"Done in {time() - start:.2f}s\n============================")
from pathlib import Path
import os
import warnings

from pyogrio import read_dataframe, write_dataframe

from analysis.constants import DATA_CRS

warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

src_dir = Path("source_data/caribbean")
out_dir = Path("data/inputs/indicators/caribbean")
tile_dir = Path("data/for_tiles")

if not out_dir.exists():
    os.makedirs(out_dir)

df = (
    read_dataframe(
        src_dir / "Watershed_Ranking_PR.shp", columns=["Metric_Ran", "HUC_10"]
    )
    .rename(columns={"Metric_Ran": "carrank", "HUC_10": "HUC10"})
    .to_crs(DATA_CRS)
)

df.to_feather(out_dir / "caribbean.feather")

# for tiles
write_dataframe(
    df[["geometry", "carrank"]],
    tile_dir / "caribbean.geojson",
    driver="GeoJSONSeq",
)
nhd_dir = data_dir / "nhd/raw"
out_dir = Path("/tmp/sarp")

huc2_df = pd.read_feather(data_dir / "boundaries/huc2.feather", columns=["HUC2"])
huc2s = huc2_df.HUC2.sort_values().values

huc2s = [
    # "02",
    # "03",
    # "05",
    # "06",
    # "07",
    # "08",
    # "09",
    # "10",
    # "11",
    # "12",
    # "13",
    "14",
    "15",
    "16",
    "17",
    # "21",
]

for huc2 in huc2s:
    print(f"Exporting {huc2}...")
    flowlines = gp.read_feather(nhd_dir / huc2 / "flowlines.feather")
    write_dataframe(flowlines, out_dir / f"region{huc2}_raw_flowlines.shp")
df["joinID"] = (df.index * 1e6).astype("uint32") df["kind"] = "crossing" merged = barriers[["kind", "geometry"]].append(df[["joinID", "kind", "geometry"]], sort=False, ignore_index=True) merged = mark_duplicates(merged, tolerance=DUPLICATE_TOLERANCE) dup_groups = merged.loc[(merged.dup_count > 1) & (merged.kind == "barrier")].dup_group.unique() remove_ids = merged.loc[merged.dup_group.isin(dup_groups) & (merged.kind == "crossing")].joinID print( f"{len(remove_ids):,} crossings appear to be duplicates of existing barriers" ) df = df.loc[~df.joinID.isin(remove_ids)].drop(columns=["joinID", "kind"]) print(f"Now have {len(df):,} road crossings") # make sure that id is unique of small barriers df["id"] = (barriers.id.max() + 100000 + df.index.astype("uint")).astype("uint") df = df.reset_index(drop=True) df.to_feather(out_dir / "road_crossings.feather") write_dataframe(df, qa_dir / "road_crossings.fgb") print("Done in {:.2f}".format(time() - start))
        columns=[
            "lineID",
            "geometry",
            "intermittent",
            "altered",
            "sizeclass",
            "StreamOrde",
        ],
    )
    .set_index("lineID")
    .rename(columns={"StreamOrde": "streamorder"})
)

flowlines = flowlines.join(segments)

# aggregate to multilinestrings by combinations of networkID, altered, intermittent
networks = (
    aggregate_lines(flowlines, by=["networkID", "altered", "intermittent"])
    .set_index("networkID")
    .join(stats, how="inner")
    .reset_index()
    .sort_values(by="networkID")
)

# Set plotting symbol
networks["symbol"] = "normal"
networks.loc[networks.altered, "symbol"] = "altered"
# currently overrides altered since both come from NHD (mutually exclusive in source data)
networks.loc[networks.intermittent, "symbol"] = "intermittent"
networks.loc[
    networks.intermittent & networks.altered, "symbol"
] = "altered_intermittent"

print("Serializing dissolved networks...")
write_dataframe(networks, out_dir / f"region{huc2}_{barrier_type}_networks.{ext}")
### Join to states
states = gp.read_feather(
    boundaries_dir / "states.feather", columns=["id", "geometry"]
).rename(columns={"id": "state"})
states = states.loc[states.state.isin(STATES.keys())].copy()

print("Joining to states...")
tree = pg.STRtree(drains.geometry.values.data)
left, right = tree.query_bulk(states.geometry.values.data, predicate="intersects")
tmp = (
    pd.DataFrame(
        {"state": states.state.values.take(left)},
        index=drains.index.values.take(right),
    )
    .groupby(level=0)
    .first()
)

# drop any without states (e.g., outside region states)
drains = drains.join(tmp, how="inner")

# cleanup datatypes
for col in ["wb_km2", "TotDASqKm"]:
    drains[col] = drains[col].astype("float32")

print("Saving to shapefile")
write_dataframe(drains, out_dir / "unclaimed_drain_points.shp")
# exclude altered flowlines at low zooms
subset = subset.loc[subset.mapcode < 2].copy()

if simplification:
    subset["geometry"] = pg.simplify(subset.geometry.values.data, simplification)

json_filename = tmp_dir / f"region{huc2}_{minzoom}_{maxzoom}_flowlines.json"
mbtiles_filename = tmp_dir / f"region{huc2}_{minzoom}_{maxzoom}_flowlines.mbtiles"
mbtiles_files.append(mbtiles_filename)

write_dataframe(
    subset.to_crs(GEO_CRS),
    json_filename,
    driver="GeoJSONSeq",
)

del subset

ret = subprocess.run(
    tippecanoe_args
    + ["-Z", str(minzoom), "-z", str(maxzoom)]
    + ["-o", f"{str(mbtiles_filename)}", str(json_filename)]
    + col_types
)
ret.check_returncode()

# remove JSON file
json_filename.unlink()
print("----- {} ------".format(huc2)) huc2_dir = out_dir / huc2 if not huc2_dir.exists(): os.makedirs(huc2_dir) huc4s = units[huc2] if len(huc4s) > MAX_HUC4s: # split into smaller groups to keep output files smaller for counter, i in enumerate(range(0, len(huc4s), MAX_HUC4s)): print(f"Processing subgroup {huc2}_{counter}") df = process_huc4s(src_dir, huc4s[i:i + MAX_HUC4s]) print("serializing {:,} catchments".format(len(df))) df[["NHDPlusID", "geometry" ]].to_feather(huc2_dir / f"catchments_{counter}.feather") write_dataframe(df, tmp_dir / f"region{huc2}_catchments_{counter}.shp") else: df = process_huc4s(src_dir, huc4s) print("serializing {:,} catchments".format(len(df))) df[["NHDPlusID", "geometry"]].to_feather(huc2_dir / f"catchments.feather") write_dataframe(df, tmp_dir / f"region{huc2}_catchments.shp") print("Done in {:.2f}s\n============================".format(time() - start))
inputs.sort()

inputs = pd.DataFrame({"inputs": inputs})
inputs.reset_index().rename(columns={"index": "value", "inputs": "id"}).to_json(
    json_dir / "input_area_values.json", orient="records"
)

df = df.join(
    inputs.reset_index().rename(columns={"index": "value"}).set_index("inputs"),
    on="inputs",
)

write_dataframe(df, bnd_dir / "input_areas.fgb")
df.to_feather(out_dir / "boundaries/input_areas.feather")

# Rasterize to match the blueprint
df = pd.DataFrame(df[["geometry", "value"]].copy())
df.geometry = df.geometry.values.data

# convert to pairs of (GeoJSON, value)
shapes = df.apply(lambda row: (to_dict(row.geometry), row.value), axis=1)

print("Rasterizing inputs...")
with rasterio.open(blueprint_filename) as src:
    data = rasterize(
        shapes.values,
        src.shape,
        transform=src.transform,
mx_df = mx_df.dissolve(by="NUM_EDO")
mx_df["country"] = "MX"

admin_df = (
    us_df[["geometry", "admin1", "admin1_name", "country"]]
    .append(
        ca_df[["geometry", "admin1", "admin1_name", "country"]],
        ignore_index=True,
        sort=False,
    )
    .append(
        mx_df[["geometry", "admin1", "admin1_name", "country"]],
        ignore_index=True,
        sort=False,
    )
)
admin_df["id"] = admin_df.index.astype("uint8") + 1

write_dataframe(admin_df, boundaries_dir / "na_admin1.json")
admin_df.to_feather(boundaries_dir / "na_admin1.feather")

### Process species ranges
print("Processing species ranges...")

# create lookup of species scientific name to code
sci_name_lut = {value["SNAME"]: key for key, value in SPECIES.items()}

range_df = read_dataframe("data/boundaries/src/species_ranges.shp")

# split hoary bat into Hawaiian vs mainland
laci = range_df.loc[range_df.SCI_NAME == "Lasiurus cinereus"]
Hawaii = pg.box(*HAWAII_BOUNDS)
haba = laci.copy()
# add new geometry for haba
haba.geometry = pg.intersection(laci.geometry.values.data, Hawaii)
# make valid
ix = ~pg.is_valid(waterbodies.geometry.values.data)
if ix.sum():
    print(f"Repairing {ix.sum():,} invalid waterbodies")
    waterbodies.loc[ix, "geometry"] = pg.make_valid(
        waterbodies.loc[ix].geometry.values.data
    )

# note: nwi_code, nwi_type are discarded here since they aren't used later
print("Dissolving adjacent waterbodies")
waterbodies = dissolve(waterbodies, by=["altered"])
waterbodies = explode(waterbodies).reset_index(drop=True)
waterbodies["km2"] = pg.area(waterbodies.geometry.values.data) / 1e6

waterbodies.to_feather(huc2_dir / "waterbodies.feather")
write_dataframe(waterbodies, huc2_dir / "waterbodies.gpkg")

### Process riverine
print(f"Extracted {len(rivers):,} NWI altered river polygons")
left, right = tree.query_bulk(rivers.geometry.values.data, predicate="intersects")
rivers = rivers.iloc[np.unique(left)].reset_index(drop=True)
print(f"Kept {len(rivers):,} that intersect flowlines")

rivers = explode(rivers)

# make valid
ix = ~pg.is_valid(rivers.geometry.values.data)
if ix.sum():
    print(f"Repairing {ix.sum():,} invalid rivers")
    rivers.loc[ix, "geometry"] = pg.make_valid(
        rivers.loc[ix].geometry.values.data
    )
layer="Waterbody", force_2d=True, columns=[], ).rename(columns={ "NAME": "name" }).to_crs(CRS)) print("Reading flowlines...") flowlines = gp.read_feather(nhd_dir / huc2 / "flowlines.feather", columns=[]) tree = pg.STRtree(flowlines.geometry.values.data) print(f"Extracted {len(df):,} SC waterbodies") left, right = tree.query_bulk(df.geometry.values.data, predicate="intersects") df = df.iloc[np.unique(left)].reset_index(drop=True) print(f"Kept {len(df):,} that intersect flowlines") df = explode(df) # make valid ix = ~pg.is_valid(df.geometry.values.data) if ix.sum(): print(f"Repairing {ix.sum():,} invalid waterbodies") df.loc[ix, "geometry"] = pg.make_valid(df.loc[ix].geometry.values.data) print("Dissolving adjacent waterbodies...") df["tmp"] = 1 df = dissolve(df, by="tmp").drop(columns=["tmp"]) df = explode(df).reset_index(drop=True) df.to_feather(src_dir / "sc_waterbodies.feather") write_dataframe(df, src_dir / "sc_waterbodies.gpkg")
    state_filename,
    columns=["STUSPS", "STATEFP", "NAME"],
    )
    .to_crs(CRS)
    .rename(columns={"STUSPS": "id", "NAME": "State", "STATEFP": "STATEFIPS"})
)

state_df.geometry = pg.make_valid(state_df.geometry.values.data)

# save all states for spatial joins
state_df.to_feather(out_dir / "states.feather")

state_df = state_df.loc[state_df.id.isin(STATES.keys())].copy()
state_df.to_feather(out_dir / "region_states.feather")
write_dataframe(
    state_df[["State", "geometry"]].rename(columns={"State": "id"}),
    out_dir / "region_states.gpkg",
)

# dissolve to create outer state boundary for total analysis area and regions
bnd_df = gp.GeoDataFrame(
    [
        {
            "geometry": pg.union_all(state_df.geometry.values.data),
            "id": "total",
        },
    ]
    + [
        {
            "geometry": pg.union_all(
                state_df.loc[
                    state_df.id.isin(REGION_STATES[region])
                ].geometry.values.data
            ),
            "id": region,
huc4 = sorted(huc4_df.HUC4.unique())

sarp_huc4_df = gp.read_feather(out_dir / "sarp_huc4.feather")
sarp_huc4 = sorted(sarp_huc4_df.HUC4.unique())

### Extract HUC6 within HUC4
print("Processing HUC6...")
huc6_df = (
    read_dataframe(
        wbd_gdb,
        layer="WBDHU6",
        columns=["huc6", "name"],
        where=f"SUBSTR(huc6, 0, 4) IN {tuple(huc4)}",
    )
    .rename(columns={"huc6": "HUC6"})
    .to_crs(CRS)
)

huc6_df.to_feather(out_dir / "huc6.feather")
write_dataframe(huc6_df.rename(columns={"HUC6": "id"}), out_dir / "huc6.gpkg")

huc6_df["HUC4"] = huc6_df.HUC6.str[:4]
sarp_huc6_df = huc6_df.loc[huc6_df.HUC4.isin(sarp_huc4)].drop(columns=["HUC4"])
write_dataframe(
    sarp_huc6_df.rename(columns={"HUC6": "id"}), out_dir / "sarp_huc6.gpkg"
)
sarp_huc6_df.to_feather(out_dir / "sarp_huc6.feather")

### Extract HUC8 within HUC4
print("Processing HUC8...")
huc8_df = (
    read_dataframe(
        wbd_gdb,
        layer="WBDHU8",
        columns=["huc8", "name"],
        where=f"SUBSTR(huc8, 0, 4) IN {tuple(huc4)}",
    )
    .rename(columns={
        }
    )
).join(
    snapped_df[["geometry", "ManualReview"]].rename(
        columns={
            "geometry": "reviewed_pt",
            "ManualReview": "reviewed_ManualReview",
        }
    )
)

# mark any as snapped / not
df.loc[df.snapped, "cur_pt_src"] = "autosnapped pt"
df.loc[~df.snapped, "cur_pt_src"] = "not snapped"

# any that were manually reviewed in source, use that original point and manual review
# (note: these may be overridden again by manually reviewed dams in snapping dataset)
ix = df.src_ManualReview.notnull()
df.loc[ix, "geometry"] = df.loc[ix].src_pt
df.loc[ix, "ManualReview"] = df.loc[ix].src_ManualReview
df.loc[ix, "cur_pt_src"] = "raw inventory pt"

# any that were manually reviewed in snapping dataset, use that coordinate and manual review
ix = df.reviewed_ManualReview.notnull()
df.loc[ix, "geometry"] = df.loc[ix].reviewed_pt
df.loc[ix, "ManualReview"] = df.loc[ix].reviewed_ManualReview
df.loc[ix, "cur_pt_src"] = "manually reviewed pt"

df = df.drop(
    columns=["src_pt", "reviewed_pt", "src_ManualReview", "reviewed_ManualReview"]
).reset_index()

df.to_feather("data/barriers/qa/snapping_dataset_2022.feather")
write_dataframe(df, out_dir / "snapping_dataset_2022.shp")
if state == "tx": state_df = state_df.drop(columns=["lcon"]) # Reclassify chatrank to match blueprint integration rules. # First shift other values up one ix = state_df.chatrank >= 2 state_df.loc[ix, "chatrank"] = state_df.chatrank + 1 # for any that were previously a chatrank of 2 but with higher values of aquatic or # terrestrial, map them back to 2 ix = ((state_df.chatrank == 3) & state_df.arank.isin([2, 3]) & state_df.trank.isin([2, 3])) state_df.loc[ix, "chatrank"] = 2 # for proofing write_dataframe(state_df, gis_dir / f"{state}chat.gpkg", driver="GPKG") # for tiles write_dataframe(state_df, tile_dir / f"{state}chat.geojson", driver="GeoJSONSeq") # convert attributes to categoricals for analysis for col in chat_fields: if col not in state_df.columns: continue if state == "tx" and col == "chatrank": # IMPORTANT: value 2 is split into values 2/3 and others are shifted up by 1 categories = [0, 1, 2, 3, 4, 5, 6, 7] else:
barriers = read_feathers(
    [barriers_dir / f"{kind}s.feather" for kind in kinds],
    geo=True,
    new_fields={"kind": kinds},
)

for kind, init_id in zip(kinds, kind_ids):
    ix = barriers.kind == kind
    barriers.loc[ix, "barrierID"] = barriers.loc[ix].id + init_id

barriers.barrierID = barriers.barrierID.astype("uint64")

barriers.to_feather(out_dir / "all_barriers.feather")

if DEBUG:
    write_dataframe(barriers, out_dir / "all_barriers.fgb")

### Cut flowlines in each HUC2
for huc2 in huc2s:
    region_start = time()

    print(f"----- {huc2} ------")

    huc2_dir = out_dir / huc2
    if not huc2_dir.exists():
        os.makedirs(huc2_dir)

    # drop any barriers on loops in this region
    huc2_barriers = barriers.loc[
        (barriers.HUC2 == huc2) & (~barriers.loop)
    ].set_index("barrierID", drop=False)
columns=["id", "geometry"]) tree = pg.STRtree(df.geometry.values.data) left, right = tree.query_bulk(states.geometry.values.data, predicate="intersects") state_join = (pd.DataFrame({ "state": states.id.take(left), "drain": df.index.take(right) }).groupby("drain").first()) df = df.join(state_join) # only keep those in the region states df = df.loc[df.state.isin(STATES)].copy() write_dataframe(df.loc[~has_dam], out_dir / "estimated_dam_lines.fgb") write_dataframe(df.loc[~has_dam], tmp_dir / "estimated_dam_lines.shp") write_dataframe(df.loc[has_dam], out_dir / "estimated_dam_lines_with_dam.fgb") write_dataframe(df.loc[has_dam], tmp_dir / "estimated_dam_lines_with_dam.shp") df = (df.join(drains.geometry.rename("drain"), on="drainID").set_geometry("drain").drop(columns=["geometry"])) write_dataframe(df.loc[~has_dam], out_dir / "estimated_dams.fgb") write_dataframe(df.loc[~has_dam], tmp_dir / "estimated_dams.shp") write_dataframe(df.loc[has_dam], out_dir / "estimated_dams_with_dam.fgb") write_dataframe(df.loc[has_dam], tmp_dir / "estimated_dams_with_dam.shp") print(f"Total elapsed {time() - start:,.2f}s")
warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

data_dir = Path("data")

huc2s = sorted(
    pd.read_feather(data_dir / "boundaries/huc2.feather", columns=["HUC2"]).HUC2.values
)

src_dir = Path("data/nhd/raw")
out_dir = Path("data/nhd/merged")

if not out_dir.exists():
    os.makedirs(out_dir)

for group in ["points", "lines", "poly"]:
    merged = None
    for huc2 in huc2s:
        huc2_dir = src_dir / huc2
        filename = huc2_dir / f"nhd_{group}.feather"
        if filename.exists():
            df = gp.read_feather(filename)
            merged = append(merged, df)

    df = merged.reset_index(drop=True)

    df.to_feather(out_dir / f"nhd_{group}.feather")
    write_dataframe(df, out_dir / f"nhd_{group}.gpkg")
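# The append() helper used above is imported from the project's shared utilities;
# a minimal stand-in consistent with how it is called here (an assumption, not the
# project's actual implementation) might look like:
#
#   def append(target, df):
#       # start a new frame on the first iteration, otherwise concatenate
#       if target is None:
#           return df.copy()
#       return target.append(df, ignore_index=True, sort=False)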
print(df.groupby("loop").size()) ### Add lat / lon print("Adding lat / lon fields") geo = df[["geometry"]].to_crs(GEO_CRS) geo["lat"] = pg.get_y(geo.geometry.values.data).astype("float32") geo["lon"] = pg.get_x(geo.geometry.values.data).astype("float32") df = df.join(geo[["lat", "lon"]]) print("\n--------------\n") df = df.reset_index(drop=True) print("Serializing {:,} small barriers".format(len(df))) df.to_feather(master_dir / "small_barriers.feather") write_dataframe(df, qa_dir / "small_barriers.fgb") # Extract out only the snapped ones df = df.loc[df.snapped & (~(df.duplicate | df.dropped | df.excluded))].reset_index( drop=True ) df.lineID = df.lineID.astype("uint32") df.NHDPlusID = df.NHDPlusID.astype("uint64") print("Serializing {:,} snapped small barriers".format(len(df))) df[ ["geometry", "id", "HUC2", "lineID", "NHDPlusID", "loop", "intermittent"] ].to_feather(snapped_dir / "small_barriers.feather",) write_dataframe(df, qa_dir / "snapped_small_barriers.fgb")
)

### cleanup fields
df["SourceState"] = df.SARPID.str[:2]

for column in ("River", "NIDID", "Source", "SourceDBID", "Name", "OtherName"):
    df[column] = df[column].fillna("").str.strip()

for column in (
    "Construction",
    "Condition",
    "Purpose",
    "Recon",
    "PassageFacility",
    "BarrierStatus",
    "ManualReview",
):
    df[column] = df[column].fillna(0).astype("uint8")

for column in ("Year", "Height", "Length", "Feasibility"):
    df[column] = df[column].fillna(0).astype("uint16")

s = df.groupby("SARPID").size()
if s.max() > 1:
    raise ValueError(f"Multiple dams with same SARPID: {s[s > 1].index}")

df.to_feather(src_dir / "dams_outer_huc4.feather")
write_dataframe(df, src_dir / "dams_outer_huc4.gpkg")
        }
    )
)

flowlines = flowlines.join(segments).join(floodplains, on="NHDPlusID")
# convert length from meters to kilometers
flowlines["length"] = flowlines["length_m"] / 1000.0
for col in ["natfldpln", "fldkm2", "natfldkm2"]:
    flowlines[col] = flowlines[col].fillna(-1)
for col in ["interm", "altered"]:
    flowlines[col] = flowlines[col].astype("uint8")

# serialize raw segments
print("Serializing undissolved networks...")
write_dataframe(
    flowlines.reset_index(),
    out_dir / f"region{huc2}_{barrier_type}_segments.{ext}",
)

# aggregate to multilinestrings by combinations of networkID
print("Dissolving networks...")
networks = (
    merge_lines(flowlines[["networkID", "geometry"]], by=["networkID"])
    .set_index("networkID")
    .join(stats, how="inner")
    .reset_index()
)

# this currently takes a very long time for shapefiles on GDAL 3.4.x due to large multilinestrings,
# so write to GPKG and convert to shapefile using Docker GDAL 3.3.x
print("Serializing dissolved networks...")
write_dataframe(
    pd.read_feather(
        data_dir / "boundaries/huc4.feather", columns=["HUC2"]
    ).HUC2.unique()
)

### Merge NHD lines and areas that represent dams and dam-related features
print("Reading NHD points, lines, and areas, and merging...")
nhd_pts = read_feathers(
    [raw_dir / huc2 / "nhd_points.feather" for huc2 in huc2s],
    geo=True,
    new_fields={"HUC2": huc2s},
)
nhd_pts = nhd_pts.loc[nhd_pts.FType.isin([343])].copy()

# write original points for SARP
write_dataframe(nhd_pts, out_dir / "nhd_dam_pts_nhdpoint.fgb")

nhd_pts["source"] = "NHDPoint"

# create circular buffers to merge with others
nhd_pts["geometry"] = pg.buffer(nhd_pts.geometry.values.data, 5)

nhd_lines = read_feathers(
    [raw_dir / huc2 / "nhd_lines.feather" for huc2 in huc2s],
    geo=True,
    new_fields={"HUC2": huc2s},
)
nhd_lines = nhd_lines.loc[
    (nhd_lines.FType.isin([343, 369, 398])) & nhd_lines.geometry.notnull()
].reset_index(drop=True)
percent_overlap = calculate_percent_overlap(
    input_area_mask, [to_dict(geometry)], bounds=pg.total_bounds(geometry)
)
if percent_overlap < 50:
    drop_ids.append(id)

print(f"Dropping {len(drop_ids)} HUC12s that do not sufficiently overlap input areas")
huc12 = huc12.loc[~huc12.id.isin(drop_ids)].copy()

# extract geographic bounds
huc12_wgs84 = huc12.to_crs(GEO_CRS)
huc12 = huc12.join(huc12_wgs84.bounds)

# Save in EPSG:5070 for analysis
huc12.to_feather(analysis_dir / "huc12.feather")
write_dataframe(huc12, bnd_dir / "huc12.gpkg")

### Marine units
print("Reading marine blocks...")

atl = read_dataframe(
    src_dir / "summary_units/marine_blocks/Atlantic/ATL_BLKCLP.shp",
    columns=["PROT_NUMBE", "BLOCK_NUMB"],
)
gulf = read_dataframe(
    src_dir / "summary_units/marine_blocks/Gulf_of_Mexico/blk_clip.shp",
    columns=["PROT_NUMBE", "BLOCK_NUMB"],
)

marine = atl.append(gulf, ignore_index=True)