def process_huc4s(src_dir, huc4s):
    merged = None

    for HUC4 in huc4s:
        print("\n\n------------------- Reading {} -------------------".format(HUC4))

        gdb = src_dir / HUC4 / "NHDPLUS_H_{HUC4}_HU4_GDB.gdb".format(HUC4=HUC4)

        df = read_dataframe(gdb, layer="NHDPlusCatchment", columns=["NHDPlusID"])
        print(f"Read {len(df):,} catchments")

        df = df.dropna(subset=["NHDPlusID"])
        print("Kept {:,} catchments after dropping those without NHDPlusID".format(len(df)))

        df.NHDPlusID = df.NHDPlusID.astype("uint64")
        df = df.to_crs(CRS)

        merged = append(merged, df)

    df = merged

    # add unique ID
    df["catchID"] = df.index.astype("uint32") + 1

    # add string version of NHDPlusID
    df["NHDIDSTR"] = df.NHDPlusID.astype("str")

    return df
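# A hedged sketch of the append() helper used above (its implementation is not
# shown in this excerpt): concatenate onto an accumulator that starts as None,
# which is the pattern the loop relies on.
import pandas as pd


def append(accumulator, df):
    # first call: nothing accumulated yet
    if accumulator is None:
        return df
    # later calls: stack the new frame below the accumulator with a fresh index
    return pd.concat([accumulator, df], ignore_index=True)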
def extract_altered_rivers(gdb_path, target_crs):
    """Extract NHDArea records that likely indicate altered riverways.

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs : GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.

    Returns
    -------
    GeoDataFrame
    """
    df = read_dataframe(
        gdb_path,
        layer="NHDArea",
        columns=COLS,
        force_2d=True,
        where=f"FType in {tuple(FTYPES)}",
    )

    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df["id"] = df.index.values.astype("uint32") + 1

    if len(df):
        df = df.to_crs(target_crs)
        df.geometry = make_valid(df.geometry.values.data)

    return df
def extract_marine(gdb_path, target_crs):
    """Extract areas from NHDWaterbody and NHDArea that are marine connected.

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs : GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.

    Returns
    -------
    GeoDataFrame
    """
    print("Reading marine areas...")
    area = read_dataframe(
        gdb_path,
        layer="NHDArea",
        columns=COLS,
        force_2d=True,
        where=f"FType in {tuple(AREA_FTYPES)}",
    )

    wb = read_dataframe(
        gdb_path,
        layer="NHDWaterbody",
        columns=COLS,
        force_2d=True,
        # use a join-based expression because tuple formatting breaks when the
        # list has a single element
        where=f"FType in ({','.join([str(t) for t in WB_FTYPES])})",
    )

    df = area.append(wb)

    if len(df):
        df = explode(df.to_crs(target_crs))

    return df
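# A minimal illustration of why the join-based where clause above is used:
# Python renders a single-element tuple with a trailing comma, which is not
# valid SQL, while the join form is. (ftypes here is a hypothetical example.)
ftypes = [312]
print(f"FType in {tuple(ftypes)}")                        # FType in (312,) -> invalid SQL
print(f"FType in ({','.join(str(t) for t in ftypes)})")   # FType in (312)  -> valid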
def __call__(self, part):
    path, row_offset, batch_size = part

    import pyogrio

    df = pyogrio.read_dataframe(
        path,
        layer=self.layer,
        columns=self.columns,
        read_geometry=self.read_geometry,
        skip_features=row_offset,
        max_features=batch_size,
    )
    df.index = RangeIndex(row_offset, row_offset + batch_size)
    return df
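# A minimal sketch of the paging pattern used above: pyogrio's skip_features /
# max_features read a contiguous slice of rows, so successive calls with
# increasing offsets partition the layer. ("example.gpkg" and "roads" are
# hypothetical names.)
import pyogrio

first_100 = pyogrio.read_dataframe("example.gpkg", layer="roads", max_features=100)
next_100 = pyogrio.read_dataframe(
    "example.gpkg", layer="roads", skip_features=100, max_features=100
)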
def extract_waterbodies(gdb_path, target_crs):
    """Extract waterbodies from NHDPlusHR data product that are not one of the
    excluded types (e.g., estuary, playa, swamp/marsh).

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs : GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.

    Returns
    -------
    GeoDataFrame
    """
    print("Reading waterbodies")
    df = read_dataframe(
        gdb_path,
        layer="NHDWaterbody",
        columns=WATERBODY_COLS,
        force_2d=True,
        where=f"FType not in {tuple(WATERBODY_EXCLUDE_FTYPES)}",
    )
    print("Read {:,} waterbodies".format(len(df)))

    # Convert multipolygons to polygons;
    # those we checked that are true multipolygons are errors
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)
    df.geometry = make_valid(df.geometry.values.data)

    print("projecting to target projection")
    df = df.to_crs(target_crs)

    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df.AreaSqKm = df.AreaSqKm.astype("float32")
    df.FType = df.FType.astype("uint16")

    ### Add calculated fields
    df["wbID"] = df.index.values.astype("uint32") + 1

    return df
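# A small illustration of the multipolygon handling above (assuming the pygeos
# API used throughout this code): get_geometry(geom, 0) returns the first part
# of a multi-part geometry, collapsing multipolygons to polygons; single-part
# geometries pass through unchanged when index is 0.
import pygeos as pg

mp = pg.multipolygons([pg.box(0, 0, 1, 1), pg.box(2, 2, 3, 3)])
first = pg.get_geometry(mp, 0)
print(pg.get_type_id(first))  # 3 -> Polygon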
    )

    raw_data = vrt.read()[0]
    raw_data = raw_data.astype("uint16")

    write_raster(
        src_dir / "midse_raw.tif",
        raw_data,
        transform=transform,
        crs=DATA_CRS,
        nodata=nodata,
    )

print("Reclassifying data...")
table = read_dataframe(src_dir / "MidSE_2020_CVI.img.vat.dbf")
table["priority"] = table.M_SEBcode.astype("uint8")

remap_table = table[["Value", "priority"]].values.astype("uint16")
data = remap(raw_data, remap_table, nodata=nodata)
print("Done reclassifying values")

# coerce down to uint8, set nodata to smaller value
data[data == nodata] = 255
nodata = 255

# apply mask
data = np.where(mask == 1, data, nodata).astype("uint8")

write_raster(outfilename,
waterbodies = None
rivers = None
for huc8 in units[huc2]:
    print(f"Reading NWI data for {huc8}")

    filename = src_dir.resolve() / f"{huc8}.zip"
    if not filename.exists():
        print(f"WARNING: {filename} not found")
        continue

    # Extract and merge lakes and wetlands
    df = read_dataframe(
        f"/vsizip/{filename}/HU8_{huc8}_Watershed/HU8_{huc8}_Wetlands.shp",
        columns=["ATTRIBUTE", "WETLAND_TY"],
        where="WETLAND_TY in ('Lake', 'Pond', 'Riverine')",
    ).rename(columns={"ATTRIBUTE": "nwi_code", "WETLAND_TY": "nwi_type"})

    # some geometries are invalid, filter them out
    df = df.loc[pg.is_geometry(df.geometry.values.data)].copy()
    if not len(df):
        continue

    df = df.to_crs(CRS)

    # Mark structurally altered types: codes ending with
    # x (excavated), d (ditched), r (artificial substrate), h (diked);
    # strip any terminal numbers then take last character
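    # A hedged sketch of the step described in the comment above (the original
    # code is truncated here): strip trailing digits from the NWI code, then
    # flag codes whose last remaining character is one of the modifiers.
    modifiers = {"x", "d", "r", "h"}
    base = df.nwi_code.str.rstrip("0123456789")
    df["altered"] = base.str[-1:].isin(modifiers)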
def bounds_to_poly(bounds):
    xmin, ymin, xmax, ymax = bounds
    return Polygon(
        [[xmin, ymin], [xmin, ymax], [xmax, ymax], [xmax, ymin], [xmin, ymin]]
    )


def to_titlecase(text):
    return " ".join([t.capitalize() for t in text.split(" ")])


boundaries_dir = Path("data/boundaries")
src_dir = boundaries_dir / "src"

### Process admin boundaries
print("Extracting admin boundaries...")
us_df = read_dataframe(src_dir / "us_state_wgs84.shp").rename(
    columns={"NAME": "admin1_name"}
)
us_df["admin1"] = "US-" + us_df.STUSPS
us_df["country"] = "US"

ca_df = read_dataframe(src_dir / "canada_province_wgs84.shp").rename(
    columns={"PRENAME": "admin1_name"}
)
ca_df["admin1"] = "CA-" + ca_df.PREABBR.str.replace(r"\.", "", regex=True)
ca_df["country"] = "CA"

# already in EPSG:4326, but needs to be simplified
mx_df = read_dataframe(src_dir / "mexico_state.shp")
mx_df.geometry = mx_df.geometry.simplify(0.001)
mx_df["admin1"] = "MX-" + mx_df.NUM_EDO
mx_df["admin1_name"] = mx_df.ENTIDAD.apply(to_titlecase)
mx_df = mx_df.dissolve(by="NUM_EDO")
async def create_custom_report(ctx, zip_filename, dataset, layer, name=""):
    """Create a Blueprint report for a user-uploaded GIS file contained in a zip.

    Zip must contain either a shapefile or a file geodatabase.

    Parameters
    ----------
    ctx : job context
    zip_filename : str
        full path to zip filename
    dataset : str
        full path to dataset within zip file
    layer : str
        name of layer within dataset
    name : str, optional (default: "")
        Name of area of interest (included in output report)

    Returns
    -------
    (str, list)
        path to output file, list of error messages

    Raises
    ------
    DataError
        Raised if bounds are too large or if area of interest doesn't overlap
        SA region
    """
    errors = []

    await set_progress(ctx["job_id"], 0, "Loading data")

    path = f"/vsizip/{zip_filename}/{dataset}"
    df = read_dataframe(path, layer=layer)
    geometry = pg.make_valid(df.geometry.values.data)

    await set_progress(ctx["job_id"], 5, "Preparing area of interest")

    # dissolve
    geometry = np.asarray([pg.union_all(geometry)])

    geo_geometry = to_crs(geometry, df.crs, GEO_CRS)
    bounds = pg.total_bounds(geo_geometry)

    # estimate area
    extent_area = (
        pg.area(pg.box(*pg.total_bounds(to_crs(geometry, df.crs, DATA_CRS))))
        * M2_ACRES
    )
    if extent_area >= CUSTOM_REPORT_MAX_ACRES:
        raise DataError(
            f"The bounding box of your area of interest is too large ({extent_area:,.0f} acres); it must be < {CUSTOM_REPORT_MAX_ACRES:,.0f} acres."
        )

    await set_progress(
        ctx["job_id"], 10, "Calculating results (this might take a while)"
    )

    # calculate results; data must be in DATA_CRS
    print("Calculating results...")
    results = CustomArea(geometry, df.crs, name).get_results()

    if results is None:
        raise DataError("area of interest does not overlap Southeast Blueprint")

    if name:
        results["name"] = name

    has_urban = "proj_urban" in results and results["proj_urban"][4] > 0
    has_slr = "slr" in results
    has_ownership = "ownership" in results
    has_protection = "protection" in results

    # compile indicator IDs across all inputs
    indicators = []
    for input_area in results["inputs"]:
        for ecosystem in input_area.get("ecosystems", []):
            indicators.extend([i["id"] for i in ecosystem["indicators"]])

    await set_progress(ctx["job_id"], 25, "Creating maps (this might take a while)")

    print("Rendering maps...")
    maps, scale, map_errors = await render_maps(
        bounds,
        geometry=geo_geometry[0],
        input_ids=results["input_ids"],
        indicators=indicators,
        urban=has_urban,
        slr=has_slr,
        ownership=has_ownership,
        protection=has_protection,
    )

    if map_errors:
        log.error(f"Map rendering errors: {map_errors}")
        if "basemap" in map_errors:
            errors.append("Error creating basemap for all maps")

        if "aoi" in map_errors:
            errors.append("Error rendering area of interest on maps")

        if set(map_errors.keys()).difference(["basemap", "aoi"]):
            errors.append("Error creating one or more maps")

    await set_progress(
        ctx["job_id"], 75, "Creating PDF (this might take a while)", errors=errors
    )

    results["scale"] = scale

    pdf = create_report(maps=maps, results=results)

    await set_progress(ctx["job_id"], 95, "Nearly done", errors=errors)

    fp, name = tempfile.mkstemp(suffix=".pdf", dir=TEMP_DIR)
    with open(fp, "wb") as out:
        out.write(pdf)

    await set_progress(ctx["job_id"], 100, "All done!", errors=errors)

    log.debug(f"Created PDF at: {name}")

    return name, errors
outer_huc4 = outer_huc4.loc[
    outer_huc4.HUC4.isin(keep_huc4) & (outer_huc4.clip_km2 >= 2.5)
].copy()
outer_huc4 = dissolve(outer_huc4, by="HUC4", agg={"HUC2": "first"}).reset_index(
    drop=True
)
outer_huc4.to_feather(out_dir / "outer_huc4.feather")
write_dataframe(outer_huc4, out_dir / "outer_huc4.gpkg")

### Counties - within HUC4 bounds
print("Processing counties")
fips = sorted(state_df.STATEFIPS.unique())

county_df = (
    read_dataframe(
        county_filename,
        columns=["NAME", "GEOID", "STATEFP"],
    )
    .to_crs(CRS)
    .rename(columns={"NAME": "County", "GEOID": "COUNTYFIPS", "STATEFP": "STATEFIPS"})
)

# keep only those within the region HUC4 outer boundary
tree = pg.STRtree(county_df.geometry.values.data)
ix = np.unique(
    tree.query_bulk(huc4_df.geometry.values.data, predicate="intersects")[1]
)
ix.sort()

county_df = county_df.iloc[ix].reset_index(drop=True)
county_df.geometry = pg.make_valid(county_df.geometry.values.data)

# keep larger set for spatial joins
county_df.to_feather(out_dir / "counties.feather")
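# A minimal sketch of the query_bulk pattern used above (pygeos API):
# query_bulk returns a 2 x N array of [source index, tree index] pairs, so
# taking row [1] (as above) yields the indices of tree geometries (here:
# counties) that match the predicate against any source geometry.
import numpy as np
import pygeos as pg

demo_tree = pg.STRtree(np.array([pg.box(0, 0, 1, 1), pg.box(5, 5, 6, 6)]))
pairs = demo_tree.query_bulk(
    np.array([pg.box(0.5, 0.5, 2, 2)]), predicate="intersects"
)
print(pairs)  # [[0] [0]] -> source 0 intersects tree geometry 0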
data = vrt.read()[0]

# convert to uint8
data = np.where(data == int(rc.nodata), 255, data)

# clip to mask
data = np.where(mask == 1, data, 255).astype("uint8")

tnc_data = data.copy()

# Reclassify to incremental values based on lookup table
print("Reclassifying TNC data...")
table = read_dataframe(
    src_dir / "Resilient_and_Connected20180308.tif.vat.dbf",
    read_geometry=False,
    columns=["Value"],
)
for i, row in table.iterrows():
    if i == row.Value:
        continue
    data[data == row.Value] = i

write_raster(tnc_outfilename, data, transform=transform, crs=DATA_CRS, nodata=255)
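# An observation (not part of the original script): the in-place loop above
# can misassign values if a later row's Value equals an index i that was
# already written into `data`. A collision-free variant writes into a copy:
remapped = data.copy()
for i, row in table.iterrows():
    if i != row.Value:
        # read from the untouched original, write into the copy
        remapped[data == row.Value] = i
data = remapped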
def convert_census_gdb(
    file,
    year=None,
    layers=None,
    level="bg",
    save_intermediate=True,
    combine=True,
    output_dir=".",
):
    """Convert file geodatabases from Census into (set of) parquet files.

    Parameters
    ----------
    file : str
        path to file geodatabase
    year : str
        year that the data should be named by. If None, will try to infer from
        the filename based on convention from the Census Bureau FTP server
    layers : list, optional
        set of layers to extract from geodatabase. If None (default), all
        layers will be extracted
    level : str, optional
        geographic level of data ('bg' for blockgroups or 'tr' for tract),
        by default "bg"
    save_intermediate : bool, optional
        if True, each layer will be stored separately as a parquet file,
        by default True
    combine : bool, optional
        whether to store and concatenate intermediate dataframes, by default True
    output_dir : str, optional
        path to directory where parquet files will be written, by default "."
    """
    try:
        import pyogrio as ogr
    except ImportError:
        raise ImportError(
            "this function requires the `pyogrio` package\n"
            "`conda install pyogrio`"
        )

    if not layers:  # grab them all except the metadata
        year_suffix = file.split(".")[0].split("_")[1][-2:]
        meta_str = f"{level.upper()}_METADATA_20{year_suffix}"
        layers = [layer[0] for layer in ogr.list_layers(file)]
        if meta_str in layers:
            layers.remove(meta_str)

    if not year:
        # make a strong assumption about the name of the file coming from census
        year = file.split("_")[1]

    tables = []
    for i in layers:
        print(i)
        df = ogr.read_dataframe(file, layer=i).set_index("GEOID")
        if "ACS_" in i:
            df = gpd.GeoDataFrame(df)
        else:
            df = df[df.columns[df.columns.str.contains("e")]]
            df.columns = pd.Series(df.columns).apply(reformat_acs_vars)
        df = df.dropna(axis=1, how="all")
        if combine:
            tables.append(df)
        if save_intermediate:
            df.to_parquet(
                pathlib.PurePath(output_dir, f"acs_{year}_{i}_{level}.parquet")
            )
    if combine:
        df = pd.concat(tables, axis=1)
        if f"ACS_{year}_5YR_{level.upper()}" in layers:
            df = gpd.GeoDataFrame(df)
        df.to_parquet(pathlib.PurePath(output_dir, f"acs_{year}_{level}.parquet"))
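# A hedged sketch of the reformat_acs_vars helper referenced above (its
# implementation is not shown in this excerpt): it plausibly converts
# geodatabase estimate column names such as "B01001e1" into the canonical ACS
# variable form "B01001_001E".
def reformat_acs_vars(col):
    parts = col.split("e")
    # zero-pad the estimate number to three digits and restore the E suffix
    return parts[0] + "_" + parts[1].rjust(3, "0") + "E"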
# NOTE: crossings are already de-duplicated against each other and against
# barriers
crossings = pd.read_feather(
    src_dir / "road_crossings.feather", columns=["id"] + SUMMARY_UNITS
)

# Calculate summary statistics for each type of summary unit
# These are joined to vector tiles
mbtiles_files = []
for unit in SUMMARY_UNITS:
    print(f"processing {unit}")

    if unit == "State":
        units = read_dataframe(
            bnd_dir / "region_states.gpkg", columns=["id"], read_geometry=False
        ).set_index("id")
    elif unit == "COUNTYFIPS":
        units = read_dataframe(
            bnd_dir / "region_counties.gpkg", columns=["id"], read_geometry=False
        ).set_index("id")
    else:
        units = pd.read_feather(
            bnd_dir / f"{unit}.feather", columns=[unit]
        ).set_index(unit)

    dam_stats = (
        dams[[unit, "id", "OnNetwork", "Recon"]]
        .groupby(unit)
        .agg({"id": "count", "OnNetwork": "sum", "Recon": "sum"})
        .rename(
from pathlib import Path
import os
import warnings

from pyogrio import read_dataframe, write_dataframe

from analysis.constants import DATA_CRS

warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

src_dir = Path("source_data/caribbean")
out_dir = Path("data/inputs/indicators/caribbean")
tile_dir = Path("data/for_tiles")

if not out_dir.exists():
    os.makedirs(out_dir)

df = (
    read_dataframe(
        src_dir / "Watershed_Ranking_PR.shp", columns=["Metric_Ran", "HUC_10"]
    )
    .rename(columns={"Metric_Ran": "carrank", "HUC_10": "HUC10"})
    .to_crs(DATA_CRS)
)

df.to_feather(out_dir / "caribbean.feather")

# for tiles
write_dataframe(
    df[["geometry", "carrank"]], tile_dir / "caribbean.geojson", driver="GeoJSONSeq"
)
src_dir = barriers_dir / "source"
master_dir = barriers_dir / "master"
qa_dir = barriers_dir / "qa"
network_dir = data_dir / "networks/21/dams"

gdb = src_dir / "PR_Dec2019.gdb"
dams_layer = "Puerto_Inventory_Dec2019_Indicators"
network_layer = "PR_Functional_River_Network"

if not os.path.exists(network_dir):
    os.makedirs(network_dir)

start = time()

print("Reading Puerto Rico networks...")
networks = pio.read_dataframe(
    gdb, layer=network_layer, as_pygeos=True, columns=NET_COLS
)
src_crs = networks.crs
networks = networks.rename(
    columns={"batNetID": "networkID", "StreamOrde": "streamorder"}
).set_index("networkID")

# convert to LineStrings
networks.geometry = pg.get_geometry(networks.geometry, 0)

# project to CRS
networks.geometry = to_crs(networks.geometry, src_crs, CRS)
networks["length"] = pg.length(networks.geometry)
networks["miles"] = networks.length * 0.000621371
data_dir = Path("data") out_dir = data_dir / "boundaries" wbd_gdb = data_dir / "nhd/source/wbd/WBD_National_GDB/WBD_National_GDB.gdb" huc4_df = gp.read_feather(out_dir / "huc4.feather") huc4 = sorted(huc4_df.HUC4.unique()) sarp_huc4_df = gp.read_feather(out_dir / "sarp_huc4.feather") sarp_huc4 = sorted(sarp_huc4_df.HUC4.unique()) ### Extract HUC6 within HUC4 print("Processing HUC6...") huc6_df = (read_dataframe( wbd_gdb, layer="WBDHU6", columns=["huc6", "name"], where=f"SUBSTR(huc6, 0, 4) IN {tuple(huc4)}", ).rename(columns={ "huc6": "HUC6" }).to_crs(CRS)) huc6_df.to_feather(out_dir / "huc6.feather") write_dataframe(huc6_df.rename(columns={"HUC6": "id"}), out_dir / "huc6.gpkg") huc6_df["HUC4"] = huc6_df.HUC6.str[:4] sarp_huc6_df = huc6_df.loc[huc6_df.HUC4.isin(sarp_huc4)].drop(columns=["HUC4"]) write_dataframe(sarp_huc6_df.rename(columns={"HUC6": "id"}), out_dir / "sarp_huc6.gpkg") sarp_huc6_df.to_feather(out_dir / "sarp_huc6.feather") ### Extract HUC8 within HUC4 print("Processing HUC8...") huc8_df = (read_dataframe(
from analysis.constants import CRS
from analysis.lib.geometry import dissolve, explode

warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

data_dir = Path("data")
nhd_dir = data_dir / "nhd/raw"  # intentionally use raw flowlines
src_dir = data_dir / "states/sc"
huc2 = "03"

print("Reading waterbodies...")
df = (
    read_dataframe(
        src_dir / "SCBreakline.gdb",
        layer="Waterbody",
        force_2d=True,
        columns=[],
    )
    .rename(columns={"NAME": "name"})
    .to_crs(CRS)
)

print("Reading flowlines...")
flowlines = gp.read_feather(nhd_dir / huc2 / "flowlines.feather", columns=[])
tree = pg.STRtree(flowlines.geometry.values.data)

print(f"Extracted {len(df):,} SC waterbodies")

left, right = tree.query_bulk(df.geometry.values.data, predicate="intersects")
df = df.iloc[np.unique(left)].reset_index(drop=True)
print(f"Kept {len(df):,} that intersect flowlines")

df = explode(df)
# make valid
start = time()

merged = None
for huc2 in units.keys():
    print(f"Processing floodplain stats for {huc2}")

    if huc2 == "02":
        filename = region02_gdb_filename
        layer = "Region002_Catchments_Natl_LCStats"
    else:
        filename = gdb_filename
        layer = layers[huc2]

    df = read_dataframe(filename, layer=layer)
    df["HUC2"] = huc2
    df["NHDPlusID"] = df.NHDIDSTR.astype("uint64")

    cols = [c for c in df.columns if c.startswith("VALUE_")]
    natural_cols = [c for c in cols if int(c.split("_")[1]) in NATURAL_TYPES]

    df["floodplain_km2"] = df[cols].sum(axis=1) * 1e-6
    df["nat_floodplain_km2"] = df[natural_cols].sum(axis=1) * 1e-6

    merged = append(
        merged, df[["NHDPlusID", "HUC2", "nat_floodplain_km2", "floodplain_km2"]]
    )

merged.reset_index(drop=True).to_feather(src_dir / "floodplain_stats.feather")
from analysis.constants import MASK_FACTOR
from analysis.lib.pygeos_util import explode, to_dict
from analysis.lib.raster import add_overviews, create_lowres_mask

warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

src_dir = Path("source_data/blueprint")
data_dir = Path("data")
out_dir = data_dir / "inputs"
bnd_dir = data_dir / "boundaries"
json_dir = Path("constants")

blueprint_filename = out_dir / "se_blueprint2021.tif"

df = read_dataframe(
    src_dir / "SE_Blueprint_2021_Vectors.gdb", layer="InputAreas_SECAS_2021_20211117"
)

# some areas are null inputs, drop them
df = df.loc[df.InputOverlapAreasSECAS_InputUsedIn2021.notnull()].copy()

# making valid takes a really long time, and is probably not necessary

df["inputs"] = df.InputOverlapAreasSECAS_InputUsedIn2021.str.lower().apply(
    lambda x: x.replace("tx chat", "txchat")
    .replace("ok chat", "okchat")
    .replace(" ", "")
    .replace(";", ",")
)

# split parts for easier indexing
df = explode(df).reset_index()
df = df[["inputs", "geometry"]].copy()

inputs = df.inputs.unique()
warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

data_dir = Path("data")
out_dir = data_dir / "boundaries"
ui_dir = Path("ui/data")

state_filename = data_dir / "boundaries/source/tl_2019_us_state/tl_2019_us_state.shp"
wbd_gdb = data_dir / "nhd/source/wbd/WBD_National_GDB/WBD_National_GDB.gdb"

### Construct region and SARP boundaries from states
print("Processing states...")
state_df = (
    read_dataframe(
        state_filename,
        columns=["STUSPS", "STATEFP", "NAME"],
    )
    .to_crs(CRS)
    .rename(columns={"STUSPS": "id", "NAME": "State", "STATEFP": "STATEFIPS"})
)
state_df.geometry = pg.make_valid(state_df.geometry.values.data)

# save all states for spatial joins
state_df.to_feather(out_dir / "states.feather")

state_df = state_df.loc[state_df.id.isin(STATES.keys())].copy()
state_df.to_feather(out_dir / "region_states.feather")
write_dataframe(
    state_df[["State", "geometry"]].rename(columns={"State": "id"}),
    out_dir / "region_states.gpkg",
)
out_dir = data_dir / "inputs/indicators/chat" gis_dir = data_dir / "indicators/chat" tile_dir = data_dir / "for_tiles" if not out_dir.exists(): os.makedirs(out_dir) if not gis_dir.exists(): os.makedirs(gis_dir) inputs_df = gp.read_feather(data_dir / "inputs/boundaries/input_areas.feather") # Is in EPSG:5070 but not recognized as such print("Reading CHAT data...") df = (read_dataframe(src_dir / "WAFWA_CHAT_Lower48.shp").set_crs(DATA_CRS).drop( columns=["ls_cond"]).rename(columns=field_map).rename( columns={"chat_rank": "chatrank"})) for col in chat_fields: df[col] = df[col].astype("uint8") df = df.drop(columns=["hexagon_id"]) ### Find the CHAT units that intersect with OK / TX input areas # Use centerpoints, since input area roughly follows edges of hexes points = pg.centroid(df.geometry.values.data) tree = pg.STRtree(points) for state in ["ok", "tx"]: print(f"Processing {state} CHAT...") input_area = pg.union_all(
# else:
#     merged = merged.append(df, ignore_index=True)

# df = merged
# df["geometry"] = from_wkb(df.geometry)

# read specific states
states = ",".join(f"'{s}'" for s in SE_STATES + ["UNKF"])
df = read_dataframe(
    src_dir / "pad_us2_1.gpkg",
    columns=[
        "Category",
        "State_Nm",
        "Own_Type",
        "GAP_Sts",
        "Loc_Nm",
        "Loc_Own",
        "Agg_Src",
    ],
    where=f"State_Nm in ({states})",
)

# set the CRS; it is the same as EPSG:5070 but not recognized properly
df = df.set_crs(DATA_CRS)

# drop BOEM lease block groups
df = df.loc[df.Agg_Src != "USGS_PADUS2_0Marine_BOEM_Block_Dissolve"].drop(
    columns=["Agg_Src"]
)

tree = pg.STRtree(df.geometry.values.data)
print("Reading road crossings") # rename columns to match small barriers # NOTE: tiger2020_feature_names is a combination of multiple road names df = read_dataframe( src_dir / "stream_crossings_united_states_feb_2022.gpkg", layer="stream_crossing_sites", columns=[ "stream_crossing_id", "tiger2020_feature_names", "nhdhr_gnis_stream_name", "crossing_type", ], ).rename( columns={ "tiger2020_feature_names": "Road", "nhdhr_gnis_stream_name": "Stream", "stream_crossing_id": "SARPID", "crossing_type": "crossingtype", } ) print(f"Read {len(df):,} road crossings") # project HUC4 to match crossings huc4 = gp.read_feather(boundaries_dir / "huc4.feather", columns=["geometry"]).to_crs( df.crs )
endangered_df = listed_df.loc[listed_df.official_status == "E"].SNAME.unique()
threatened_df = listed_df.loc[listed_df.official_status == "T"].SNAME.unique()

### Process trout data (not necessarily T/E/SGCN, just used for filtering)
trout = read_dataframe(
    gdb,
    layer=trout_layer,
    read_geometry=False,
    columns=[
        "HUC12_Code",
        "Species_Name",
        "Common_Name",
        "Historical",
        "Federal_Status",
        "State_Status",
        "SGCN_Listing",
        "Regional_SGCN",
    ],
).rename(
    columns={
        "HUC12_Code": "HUC12",
        "Species_Name": "SNAME",
        "Common_Name": "CNAME",
        "Federal_Status": "federal",
        "State_Status": "state",
        "SGCN_Listing": "sgcn",
        "Regional_SGCN": "regional",
    }
)

# drop Trout-perch
trout = trout.loc[trout.CNAME != "Trout-perch"].copy()
def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):
    """Extract flowlines from the NHDPlusHR data product, join to the VAA
    table, extract joins between flowlines, and filter out coastlines.

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs : GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.
    extra_flowline_cols : list
        List of extra field names to extract from NHDFlowline layer

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (flowlines, joins)
    """
    ### Read in flowline data and convert to data frame
    print("Reading flowlines")
    flowline_cols = FLOWLINE_COLS + extra_flowline_cols
    df = read_dataframe(
        gdb_path,
        layer="NHDFlowline",
        force_2d=True,
        columns=flowline_cols,
    )

    # Index on NHDPlusID for easy joins to other NHD data
    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df = df.set_index(["NHDPlusID"], drop=False)

    # convert MultiLineStrings to LineStrings (all have a single linestring)
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)

    print("making valid and projecting to target projection")
    df.geometry = make_valid(df.geometry.values.data)
    df = df.to_crs(target_crs)
    print(f"Read {len(df):,} flowlines")

    ### Read in VAA and convert to data frame
    # NOTE: not all records in Flowlines have corresponding records in VAA;
    # we drop those that do not since we need these fields.
    print("Reading VAA table and joining...")
    vaa_df = read_dataframe(gdb_path, layer="NHDPlusFlowlineVAA", columns=VAA_COLS)
    vaa_df.NHDPlusID = vaa_df.NHDPlusID.astype("uint64")
    vaa_df = vaa_df.set_index(["NHDPlusID"])
    df = df.join(vaa_df, how="inner")
    print(f"{len(df):,} features after join to VAA")

    # Simplify data types for smaller files and faster IO
    df.FType = df.FType.astype("uint16")
    df.FCode = df.FCode.astype("uint16")
    df.StreamOrde = df.StreamOrde.astype("uint8")
    df.Slope = df.Slope.astype("float32")
    df.MinElevSmo = df.MinElevSmo.astype("float32")
    df.MaxElevSmo = df.MaxElevSmo.astype("float32")

    ### Read in flowline joins
    print("Reading flowline joins")
    join_df = read_dataframe(
        gdb_path,
        layer="NHDPlusFlow",
        read_geometry=False,
        columns=["FromNHDPID", "ToNHDPID"],
    ).rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
    join_df.upstream = join_df.upstream.astype("uint64")
    join_df.downstream = join_df.downstream.astype("uint64")

    ### Fix errors in NHD
    # Some valid joins are marked as terminals (downstream == 0) in NHD; we
    # need to backfill the missing join info. To do this, we intersect all
    # terminals back with flowlines, dropping any that are themselves
    # terminals. Then we calculate the distance to the upstream point of the
    # intersected line, and the upstream point of the next segment downstream.
    # We use the ID of whichever one is closer (must be within 100m).
    ix = join_df.loc[join_df.downstream == 0].upstream.unique()

    # get last point, is furthest downstream
    tmp = df.loc[df.index.isin(ix), ["geometry"]].copy()
    tmp["geometry"] = pg.get_point(tmp.geometry.values.data, -1)

    # only search against other flowlines
    target = df.loc[~df.index.isin(ix)]
    tree = pg.STRtree(target.geometry.values.data)

    # search within a tolerance of 0.001; these are very, very close
    left, right = tree.nearest_all(tmp.geometry.values.data, max_distance=0.001)

    pairs = pd.DataFrame(
        {
            "left": tmp.index.take(left),
            "right": target.index.take(right),
            "source": tmp.geometry.values.data.take(left),
            # take upstream / downstream points of matched lines
            "upstream_target": pg.get_point(df.geometry.values.data.take(right), 0),
        }
    )

    # drop any pairs where the other side is also a terminal (these appear as
    # V-shaped tiny networks that need to be left as is)
    pairs = pairs.loc[~pairs.right.isin(ix)]

    # calculate the next segment downstream (only keep the first if multiple;
    # possible logic issue)
    next_downstream = (
        join_df.loc[(join_df.upstream != 0) & (join_df.downstream != 0)]
        .groupby("upstream")
        .downstream.first()
    )
    pairs["next_downstream"] = pairs.right.map(next_downstream)
    pairs.loc[pairs.next_downstream.notnull(), "downstream_target"] = pg.get_point(
        df.loc[
            pairs.loc[pairs.next_downstream.notnull()].next_downstream
        ].geometry.values.data,
        0,
    )

    pairs["upstream_dist"] = pg.distance(pairs.source, pairs.upstream_target)
    ix = pairs.next_downstream.notnull()
    pairs.loc[ix, "downstream_dist"] = pg.distance(
        pairs.loc[ix].source, pairs.loc[ix].downstream_target
    )

    # this ignores any NaN
    pairs["dist"] = pairs[["upstream_dist", "downstream_dist"]].min(axis=1)

    # discard any that are too far (>100m)
    pairs = pairs.loc[pairs.dist <= 100].copy()

    # sort by distance to upstream point of matched flowline; this allows us
    # to sort on those then dedup to calculate a new downstream ID for this
    # source line
    pairs = pairs.sort_values(by=["left", "dist"])

    # set the right value to the next downstream if it is closer
    # (this also ignores NaN)
    ix = pairs.downstream_dist < pairs.upstream_dist
    pairs.loc[ix, "right"] = pairs.loc[ix].next_downstream.astype("uint64")

    ids = pairs.groupby("left").right.first()

    if len(ids):
        # save to send to NHD
        pd.DataFrame({"NHDPlusID": ids.index.unique()}).to_csv(
            f"/tmp/{gdb_path.stem}_bad_joins.csv", index=False
        )

        ix = join_df.upstream.isin(ids.index)
        join_df.loc[ix, "downstream"] = join_df.loc[ix].upstream.map(ids)

        print(
            f"Repaired {len(ids):,} joins marked by NHD as terminals but actually joined to flowlines"
        )

    # set join types to make it easier to track
    join_df["type"] = "internal"  # set default
    # upstream-most origin points
    join_df.loc[join_df.upstream == 0, "type"] = "origin"
    # downstream-most termination points
    join_df.loc[join_df.downstream == 0, "type"] = "terminal"

    ### Filter out coastlines and update joins
    # WARNING: we tried filtering out pipelines (FType == 428). It doesn't
    # work properly; there are many that go through dams and are thus needed
    # to calculate network connectivity and gain of removing a dam.
print("Filtering out coastlines...") coastline_idx = df.loc[df.FType == 566].index df = df.loc[~df.index.isin(coastline_idx)].copy() print(f"{len(df):,} features after removing coastlines") # remove any joins that have coastlines as upstream # these are themselves coastline segments join_df = join_df.loc[~join_df.upstream.isin(coastline_idx)].copy() # set the downstream to 0 for any that join coastlines # this will enable us to mark these as downstream terminals in # the network analysis later join_df["marine"] = join_df.downstream.isin(coastline_idx) join_df.loc[join_df.marine, "downstream"] = 0 join_df.loc[join_df.marine, "type"] = "terminal" # drop any duplicates (above operation sets some joins to upstream and downstream of 0) join_df = join_df.drop_duplicates(subset=["upstream", "downstream"]) ### Filter out underground connectors ix = df.loc[df.FType == 420].index print("Removing {:,} underground conduits".format(len(ix))) df = df.loc[~df.index.isin(ix)].copy() join_df = remove_joins( join_df, ix, downstream_col="downstream", upstream_col="upstream" ) ### Label loops for easier removal later # WARNING: loops may be very problematic from a network processing standpoint. # Include with caution. print("Identifying loops") df["loop"] = (df.StreamOrde != df.StreamCalc) | (df.FlowDir.isnull()) idx = df.loc[df.loop].index join_df["loop"] = join_df.upstream.isin(idx) | join_df.downstream.isin(idx) ### Add calculated fields # Set our internal master IDs to the original index of the file we start from # Assume that we can always fit into a uint32, which is ~400 million records # and probably bigger than anything we could ever read in df["lineID"] = df.index.values.astype("uint32") + 1 join_df = ( join_df.join(df.lineID.rename("upstream_id"), on="upstream") .join(df.lineID.rename("downstream_id"), on="downstream") .fillna(0) ) for col in ("upstream", "downstream"): join_df[col] = join_df[col].astype("uint64") for col in ("upstream_id", "downstream_id"): join_df[col] = join_df[col].astype("uint32") ### Calculate size classes print("Calculating size class") drainage = df.TotDASqKm df.loc[drainage < 10, "sizeclass"] = "1a" df.loc[(drainage >= 10) & (drainage < 100), "sizeclass"] = "1b" df.loc[(drainage >= 100) & (drainage < 518), "sizeclass"] = "2" df.loc[(drainage >= 518) & (drainage < 2590), "sizeclass"] = "3a" df.loc[(drainage >= 2590) & (drainage < 10000), "sizeclass"] = "3b" df.loc[(drainage >= 10000) & (drainage < 25000), "sizeclass"] = "4" df.loc[drainage >= 25000, "sizeclass"] = "5" # Calculate length and sinuosity print("Calculating length and sinuosity") df["length"] = df.geometry.length.astype("float32") df["sinuosity"] = calculate_sinuosity(df.geometry.values.data).astype("float32") # drop columns not useful for later processing steps df = df.drop(columns=["FlowDir", "StreamCalc"]) # calculate incoming joins (have valid upstream, but not in this HUC4) join_df.loc[(join_df.upstream != 0) & (join_df.upstream_id == 0), "type"] = "huc_in" return df, join_df
    CRS,
    DAM_FS_COLS,
)

warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

data_dir = Path("data")
boundaries_dir = data_dir / "boundaries"
src_dir = data_dir / "barriers/source"

# note: drop date fields, they have bogus values anyway
nid = (
    read_dataframe(
        src_dir / "NID_2021.gdb",
    )
    .to_crs(CRS)
    .drop(columns=["EditorDate", "NextFollowUpDate"])
    .set_index("NIDID")
)
cols = [c for c in nid.columns if c in DAM_FS_COLS]
nid = nid[cols + ["geometry"]].rename(columns={"SARPUniqueID": "SARPID"})
nid["ManualReview"] = 0

# load previously snapped dams
prev = gp.read_feather(
    src_dir / "manually_snapped_dams.feather",
)
prev.ManualReview = prev.ManualReview.astype("uint8")
prev = prev.loc[prev.ManualReview.isin([4, 5, 13])].set_index("SARPID")
def read_file(
    path, npartitions=None, chunksize=None, layer=None, columns=None, **kwargs
):
    """
    Read a GIS file into a Dask GeoDataFrame.

    This function requires `pyogrio <https://github.com/geopandas/pyogrio/>`__.

    Parameters
    ----------
    path : str
        The absolute or relative path to the file or URL to be opened.
    npartitions : int, optional
        The number of partitions to create. Either this or `chunksize` should
        be specified.
    chunksize : int, optional
        The number of rows per partition to use. Either this or `npartitions`
        should be specified.
    layer : int or str, optional (default: first layer)
        If an integer is provided, it corresponds to the index of the layer
        within the data source. If a string is provided, it must match the
        name of the layer in the data source. Defaults to first layer in data
        source.
    columns : list-like, optional (default: all columns)
        List of column names to import from the data source. Column names
        must exactly match the names in the data source, and will be returned
        in the order they occur in the data source. To avoid reading any
        columns, pass an empty list-like.
    """
    try:
        import pyogrio
    except ImportError as err:
        raise ImportError(
            "The 'read_file' function requires the 'pyogrio' package, but it is "
            "not installed or does not import correctly."
            f"\nImporting pyogrio resulted in: {str(err)}"
        )

    from dask.layers import DataFrameIOLayer

    # TODO smart inference for a good default partition size?
    if (npartitions is None) == (chunksize is None):
        raise ValueError("Exactly one of npartitions and chunksize must be specified.")

    if "skip_features" in kwargs or "max_features" in kwargs:
        # TODO we currently use those keywords already for reading in each
        # partition (we would need to take those into account for determining
        # the part start/ends)
        raise ValueError(
            "The 'skip_features'/'max_features' keywords are not yet supported"
        )
    if kwargs:
        raise ValueError("Additional pyogrio keywords are not yet supported")

    total_size = pyogrio.read_info(path, layer=layer)["features"]

    if chunksize is None:
        chunksize = int(ceil(total_size / npartitions))

    # TODO this could be inferred from read_info?
    read_geometry = True
    if columns is not None and "geometry" not in columns:
        read_geometry = False
    meta = pyogrio.read_dataframe(
        path, layer=layer, columns=columns, read_geometry=read_geometry, max_features=5
    )

    # Define parts
    parts = []
    row_offset = 0
    divs = [row_offset]

    while row_offset < total_size:
        batch_size = min(chunksize, total_size - row_offset)
        parts.append((path, row_offset, batch_size))
        row_offset += batch_size
        divs.append(row_offset)

    # Set the last division value to be the largest index value in the last partition
    divs[-1] = divs[-1] - 1

    # Create Blockwise layer
    label = "read-file-"
    output_name = label + tokenize(path, chunksize, layer, columns)
    layer = DataFrameIOLayer(
        output_name,
        columns,
        parts,
        FileFunctionWrapper(layer, columns),
        label=label,
    )
    graph = HighLevelGraph({output_name: layer}, {output_name: set()})
    return new_dd_object(graph, output_name, meta, divs)
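# A minimal usage sketch for the function above ("example.gpkg" and "roads"
# are hypothetical names): partition a layer into 4 pieces and compute lazily.
ddf = read_file("example.gpkg", npartitions=4, layer="roads")
print(ddf.npartitions)  # 4
gdf = ddf.compute()     # each partition is read via pyogrio's skip/max features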
nodata = 255

print("Reading and warping Nature's Network...")
with rasterio.open(src_dir / "NaturesNetwork_conservdesign_180625.tif") as src:
    # note: raster does not have nodata set; 0 indicates NODATA (outside
    # extent) and 0 values
    data = extract_window(src, window, transform, nodata=nodata)

# apply input area mask
data = np.where(mask == 1, data, nodata).astype("uint8")

print("Reclassifying data...")
# Remap the raw values to priorities and categories
table = read_dataframe(
    src_dir / "NaturesNetwork_conservdesign_180625.tif.vat.dbf",
    columns=["Value", "Priority", "Descrpt"],
    read_geometry=False,
)
table = table.loc[table.Value > 0].copy()
table.Priority = table.Priority.astype("uint8")
table["category"] = table.Descrpt.str[0].astype("uint8")

remap_table = table[["Value", "Priority"]].values.astype("uint8")
priority_data = remap(data, remap_table, nodata=nodata)
write_raster(outfilename, priority_data, transform, DATA_CRS, nodata=nodata)

print("Adding overviews and masks...")
add_overviews(outfilename)

create_lowres_mask(
tile_dir = data_dir / "for_tiles"

input_area_mask = data_dir / "inputs/input_areas_mask.tif"

if not analysis_dir.exists():
    os.makedirs(analysis_dir)

bnd_df = gp.read_feather(data_dir / "inputs/boundaries/se_boundary.feather")
bnd = bnd_df.geometry.values.data[0]

### Extract HUC12 within boundary
print("Reading source HUC12s...")
merged = None
for huc2 in [2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 21]:
    df = read_dataframe(
        src_dir
        / f"summary_units/huc12/WBD_{huc2:02}_HU2_GDB/WBD_{huc2:02}_HU2_GDB.gdb",
        layer="WBDHU12",
    )[["huc12", "name", "geometry"]].rename(columns={"huc12": "id"})

    if merged is None:
        merged = df
    else:
        merged = merged.append(df, ignore_index=True)

print("Projecting to match SE region data...")
huc12 = merged.to_crs(DATA_CRS)

# select out those within the SE states
print("Selecting HUC12s in region...")