def main(in_dir, in_file, out_dir, out_layer):
    # Main script to convert raster to points and export to Geofeather

    # Define output columns
    out_columns = {
        'x': 'x_utm16n',
        'y': 'y_utm16n',
        'z': 'depth_m',
    }

    # Create point GeoDataFrame from raster
    raster_file_path = os.path.join(in_dir, in_file)
    print("Processing {}".format(raster_file_path))
    start_time = time.time()
    out_gdf = raster_to_points(raster_file_path, out_columns, type='gdf')
    print(out_gdf)
    print("Raster to points conversion time {0}: {1}".format(
        in_file, time_elapsed(start_time)))

    # Export final GeoDataFrame to Geofeather format
    print("Exporting to Geofeather format")
    geofeather_path = os.path.join(out_dir, "{}.feather".format(out_layer))
    start_time = time.time()
    to_geofeather(out_gdf, geofeather_path)
    print("Export execution time for {0}: {1}".format(
        geofeather_path, time_elapsed(start_time)))
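The raster_to_points and time_elapsed helpers called above are not included in these snippets. A minimal sketch of what they might look like, assuming rasterio and geopandas; every name here is hypothetical, not the project's actual implementation:

# Hedged sketch only -- the real helpers are not shown in these snippets.
import time

import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio.transform import xy


def raster_to_points(raster_path, columns, type='gdf'):
    """Convert every valid cell of band 1 to a point record.

    `columns` maps 'x', 'y', 'z' to output column names; `type` selects a plain
    DataFrame ('df') or a GeoDataFrame with point geometry ('gdf').
    """
    with rasterio.open(raster_path) as src:
        band = src.read(1)
        nodata = src.nodata
        mask = np.ones(band.shape, dtype=bool) if nodata is None else band != nodata
        rows, cols = np.where(mask)
        xs, ys = xy(src.transform, rows, cols)
        df = pd.DataFrame({
            columns['x']: xs,
            columns['y']: ys,
            columns['z']: band[rows, cols],
        })
        if type == 'df':
            return df
        return gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df[columns['x']], df[columns['y']]),
            crs=src.crs,
        )


def time_elapsed(start_time):
    """Return elapsed wall-clock time since start_time as a short string."""
    return "{:.2f}s".format(time.time() - start_time)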
def test_points_read_benchmark(tmpdir, points_wgs84, benchmark):
    """Test performance of reading feather files"""

    filename = tmpdir / "points_wgs84.feather"
    to_geofeather(points_wgs84, filename)

    benchmark(from_geofeather, filename)
def save_cut_flowlines(out_dir, flowlines, joins, barrier_joins):
    """Save cut flowline data frames to disk.

    Parameters
    ----------
    out_dir : str
    flowlines : GeoDataFrame
        cut flowlines
    joins : DataFrame
        updated joins
    barrier_joins : DataFrame
        barrier joins
    """

    print("serializing {:,} cut flowlines...".format(len(flowlines)))
    start = time()

    to_geofeather(flowlines.reset_index(drop=True), out_dir / "flowlines.feather")
    serialize_df(joins, out_dir / "flowline_joins.feather", index=False)
    serialize_df(
        barrier_joins.reset_index(drop=True),
        out_dir / "barrier_joins.feather",
        index=False,
    )

    print("Done serializing cut flowlines in {:.2f}s".format(time() - start))
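serialize_df is the non-spatial counterpart to to_geofeather and is not defined in these snippets; a plausible sketch, assuming it simply wraps pandas' feather I/O:

# Hedged sketch only -- serialize_df is a project helper, not part of geofeather.
import pandas as pd


def serialize_df(df, path, index=True):
    """Write a plain (non-spatial) DataFrame to a feather file.

    Feather cannot store a non-default index, so the index is either moved
    into columns (index=True) or dropped (index=False) before writing.
    """
    df = df.reset_index(drop=not index)
    df.to_feather(path)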
def test_points_geofeather_wkt(tmpdir, points_albers_conus_wkt):
    """Confirm that we can round-trip points to / from feather file with a WKT-defined CRS"""

    filename = tmpdir / "points_albers_conus.feather"
    to_geofeather(points_albers_conus_wkt, filename)

    df = from_geofeather(filename)
    assert_frame_equal(df, points_albers_conus_wkt)
    assert df.crs == points_albers_conus_wkt.crs
def process_files(in_chmdir, in_chmfiles, in_dtmdir, in_dtmfiles, out_dir, out_layer):

    ## -------- DTM ------------
    # Create point GeoDataFrames from DTM tif files in file list
    dtm_columns = {
        'x': 'x_utm16n',
        'y': 'y_utm16n',
        'z': 'z_dtm_m',
    }
    out_dtm_gdfs = []
    for file in in_dtmfiles:
        print("Processing {}".format(file))
        dtm_file_path = os.path.join(in_dtmdir, file)
        dtm_gdf = raster_to_points(dtm_file_path, dtm_columns, type='gdf')
        out_dtm_gdfs.append(dtm_gdf)
    print("{} DTM data frames combined".format(len(out_dtm_gdfs)))

    # Concat all the point DTM gdfs into a single gdf
    print("Concatenating DTM geodataframes")
    concat_dtm_gdf = pd.concat(out_dtm_gdfs, axis=0, ignore_index=True)
    # Add a unique ID for each point
    concat_dtm_gdf['gliht_ptidx'] = concat_dtm_gdf.index + 1
    concat_dtm_gdf['gliht_ptid'] = (
        concat_dtm_gdf[dtm_columns['x']].astype(str) + "_" +
        concat_dtm_gdf[dtm_columns['y']].astype(str))
    print(concat_dtm_gdf)

    ## --------- CHM ------------
    # Create point dataframes from CHM tif files in file list
    chm_columns = {
        'x': 'x_utm16n',
        'y': 'y_utm16n',
        'z': 'z_chm_m',
    }
    out_chm_dfs = []
    for file in in_chmfiles:
        print("Processing {}".format(file))
        chm_file_path = os.path.join(in_chmdir, file)
        chm_df = raster_to_points(chm_file_path, chm_columns, type='df')
        out_chm_dfs.append(chm_df)
    print("{} CHM data frames combined".format(len(out_chm_dfs)))

    # Concat all the point CHM dfs into a single df
    print("Concatenating CHM dataframes")
    concat_chm_df = pd.concat(out_chm_dfs, axis=0, ignore_index=True)
    print(concat_chm_df)

    #----------- Join and Export -------------
    # Join the CHM and the DTM dataframes on the coordinate columns
    print("Joining CHM and DTM dataframes")
    chm_dtm_gdf = pd.merge(concat_dtm_gdf, concat_chm_df, how='left',
                           on=[dtm_columns['x'], dtm_columns['y']])
    print(chm_dtm_gdf)

    # Export final GeoDataFrame
    print("Exporting to Geofeather format")
    geofeather_path = os.path.join(out_dir, "{}.feather".format(out_layer))
    to_geofeather(chm_dtm_gdf, geofeather_path)
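A hypothetical invocation of process_files; all directory and file names below are illustrative only:

# Hypothetical example call -- paths and filenames are made up.
if __name__ == "__main__":
    process_files(
        in_chmdir="data/chm",
        in_chmfiles=["tile1_chm.tif", "tile2_chm.tif"],
        in_dtmdir="data/dtm",
        in_dtmfiles=["tile1_dtm.tif", "tile2_dtm.tif"],
        out_dir="data/out",
        out_layer="gliht_chm_dtm_points",
    )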
def main(tile, input_pt_feather):

    # Data directories
    source_dir = '/Users/arbailey/natcap/idb/data/source/'
    data_dir = '/Users/arbailey/natcap/idb/data/work/mangroves'
    work_dir = os.path.join(data_dir, 'yucatan')
    pt_data_source = os.path.join(work_dir, input_pt_feather)
    out_feather_path = os.path.join(work_dir, "gliht_srtm_{}.feather".format(tile))

    #--- Load the G-LiHT points
    print("Loading data from: {}".format(pt_data_source))
    start_time = time.time()
    gliht_pts = from_geofeather(os.path.join(work_dir, pt_data_source))
    print("Load time for {0}: {1}".format(pt_data_source, time_elapsed(start_time)))
    print(gliht_pts.dtypes)
    print(gliht_pts)

    #--- SRTM elevation data
    srtm_source = os.path.join(source_dir, 'srtm/nasa',
                               ".".join((tile, 'SRTMGL1', 'hgt', 'zip')))

    # Clip the points to SRTM raster extent (1 degree tile)
    # gliht_pts_clip = clip_pts_with_raster(gliht_pts[1:100], srtm_source)  # subset for testing
    gliht_pts_clip = clip_pts_with_raster(gliht_pts, srtm_source)

    # Sample the SRTM raster
    gliht_pts_clip = sample_raster(gliht_pts_clip, srtm_source, 'srtm_m')
    print(gliht_pts_clip.dtypes)
    print(gliht_pts_clip)

    # Create unique index value for SRTM raster
    srtm_unique_source = os.path.join(
        work_dir, "{}_{}_{}.{}".format(tile, 'srtm', 'uniqueid', 'tif'))
    make_unique_raster(srtm_source, srtm_unique_source)

    # Sample unique ID SRTM raster
    gliht_pts_clip = sample_raster(gliht_pts_clip, srtm_unique_source, 'srtm_idx')
    gliht_pts_clip.reset_index(inplace=True)
    print(gliht_pts_clip.dtypes)
    print(gliht_pts_clip)

    # Add columns to show the tile and unique index plus tile
    gliht_pts_clip['tile'] = tile
    gliht_pts_clip['tile_srtmidx'] = (
        gliht_pts_clip['tile'] + '_' + gliht_pts_clip['srtm_idx'].astype(str))
    print(gliht_pts_clip.dtypes)
    print(gliht_pts_clip)

    # Export to Feather format
    print("Exporting to Geofeather format")
    start_time = time.time()
    to_geofeather(gliht_pts_clip, out_feather_path)
    print("Export execution time for {0}: {1}".format(
        out_feather_path, time_elapsed(start_time)))
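clip_pts_with_raster and sample_raster are project helpers that are not shown in these snippets; a minimal sketch of the idea, assuming the points are already in the raster's CRS:

# Hedged sketch only -- not the project's actual implementation.
import numpy as np
import rasterio


def clip_pts_with_raster(pts, raster_source):
    """Keep only the points that fall inside the raster's bounding box."""
    with rasterio.open(raster_source) as src:
        left, bottom, right, top = src.bounds
    # .cx does a bounding-box selection on a GeoDataFrame
    return pts.cx[left:right, bottom:top].copy()


def sample_raster(pts, raster_source, column):
    """Sample the raster at each point and store the band-1 values in `column`."""
    with rasterio.open(raster_source) as src:
        coords = zip(pts.geometry.x, pts.geometry.y)
        pts[column] = np.array([v[0] for v in src.sample(coords)])
    return pts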
def test_polygons_geofeather(tmpdir, polygons_wgs84):
    """Confirm that we can round-trip polygons to / from feather file"""

    filename = tmpdir / "polygons_wgs84.feather"
    to_geofeather(polygons_wgs84, filename)

    assert os.path.exists(filename)

    df = from_geofeather(filename)
    assert_frame_equal(df, polygons_wgs84)
    assert df.crs == polygons_wgs84.crs
def test_points_geofeather_proj4(tmpdir, points_albers_conus_proj4):
    """Confirm that we can round-trip points to / from feather file with a proj4-defined CRS"""

    filename = tmpdir / "points_albers_conus.feather"
    to_geofeather(points_albers_conus_proj4, filename)

    df = from_geofeather(filename)
    assert_frame_equal(df, points_albers_conus_proj4)

    # equality comparison fails for a CRS object constructed from proj4,
    # even though the two CRS are still the same
    if hasattr(df.crs, "to_proj4"):
        assert df.crs.to_proj4() == points_albers_conus_proj4.crs.to_proj4()
    else:
        assert df.crs == points_albers_conus_proj4.crs
def test_missing_crs_warning(tmpdir, points_wgs84):
    """Confirm that a warning is raised if the crs file is missing"""

    filename = tmpdir / "points_wgs84.feather"
    to_geofeather(points_wgs84, filename)

    os.remove("{}.crs".format(filename))

    with pytest.warns(UserWarning) as warning:
        df = from_geofeather(filename)
        assert ("coordinate reference system file is missing"
                in warning[0].message.args[0])
        assert df.crs is None
def save_barriers(out_dir, barriers):
    """Save consolidated barriers to disk for QA.

    Parameters
    ----------
    out_dir : str
    barriers : GeoDataFrame
    """

    print("Serializing {:,} barriers...".format(len(barriers)))
    start = time()

    tmp = barriers.reset_index(drop=True)
    to_geofeather(tmp, out_dir / "barriers.feather")
    to_shp(tmp, out_dir / "barriers.shp")

    print("Done serializing barriers in {:.2f}s".format(time() - start))
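to_shp is a small project helper rather than part of geofeather; it presumably just wraps GeoDataFrame.to_file, roughly:

# Hedged sketch only -- assumes to_shp is a thin wrapper around to_file.
def to_shp(df, path):
    """Write a GeoDataFrame to an ESRI Shapefile (field names are truncated
    to 10 characters by the shapefile format)."""
    df.to_file(str(path), driver="ESRI Shapefile")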
def main(raster_source, uniqueid_file, work_dir, input_pt_feather, out_feather):

    pt_data_source = os.path.join(work_dir, input_pt_feather)
    out_feather_path = os.path.join(work_dir, out_feather)

    #--- Load the points
    print("Loading data from: {}".format(pt_data_source))
    start_time = time.time()
    in_pts = from_geofeather(os.path.join(work_dir, pt_data_source))
    print("Load time for {0}: {1}".format(pt_data_source, time_elapsed(start_time)))
    print(in_pts.dtypes)
    print(in_pts)

    # Clip the points to raster extent
    # in_pts_clip = clip_pts_with_raster(in_pts[1:100], raster_source)  # subset for testing
    in_pts_clip = clip_pts_with_raster(in_pts, raster_source)

    # Sample the raster
    in_pts_clip = sample_raster(in_pts_clip, raster_source, 'tncdep_m')
    print(in_pts_clip.dtypes)
    print(in_pts_clip)

    # Create unique index value for the raster
    raster_unique_source = os.path.join(work_dir, uniqueid_file)
    make_unique_raster(raster_source, raster_unique_source)

    # Sample the unique ID raster
    in_pts_clip = sample_raster(in_pts_clip, raster_unique_source, 'tncdep_idx')
    in_pts_clip.reset_index(inplace=True)
    print(in_pts_clip.dtypes)
    print(in_pts_clip)

    # Export to Feather format
    print("Exporting to Geofeather format")
    start_time = time.time()
    to_geofeather(in_pts_clip, out_feather_path)
    print("Export execution time for {0}: {1}".format(
        out_feather_path, time_elapsed(start_time)))
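make_unique_raster is likewise not defined in these snippets; a sketch of one way it could work, assuming a GeoTIFF output holding sequential cell IDs:

# Hedged sketch only -- make_unique_raster is not shown in these snippets.
import numpy as np
import rasterio


def make_unique_raster(src_path, dst_path):
    """Write a companion raster whose cells hold sequential unique IDs so that
    sampled points can be traced back to the source cell they fell in."""
    with rasterio.open(src_path) as src:
        profile = src.profile.copy()
        ids = np.arange(1, src.height * src.width + 1, dtype="uint32").reshape(
            src.height, src.width)
    profile.update(driver="GTiff", dtype="uint32", count=1, nodata=None)
    with rasterio.open(dst_path, "w", **profile) as dst:
        dst.write(ids, 1)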
out_dir = boundaries_dir

bnd = gp.read_file(boundaries_dir / "SARP_boundary_prj.shp")
bnd.sindex

### Process watershed boundaries
### HUC4s that overlap with the SARP region form the outer boundary for analysis
huc4 = gp.read_file(boundaries_dir / "HUC4_prj.shp")
huc4.sindex

### Watersheds
### HUC6s - used for basin names
df = gp.read_file(intermediate_dir / "HUC6_prj.shp")[["geometry", "HUC6", "NAME"]]
df.sindex
to_geofeather(df, out_dir / "HUC6.feather")

# Select out those within the SARP boundary
in_sarp = gp.sjoin(df, bnd)
df = df.loc[df.HUC6.isin(in_sarp.HUC6)]
to_shp(
    df.reset_index().rename(columns={
        "HUC6": "id",
        "NAME": "name"
    }),
    boundaries_dir / "HUC6_prj.shp",
)

### HUC12s - primary for all spatial joins (other codes can be derived from HUC12)
df = gp.read_file(intermediate_dir / "HUC12_prj.shp")[["geometry", "HUC12", "NAME"]]
# select the affected networks
idx = network.networkID.isin(cross_region.upstream_network)
cut = network.loc[idx].copy()

if cut_networks is None:
    cut_networks = cut
else:
    cut_networks = cut_networks.append(cut, ignore_index=True, sort=False)

# write the updated network back out
network = network.loc[~idx].copy()
print("Serializing updated network...")
to_geofeather(network.reset_index(drop=True), out_dir / "network.feather")
to_shp(network, out_dir / "network.shp")

### Update new networkID and stats into cut networks
cut_networks = (cut_networks[["geometry", "networkID"]]
                .set_index("networkID")
                .join(cross_region.set_index("upstream_network").downstream_network)
                .reset_index()
                .rename(columns={"index": "networkID"}))
cut_networks.networkID = cut_networks.downstream_network
cut_networks = (cut_networks.drop(columns=["downstream_network"])
                .set_index("networkID")
                .join(network_stats))

### Read in downstream networks, remove the original networks that are merged above, and append in merged ones
for region in cross_region.region.unique():
    print(
new_upstreams = (cross_huc_joins.join(
    joins.set_index("downstream").downstream_id.rename("new_upstream"),
    on="upstream",
).new_upstream.fillna(0).astype("uint32"))
joins.loc[new_upstreams.index, "upstream_id"] = new_upstreams

# update new internal joins
joins.loc[(joins.type == "huc_in") & (joins.upstream_id != 0), "type"] = "internal"

# remove the duplicate downstreams that used to be terminals for their respective HUCs
joins = joins.loc[~(joins.upstream.isin(cross_huc_joins.upstream)
                    & (joins.type == "terminal"))]

# remove dead ends
joins = joins.loc[~((joins.downstream == 0) & (joins.upstream == 0))].copy()

print("serializing {:,} flowlines to feather".format(len(flowlines)))
to_geofeather(flowlines, region_dir / "flowlines.feather")
serialize_df(joins, region_dir / "flowline_joins.feather", index=False)

print("serializing {:,} waterbodies to feather".format(len(waterbodies)))
to_geofeather(waterbodies, region_dir / "waterbodies.feather")
serialize_df(wb_joins, region_dir / "waterbody_flowline_joins.feather", index=False)

print("Region done in {:.0f}s".format(time() - region_start))

print("Done in {:.2f}s\n============================".format(time() - start))
"FULLNAME": "Road", "GNIS_NAME": "Stream", "RDXID": "SARPID" }) df.SARPID = df.SARPID.astype("uint") df["id"] = df.index.astype("uint") # Cleanup fields df.Stream = df.Stream.str.strip().fillna("") df.Road = df.Road.str.strip().fillna("") df.loc[(df.Stream.str.strip().str.len() > 0) & (df.Road.str.strip().str.len() > 0), "Name"] = (df.Stream + " / " + df.Road) df.Name = df.Name.fillna("") ### Spatial joins to boundary layers # NOTE: these are used for summary stats, but not used in most of the rest of the stack df = add_spatial_joins(df) ### Spatial joins to protected lands and priority watersheds df = add_protectedland_priorities(df) print("Adding lat / lon fields") df = add_lat_lon(df) to_geofeather(df.reset_index(drop=True), out_dir / "road_crossings.feather") print("Done in {:.2f}".format(time() - start))
### Sanity check
if df.groupby(level=0).size().max() > 1:
    raise ValueError(
        "Error - there are duplicate barriers in the results. Check uniqueness of IDs and joins."
    )

### Output results
print("Writing to output files...")

# Full results for SARP
print("Saving full results to feather")
to_geofeather(df.reset_index(), qa_dir / "small_barriers_network_results.feather")

# drop geometry, not needed from here on out
df = df.drop(columns=["geometry"])

print("Saving full results to CSV")
df.to_csv(
    qa_dir / "small_barriers_network_results.csv",
    index_label="id",
    quoting=csv.QUOTE_NONNUMERIC,
)

# Drop any fields we don't need for API or tippecanoe
# save for API
serialize_df(df[SB_API_FIELDS].reset_index(), api_dir / "small_barriers.feather")
    PRECIP,
    POP_DEN,
)


def convert_to_df(file_name):
    with ZipFile(file_name, 'r') as zipObj:
        listOfFileNames = zipObj.namelist()
        strip_end = re.search("(.*)_shp.zip", file_name)
        raw_name = strip_end.group(1)
        for fileName in listOfFileNames:
            if "_1" in fileName:
                zipObj.extract(fileName, 'temp_shp')
    read_fname = pathlib.Path("temp_shp/" + raw_name + "_1.shp")
    if read_fname.exists():
        shp = gpd.read_file(read_fname)
        return shp


def concat_function(file_zip):
    for variable_name in tqdm(file_zip):
        shp = convert_to_df(variable_name)
        appended_data.append(shp)


if __name__ == "__main__":
    dem = Helper.read_files_ini_dir(PROCESSED_DATA_SOURCE + SHAPE)
    var_name = set(dem)
    var_name = [item for item in var_name if item.endswith('.zip')]
    appended_data = []
    concat_function(var_name)
    shp_concat = pd.concat(appended_data).reset_index(drop=True)
    to_geofeather(shp_concat, PROCESSED_DATA_SOURCES + 'Shape_Joined.feather')
def main(tile, input_pt_feather):

    # Data directories
    data_dir = '/Users/arbailey/natcap/idb/data/work/mangroves'
    work_dir = os.path.join(data_dir, 'yucatan')
    pt_data_source = os.path.join(work_dir, input_pt_feather)
    out_feather_path = os.path.join(work_dir, "gliht_srtm_mangroves_{}.feather".format(tile))

    # --- Mangrove canopy height rasters
    hmax_source = os.path.join(data_dir, 'gmc_hmax95_bahamas_MAR.tif')
    hba_source = os.path.join(data_dir, 'gmc_hba95_bahamas_MAR.tif')

    #--- Load the G-LiHT/SRTM points
    print("Loading data from: {}".format(pt_data_source))
    start_time = time.time()
    gliht_pts = from_geofeather(os.path.join(work_dir, pt_data_source))
    print("Load time for {0}: {1}".format(pt_data_source, time_elapsed(start_time)))
    gliht_pts.drop(columns=['index'], inplace=True)
    print(gliht_pts.dtypes)
    print(gliht_pts)

    # Sample the canopy height rasters
    # Max height - hmax95
    # gliht_pts = sample_raster(gliht_pts[0:100], hmax_source, 'hmax95')  # subset for testing
    gliht_pts = sample_raster(gliht_pts, hmax_source, 'hmax95')
    print(gliht_pts.dtypes)
    print(gliht_pts)

    # Weighted average height - hba95
    gliht_pts = sample_raster(gliht_pts, hba_source, 'hba95')
    print(gliht_pts.dtypes)
    print(gliht_pts)

    # Create unique index value for canopy raster
    gmc_unique_source = os.path.join(work_dir, "gmc_uniqueid.tif")
    # make_unique_raster(hmax_source, gmc_unique_source)  # Takes 1:55:14.79

    # Sample unique ID raster
    gliht_pts = sample_raster(gliht_pts, gmc_unique_source, 'hmax_idx')
    # gliht_pts.reset_index(inplace=True)
    # gliht_pts.drop(columns=['index'], inplace=True)
    print(gliht_pts.dtypes)
    print(gliht_pts)

    # Add columns to show the tile and unique index plus tile
    gliht_pts['tile'] = tile
    gliht_pts['tile_hmaxidx'] = gliht_pts['tile'] + '_' + gliht_pts['hmax_idx'].astype(str)
    print(gliht_pts.dtypes)
    print(gliht_pts)

    # Mangrove extent vector shapefile paths to join to points
    #-- World Atlas of Mangroves
    wam_path = os.path.join(data_dir, 'wam_Bahamas_MAR.shp')
    wam_att = 'wam'
    wam = mangrove_poly_to_gdf(wam_path, wam_att)
    print(wam)
    gliht_pts = mangrove_join(gliht_pts, wam)
    print(gliht_pts)

    #-- Global Mangrove Watch
    gmw2016_path = os.path.join(data_dir, 'gmw2016_Bahamas_MAR.shp')
    gmw2016_att = 'gmw2016'
    gmw2016 = mangrove_poly_to_gdf(gmw2016_path, gmw2016_att)
    print(gmw2016)
    gliht_pts = mangrove_join(gliht_pts, gmw2016)
    print(gliht_pts)

    #-- Global Mangrove Forests
    gmf_path = os.path.join(data_dir, 'gmf_bahamas_MAR.shp')
    gmf_att = 'gmf'
    gmf = mangrove_poly_to_gdf(gmf_path, gmf_att)
    print(gmf)
    gliht_pts = mangrove_join(gliht_pts, gmf)
    print(gliht_pts)

    #-- NatCap mangrove compilation for MAR region (Mex, Belize, Guatemala, Honduras)
    ncmar_path = os.path.join(data_dir, 'natcap_mangrovesV4_MAR.shp')
    ncmar_att = 'ncMAR'
    ncmar = mangrove_poly_to_gdf(ncmar_path, ncmar_att)
    print(ncmar)
    gliht_pts = mangrove_join(gliht_pts, ncmar)
    print(gliht_pts)

    print(gliht_pts.dtypes)
    print(gliht_pts.describe())

    # Export to GeoFeather format
    # Reset the index first, otherwise feather export raises:
    # ValueError: feather does not support serializing a non-default index for the index;
    # you can .reset_index() to make the index into column(s)
    gliht_pts.reset_index(inplace=True)
    print(gliht_pts.dtypes)
    print("Exporting to Geofeather format")
    start_time = time.time()
    to_geofeather(gliht_pts, out_feather_path)
    print("Export execution time for {0}: {1}".format(out_feather_path, time_elapsed(start_time)))
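The mangrove_poly_to_gdf and mangrove_join helpers are not included in these snippets. A minimal sketch of what they might do, assuming a left intersects join that flags each point with a per-source presence attribute; names and behavior here are assumptions:

# Hedged sketch only -- not the project's actual implementation.
import geopandas as gpd


def mangrove_poly_to_gdf(shp_path, attribute):
    """Load a mangrove-extent polygon shapefile and keep only the geometry,
    tagged with a presence flag named after the data source."""
    polys = gpd.read_file(shp_path)[["geometry"]]
    polys[attribute] = 1
    return polys


def mangrove_join(points, mangrove_polys):
    """Left spatial join: flag each point with the mangrove attribute of the
    polygon it falls in (NaN where the point is outside the mangrove extent)."""
    joined = gpd.sjoin(points, mangrove_polys, how="left")
    return joined.drop(columns=["index_right"])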
# convert to LineString from MultiLineString
idx = df.loc[df.geometry.type == "MultiLineString"].index
df.loc[idx, "geometry"] = df.loc[idx].geometry.apply(lambda g: g[0])

df.geometry = df.geometry.apply(to2D)
df = df.to_crs(CRS)
df.FType = df.FType.astype("uint16")
df.FCode = df.FCode.astype("uint16")
df["HUC2"] = HUC2

if merged is None:
    merged = df
else:
    merged = merged.append(df, ignore_index=True, sort=False)

print("Extracted {:,} NHD lines".format(len(merged)))

df = merged.reset_index(drop=True)

# add our own ID
df["id"] = df.index.values.copy()
df.id = (df.id + 1).astype("uint32")

print("Serializing {:,} lines...".format(len(df)))
to_geofeather(df, out_dir / "nhd_lines.feather")
to_shp(df, out_dir / "nhd_lines.shp")

print("Done in {:.2f}s\n============================".format(time() - start))
df = merged

print("Projecting dams...")
# Drop dams without locations and project
df = df.loc[df.geometry.notnull()].copy().to_crs(CRS)

print("Merged {:,} dams in SARP states".format(len(df)))

missing_sarpid = df.loc[df.SARPID.isnull()]
if len(missing_sarpid):
    print(
        "--------------------------\nWARNING: {:,} dams are missing SARPID\n----------------------------"
        .format(len(missing_sarpid)))

to_geofeather(df, out_dir / "sarp_dams.feather")

### Download manually snapped dams
download_start = time()
print("---- Downloading Snapped Dams ----")
df = download_fs(
    SNAPPED_URL,
    fields=["SARPID", "ManualReview"],
    token=token,
    target_wkid=TARGET_WKID,
)

print("Projecting manually snapped dams...")
df = df.loc[df.geometry.notnull()].to_crs(CRS)

print("Downloaded {:,} snapped dams in {:.2f}s".format(len(df),
### Calculate tiers for the region and by state
df = calculate_tiers(df, prefix="SE")
df = calculate_tiers(df, group_field="State", prefix="State")

### Sanity check
if df.groupby(level=0).size().max() > 1:
    raise ValueError(
        "Error - there are duplicate barriers in the results. Check uniqueness of IDs and joins."
    )

### Output results
print("Writing to output files...")

# Full results for SARP
print("Saving full results to feather")
to_geofeather(df.reset_index(), qa_dir / "dams_network_results.feather")

# drop geometry, not needed from here on out
df = df.drop(columns=["geometry"])

print("Saving full results to CSV")
df.to_csv(qa_dir / "dams_network_results.csv",
          index_label="id",
          quoting=csv.QUOTE_NONNUMERIC)

# save for API
serialize_df(df[DAM_API_FIELDS].reset_index(), api_dir / "dams.feather")

# Drop fields that can be calculated on frontend
keep_fields = [
    c for c in DAM_API_FIELDS if not c in {"GainMiles", "TotalNetworkMiles"}
df = gp.read_file(gdb, layer="NHDPoint") df.NHDPlusID = df.NHDPlusID.astype("uint64") df = df.loc[df.FType.isin(KEEP_FTYPES)][KEEP_COLS].copy() df.geometry = df.geometry.apply(to2D) df = df.to_crs(CRS) df.FType = df.FType.astype("uint16") df.FCode = df.FCode.astype("uint16") df["HUC2"] = HUC2 if merged is None: merged = df else: merged = merged.append(df, ignore_index=True, sort=False) print("Extracted {:,} NHD Points".format(len(merged))) df = merged.reset_index(drop=True) # add our own ID, df["id"] = df.index.values.copy() df.id = (df.id + 1).astype("uint32") print("Serializing {:,} points...".format(len(df))) to_geofeather(df, out_dir / "nhd_points.feather") to_shp(df, out_dir / "nhd_points.shp") print("Done in {:.2f}s\n============================".format(time() - start))
nhd_dir / "clean" / region / "waterbodies.feather" for region in REGION_GROUPS ], src=[region for region in REGION_GROUPS], ) .rename(columns={"src": "region"}) .reset_index(drop=True) ) print("Read {:,} waterbodies".format(len(wb))) # TEMP: can remove on next full run of prepare_flowlines_waterbodies.feather wb.wbID = wb.wbID.astype("uint32") print("Serializing waterbodies...") to_geofeather(wb, out_dir / "waterbodies.feather") print("Reading waterbody drain points...") drains = deserialize_gdfs( [ nhd_dir / "clean" / region / "waterbody_drain_points.feather" for region in REGION_GROUPS ], src=[region for region in REGION_GROUPS], ).reset_index(drop=True) print("Read {:,} waterbody drain points".format(len(drains))) ### Deduplicate and assign to the next segment downstream where there are multiple segments intersecting joins = deserialize_dfs(
df = gp.read_file(gdb, layer="NHDArea") df.NHDPlusID = df.NHDPlusID.astype("uint64") df = df.loc[df.FType.isin(KEEP_FTYPES)][KEEP_COLS].copy() df.geometry = df.geometry.apply(to2D) df = df.to_crs(CRS) df.FType = df.FType.astype("uint16") df.FCode = df.FCode.astype("uint16") df["HUC2"] = HUC2 if merged is None: merged = df else: merged = merged.append(df, ignore_index=True, sort=False) print("Extracted {:,} NHD areas".format(len(merged))) df = merged.reset_index(drop=True) # add our own ID, df["id"] = df.index.values.copy() df.id = (df.id + 1).astype("uint32") print("Serializing {:,} areas...".format(len(df))) to_geofeather(df, out_dir / "nhd_areas.feather") to_shp(df, out_dir / "nhd_areas.shp") print("Done in {:.2f}s\n============================".format(time() - start))