def test_get_parts(geom):
    expected_num_parts = pygeos.get_num_geometries(geom)
    expected_parts = pygeos.get_geometry(geom, range(0, expected_num_parts))

    parts = pygeos.get_parts(geom)
    assert len(parts) == expected_num_parts
    assert np.all(pygeos.equals_exact(parts, expected_parts))
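# A minimal standalone sketch of the equivalence exercised by the test above:
# pygeos.get_parts() returns the same parts that indexed pygeos.get_geometry()
# calls would (the geometry here is illustrative, not from the test fixtures).
import numpy as np
import pygeos

mp = pygeos.multipoints([(0, 0), (1, 1), (2, 2)])
parts = pygeos.get_parts(mp)
indexed = pygeos.get_geometry(mp, range(pygeos.get_num_geometries(mp)))
assert len(parts) == 3
assert np.all(pygeos.equals_exact(parts, indexed))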
def close_gaps(df, tolerance):
    """Close gaps in LineString geometry where it should be contiguous.

    Snaps both lines to a centroid of a gap in between.
    """
    geom = df.geometry.values.data
    coords = pygeos.get_coordinates(geom)
    indices = pygeos.get_num_coordinates(geom)

    # generate a list of start and end coordinates and create point geometries
    edges = [0]
    i = 0
    for ind in indices:
        ix = i + ind
        edges.append(ix - 1)
        edges.append(ix)
        i = ix
    edges = edges[:-1]
    points = pygeos.points(np.unique(coords[edges], axis=0))

    buffered = pygeos.buffer(points, tolerance)
    dissolved = pygeos.union_all(buffered)
    exploded = [
        pygeos.get_geometry(dissolved, i)
        for i in range(pygeos.get_num_geometries(dissolved))
    ]
    centroids = pygeos.centroid(exploded)

    snapped = pygeos.snap(geom, pygeos.union_all(centroids), tolerance)
    return snapped
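# A standalone sketch of the endpoint-index trick used in close_gaps() above:
# running offsets built from pygeos.get_num_coordinates() pick out the first
# and last coordinate of each line in the flat array returned by
# pygeos.get_coordinates() (geometries here are illustrative).
import numpy as np
import pygeos

geom = np.array(
    [
        pygeos.linestrings([(0, 0), (1, 0), (2, 0)]),
        pygeos.linestrings([(5, 5), (6, 6)]),
    ]
)
coords = pygeos.get_coordinates(geom)  # 5 rows of (x, y)
counts = pygeos.get_num_coordinates(geom)  # [3, 2]

edges = [0]
i = 0
for n in counts:
    ix = i + n
    edges.extend([ix - 1, ix])
    i = ix
edges = edges[:-1]  # drop the final out-of-range offset

endpoints = coords[edges]  # rows 0, 2, 3, 4: the start / end of each line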
def explode(df):
    """Explodes multipart geometries to single parts.

    Attributes are copied to each individual geometry.

    NOTE: the faster method is not yet supported in released pygeos; see
    https://github.com/pygeos/pygeos/pull/130
    That branch must be checked out and built for this functionality.

    Parameters
    ----------
    df : GeoDataFrame

    Returns
    -------
    GeoDataFrame
    """
    # Fast method:
    # ix, parts = pg.get_parts(df.geometry.values.data)
    # series = pd.Series(parts, index=df.index[ix], name="geometry")
    # return df.drop(columns=["geometry"]).join(series)

    # Slower method
    geometries = df.geometry.values.data
    ix = []
    parts = []
    for i in range(len(df)):
        num_parts = pg.get_num_geometries(geometries[i])
        ix.extend(np.repeat(df.index[i], num_parts))
        parts.extend(pg.get_geometry(geometries[i], range(num_parts)))

    return gp.GeoDataFrame({"geometry": parts}, index=ix, crs=df.crs).join(
        df.drop(columns=["geometry"])
    )
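# Hypothetical usage of explode() above, assuming geopandas is configured to
# use the pygeos backend (geopandas.options.use_pygeos = True) so that
# df.geometry.values.data yields a pygeos geometry array, and geopandas >= 0.9
# for GeoSeries.from_wkt; the data here is illustrative.
import geopandas as gp

df = gp.GeoDataFrame(
    {"name": ["a"]},
    geometry=gp.GeoSeries.from_wkt(["MULTIPOINT (0 0, 1 1)"]),
)
single = explode(df)  # two rows, both with index 0 and name "a"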
def time_get_parts_python(self):
    """Python / ufuncs version of get_parts"""
    parts = []
    for i in range(len(self.multipolygons)):
        num_parts = pygeos.get_num_geometries(self.multipolygons[i])
        parts.append(pygeos.get_geometry(self.multipolygons[i], range(num_parts)))
    parts = np.concatenate(parts)
def test_get_parts_array():
    # note: this also verifies that None is handled correctly
    # in the mix; internally it returns -1 for count of geometries
    geom = np.array([None, empty_line_string, multi_point, point, multi_polygon])

    expected_parts = []
    for g in geom:
        for i in range(0, pygeos.get_num_geometries(g)):
            expected_parts.append(pygeos.get_geometry(g, i))

    parts = pygeos.get_parts(geom)

    assert len(parts) == len(expected_parts)
    assert np.all(pygeos.equals_exact(parts, expected_parts))
def test_get_parts_geometry_collection_multi():
    """On the first pass, the individual Multi* geometry objects are returned
    from the collection.  On the second pass, the individual singular geometry
    objects within those are returned.
    """
    geom = pygeos.geometrycollections([multi_point, multi_line_string, multi_polygon])
    expected_num_parts = pygeos.get_num_geometries(geom)
    expected_parts = pygeos.get_geometry(geom, range(0, expected_num_parts))

    parts = pygeos.get_parts(geom)
    assert len(parts) == expected_num_parts
    assert np.all(pygeos.equals_exact(parts, expected_parts))

    expected_subparts = []
    for g in np.asarray(expected_parts):
        for i in range(0, pygeos.get_num_geometries(g)):
            expected_subparts.append(pygeos.get_geometry(g, i))

    subparts = pygeos.get_parts(parts)
    assert len(subparts) == len(expected_subparts)
    assert np.all(pygeos.equals_exact(subparts, expected_subparts))
def test_get_parts_return_index():
    geom = np.array([multi_point, point, multi_polygon])
    expected_parts = []
    expected_index = []
    for i, g in enumerate(geom):
        for j in range(0, pygeos.get_num_geometries(g)):
            expected_parts.append(pygeos.get_geometry(g, j))
            expected_index.append(i)

    parts, index = pygeos.get_parts(geom, return_index=True)
    assert len(parts) == len(expected_parts)
    assert np.all(pygeos.equals_exact(parts, expected_parts))
    assert np.array_equal(index, expected_index)
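# A standalone illustration of return_index: parts come back flattened, and
# the second array maps each part to the index of its source geometry in the
# input (requires a pygeos version that supports return_index; geometries
# here are illustrative).
import numpy as np
import pygeos

geoms = np.array([pygeos.multipoints([(0, 0), (1, 1)]), pygeos.points(5, 5)])
parts, index = pygeos.get_parts(geoms, return_index=True)
# parts -> [POINT (0 0), POINT (1 1), POINT (5 5)]
assert np.array_equal(index, [0, 0, 1])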
def close_gaps(gdf, tolerance):
    """Close gaps in LineString geometry where it should be contiguous.

    Snaps both lines to a centroid of a gap in between.

    Parameters
    ----------
    gdf : GeoDataFrame, GeoSeries
        GeoDataFrame or GeoSeries containing LineString representation of a network.
    tolerance : float
        nodes within a tolerance will be snapped together

    Returns
    -------
    GeoSeries

    See also
    --------
    momepy.extend_lines
    momepy.remove_false_nodes
    """
    geom = gdf.geometry.values.data
    coords = pygeos.get_coordinates(geom)
    indices = pygeos.get_num_coordinates(geom)

    # generate a list of start and end coordinates and create point geometries
    edges = [0]
    i = 0
    for ind in indices:
        ix = i + ind
        edges.append(ix - 1)
        edges.append(ix)
        i = ix
    edges = edges[:-1]
    points = pygeos.points(np.unique(coords[edges], axis=0))

    buffered = pygeos.buffer(points, tolerance / 2)
    dissolved = pygeos.union_all(buffered)
    exploded = [
        pygeos.get_geometry(dissolved, i)
        for i in range(pygeos.get_num_geometries(dissolved))
    ]
    centroids = pygeos.centroid(exploded)

    snapped = pygeos.snap(geom, pygeos.union_all(centroids), tolerance)
    return gpd.GeoSeries(snapped, crs=gdf.crs)
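# Hypothetical usage of close_gaps() above on a two-line network with a small
# gap; assumes a pygeos-backed geopandas (geopandas.options.use_pygeos = True)
# so that .values.data is available.
import geopandas as gpd

lines = gpd.GeoSeries.from_wkt(
    ["LINESTRING (0 0, 1 0)", "LINESTRING (1.1 0, 2 0)"]
)
closed = close_gaps(gpd.GeoDataFrame(geometry=lines), tolerance=0.2)
# both gap endpoints are snapped to the centroid of the gap at (1.05, 0)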
def explode(series):
    """Convert multipart geometries to a list of geometries

    Parameters
    ----------
    series : Series

    Returns
    -------
    Series
    """
    return series.apply(
        lambda g: [pg.get_geometry(g, i) for i in range(0, pg.get_num_geometries(g))]
    )
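# Hypothetical usage of the Series-based explode() above (data illustrative);
# note that it returns a Series of Python lists, not a flattened Series.
import pandas as pd
import pygeos as pg

s = pd.Series([pg.Geometry("MULTIPOINT (0 0, 1 1)")])
exploded = explode(s)  # [[<POINT (0 0)>, <POINT (1 1)>]]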
def extract_waterbodies(gdb_path, target_crs):
    """Extract waterbodies from NHDPlusHR data product that are not one of
    the excluded types (e.g., estuary, playa, swamp/marsh).

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs : GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.

    Returns
    -------
    GeoDataFrame
    """
    print("Reading waterbodies")
    df = read_dataframe(
        gdb_path,
        layer="NHDWaterbody",
        columns=[WATERBODY_COLS],
        force_2d=True,
        where=f"FType not in {tuple(WATERBODY_EXCLUDE_FTYPES)}",
    )
    print("Read {:,} waterbodies".format(len(df)))

    # Convert multipolygons to polygons;
    # those we checked that are true multipolygons are errors
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)
    df.geometry = make_valid(df.geometry.values.data)

    print("projecting to target projection")
    df = df.to_crs(target_crs)

    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df.AreaSqKm = df.AreaSqKm.astype("float32")
    df.FType = df.FType.astype("uint16")

    ### Add calculated fields
    df["wbID"] = df.index.values.astype("uint32") + 1

    return df
def to_dict(geometry):
    """Convert pygeos Geometry object to a dictionary representation.
    Equivalent to structure of GeoJSON.

    Parameters
    ----------
    geometry : pygeos Geometry object (singular)

    Returns
    -------
    dict
        GeoJSON dict representation of geometry
    """
    geometry = pg.normalize(geometry)

    def get_ring_coords(polygon):
        # outer ring must be reversed to be counterclockwise [::-1]
        coords = [pg.get_coordinates(pg.get_exterior_ring(polygon)).tolist()]
        for i in range(pg.get_num_interior_rings(polygon)):
            # inner rings must be reversed to be clockwise [::-1]
            coords.append(
                pg.get_coordinates(pg.get_interior_ring(polygon, i)).tolist()
            )
        return coords

    geom_type = GEOJSON_TYPE[pg.get_type_id(geometry)]
    coords = []

    if geom_type == "MultiPolygon":
        coords = []
        geoms = pg.get_geometry(geometry, range(pg.get_num_geometries(geometry)))
        for geom in geoms:
            coords.append(get_ring_coords(geom))

    elif geom_type == "Polygon":
        coords = get_ring_coords(geometry)

    else:
        raise NotImplementedError("Not built")

    return {"type": geom_type, "coordinates": coords}
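# Hypothetical usage of to_dict() above; GEOJSON_TYPE is assumed to be a
# mapping from pygeos type ids to GeoJSON type names (e.g. {3: "Polygon",
# 6: "MultiPolygon"}) as in the source module.
import pygeos as pg

square = pg.Geometry("POLYGON ((0 0, 1 0, 1 1, 0 1, 0 0))")
d = to_dict(square)
# {"type": "Polygon", "coordinates": [[...exterior ring coordinates...]]}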
def _clean_multi_geometries(object_array):
    """Cleanup a sequence of geometries to remove multi geometries.

    Args:
        object_array (numpy.ndarray): The object array to cleanup; geometries
            are expected in the first column, attributes in the rest.

    Returns:
        numpy.ndarray: Cleaned-up object array.
    """
    # Handle multi-geometries: split each multi-part row into one row per part
    geometries = object_array[:, 0]
    num_geometries = get_num_geometries(geometries)
    for index in np.nonzero(num_geometries > 1)[0]:
        split_geometries = [
            np.concatenate(
                ([get_geometry(geometries[index], i)], object_array[index, 1:])
            )
            for i in range(num_geometries[index])
        ]
        # first part replaces the original row; remaining parts are appended
        object_array[index] = split_geometries[0]
        object_array = np.concatenate((object_array, np.array(split_geometries[1:])))
    return object_array
def occult(lines: LineCollection, tolerance: float) -> LineCollection:
    """
    Remove occulted lines.

    The order of the geometries in 'lines' matters, see example below.

    'tolerance' controls the distance tolerance between the first and last points
    of a geometry to consider it closed.

    Examples:
        $ vpype line 0 0 5 5 rect 2 2 1 1 occult show
            # line is occulted by rect

        $ vpype rect 2 2 1 1 line 0 0 5 5 occult show
            # line is NOT occulted by rect, as the line is drawn after the rectangle.
    """
    line_arr = np.array(
        [pygeos.linestrings(list(zip(line.real, line.imag))) for line in lines]
    )
    for i, line in enumerate(line_arr):
        coords = pygeos.get_coordinates(line)
        if (
            math.hypot(coords[-1, 0] - coords[0, 0], coords[-1, 1] - coords[0, 1])
            < tolerance
        ):
            tree = pygeos.STRtree(line_arr[:i])
            p = pygeos.polygons(coords)
            geom_idx = tree.query(p, predicate="intersects")
            line_arr[geom_idx] = pygeos.set_operations.difference(line_arr[geom_idx], p)

    new_lines = LineCollection()
    for geom in line_arr:
        for i in range(pygeos.get_num_geometries(geom)):
            coords = pygeos.get_coordinates(pygeos.get_geometry(geom, i))
            new_lines.append(coords[:, 0] + coords[:, 1] * 1j)

    return new_lines
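# The core primitive of the loop above, in isolation: query an STRtree of
# earlier geometries for intersections with a closed shape, then subtract
# that shape from the hits (a sketch, not the vpype plugin API).
import numpy as np
import pygeos

line_arr = np.array([pygeos.linestrings([(0, 0), (5, 5)])])
square = pygeos.polygons([(2, 2), (3, 2), (3, 3), (2, 3), (2, 2)])

tree = pygeos.STRtree(line_arr)
hits = tree.query(square, predicate="intersects")
line_arr[hits] = pygeos.difference(line_arr[hits], square)
# line_arr[0] is now a MULTILINESTRING with the portion inside the square removed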
def test_shared_paths_linestring():
    g1 = pygeos.linestrings([(0, 0), (1, 0), (1, 1)])
    g2 = pygeos.linestrings([(0, 0), (1, 0)])
    actual1 = pygeos.shared_paths(g1, g2)
    assert pygeos.equals(pygeos.get_geometry(actual1, 0), g2)
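# For reference, a sketch of the structure shared_paths() returns: a
# GEOMETRYCOLLECTION of two MULTILINESTRINGs, the paths shared in the same
# direction followed by those shared in the opposite direction, which is why
# the test above indexes part 0 (geometries here are illustrative).
import pygeos

g1 = pygeos.linestrings([(0, 0), (1, 0), (1, 1)])
g2 = pygeos.linestrings([(1, 0), (0, 0)])  # reversed relative to g1
shared = pygeos.shared_paths(g1, g2)
backward = pygeos.get_geometry(shared, 1)  # opposite-direction shared paths
assert not pygeos.is_empty(backward)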
ix = pg.intersects(dams.geometry.values.data, last_pt)
dams.loc[ix, "pt"] = last_pt[ix]

# override with upstream-most point when both intersect
first_pt = pg.get_point(dams.flowline.values.data, 0)
ix = pg.intersects(dams.geometry.values.data, first_pt)
dams.loc[ix, "pt"] = first_pt[ix]

ix = dams.pt.isnull()
# WARNING: this might fail for odd intersection geoms; we always take the first line
# below
pt = pd.Series(
    pg.get_point(
        pg.get_geometry(
            pg.intersection(
                dams.loc[ix].geometry.values.data, dams.loc[ix].flowline.values.data
            ),
            0,
        ),
        0,
    ),
    index=dams.loc[ix].index,
).dropna()
dams.loc[pt.index, "pt"] = pt

# Few should be dropped at this point, since all should have overlapped at least by a point
errors = dams.pt.isnull()
if errors.max():
    print(
        f"{errors.sum():,} dam / flowline joins could not be represented as points and were dropped"
    )
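# A standalone sketch of the nested pattern above: intersect two geometries,
# take the first geometry of the (possibly multi-part) result, then its first
# point; get_point() returns None for non-linestring results, which is why
# the snippet above calls .dropna() (geometries here are illustrative).
import pygeos as pg

a = pg.Geometry("POLYGON ((0 0, 4 0, 4 4, 0 4, 0 0))")
b = pg.linestrings([(-1, 2), (5, 2)])
pt = pg.get_point(pg.get_geometry(pg.intersection(a, b), 0), 0)
# POINT (0 2): the first point of the first line in the intersection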
def test_get_geometry_collection(geom):
    n = pygeos.get_num_geometries(geom)
    actual = pygeos.get_geometry(geom, [0, -n, n, -(n + 1)])
    assert pygeos.equals(actual[0], actual[1]).all()
    assert pygeos.is_missing(actual[2:4]).all()
def test_get_geometry_simple(geom):
    actual = pygeos.get_geometry(geom, [0, -1, 1, -2])
    assert pygeos.equals(actual[0], actual[1]).all()
    assert pygeos.is_missing(actual[2:4]).all()
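# A standalone illustration of the indexing semantics both tests above rely
# on: get_geometry() supports negative indices, and out-of-range indices
# yield missing (None) results.
import pygeos

mp = pygeos.multipoints([(0, 0), (1, 1), (2, 2)])
first = pygeos.get_geometry(mp, 0)    # POINT (0 0)
last = pygeos.get_geometry(mp, -1)    # POINT (2 2)
missing = pygeos.get_geometry(mp, 3)  # None: index out of range
assert pygeos.is_missing(missing)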
os.makedirs(network_dir)

start = time()
print("Reading Puerto Rico networks...")
networks = pio.read_dataframe(
    gdb, layer=network_layer, as_pygeos=True, columns=[NET_COLS]
)
src_crs = networks.crs
networks = networks.rename(
    columns={"batNetID": "networkID", "StreamOrde": "streamorder"}
).set_index("networkID")

# convert to LineStrings
networks.geometry = pg.get_geometry(networks.geometry, 0)

# project to crs
networks.geometry = to_crs(networks.geometry, src_crs, CRS)

networks["length"] = pg.length(networks.geometry)
networks["miles"] = networks.length * 0.000621371

# sinuosity of each segment
networks["sinuosity"] = calculate_sinuosity(networks.geometry)

# aggregate up to the network
network_length = networks.groupby(level=0)[["length"]].sum()
temp_df = networks[["length", "sinuosity"]].join(network_length, rsuffix="_total")

# Calculate length-weighted sinuosity
def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):
    """Extract flowlines from the NHDPlusHR data product, join to the VAA
    table, extract joins between flowlines, and filter out coastlines.

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs : GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.
    extra_flowline_cols : list
        List of extra field names to extract from NHDFlowline layer

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (flowlines, joins)
    """
    ### Read in flowline data and convert to data frame
    print("Reading flowlines")
    flowline_cols = FLOWLINE_COLS + extra_flowline_cols
    df = read_dataframe(
        gdb_path, layer="NHDFlowline", force_2d=True, columns=[flowline_cols]
    )

    # Index on NHDPlusID for easy joins to other NHD data
    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df = df.set_index(["NHDPlusID"], drop=False)

    # convert MultiLineStrings to LineStrings (all have a single linestring)
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)

    print("making valid and projecting to target projection")
    df.geometry = make_valid(df.geometry.values.data)
    df = df.to_crs(target_crs)
    print(f"Read {len(df):,} flowlines")

    ### Read in VAA and convert to data frame
    # NOTE: not all records in Flowlines have corresponding records in VAA;
    # we drop those that do not since we need these fields.
    print("Reading VAA table and joining...")
    vaa_df = read_dataframe(gdb_path, layer="NHDPlusFlowlineVAA", columns=[VAA_COLS])
    vaa_df.NHDPlusID = vaa_df.NHDPlusID.astype("uint64")
    vaa_df = vaa_df.set_index(["NHDPlusID"])
    df = df.join(vaa_df, how="inner")
    print(f"{len(df):,} features after join to VAA")

    # Simplify data types for smaller files and faster IO
    df.FType = df.FType.astype("uint16")
    df.FCode = df.FCode.astype("uint16")
    df.StreamOrde = df.StreamOrde.astype("uint8")
    df.Slope = df.Slope.astype("float32")
    df.MinElevSmo = df.MinElevSmo.astype("float32")
    df.MaxElevSmo = df.MaxElevSmo.astype("float32")

    ### Read in flowline joins
    print("Reading flowline joins")
    join_df = read_dataframe(
        gdb_path,
        layer="NHDPlusFlow",
        read_geometry=False,
        columns=["FromNHDPID", "ToNHDPID"],
    ).rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
    join_df.upstream = join_df.upstream.astype("uint64")
    join_df.downstream = join_df.downstream.astype("uint64")

    ### Fix errors in NHD
    # some valid joins are marked as terminals (downstream==0) in NHD; we need
    # to backfill the missing join info.
    # To do this, we intersect all terminals back with flowlines, dropping any
    # that are themselves terminals.  Then we calculate the distance to the
    # upstream point of the intersected line, and the upstream point of the
    # next segment downstream.  We use the ID of whichever one is closer
    # (must be within 100m).
    ix = join_df.loc[join_df.downstream == 0].upstream.unique()

    # get last point, is furthest downstream
    tmp = df.loc[df.index.isin(ix), ["geometry"]].copy()
    tmp["geometry"] = pg.get_point(tmp.geometry.values.data, -1)

    target = df.loc[~df.index.isin(ix)]  # only search against other flowlines

    tree = pg.STRtree(target.geometry.values.data)
    # search within a tolerance of 0.001, these are very very close
    left, right = tree.nearest_all(tmp.geometry.values.data, max_distance=0.001)

    pairs = pd.DataFrame(
        {
            "left": tmp.index.take(left),
            "right": target.index.take(right),
            "source": tmp.geometry.values.data.take(left),
            # take upstream / downstream points of matched lines
            "upstream_target": pg.get_point(df.geometry.values.data.take(right), 0),
        }
    )

    # drop any pairs where the other side is also a terminal (these appear as
    # V shaped tiny networks that need to be left as is)
    pairs = pairs.loc[~pairs.right.isin(ix)]

    # calculate the next segment downstream (only keep the first if multiple;
    # possible logic issue)
    next_downstream = (
        join_df.loc[(join_df.upstream != 0) & (join_df.downstream != 0)]
        .groupby("upstream")
        .downstream.first()
    )
    pairs["next_downstream"] = pairs.right.map(next_downstream)
    pairs.loc[pairs.next_downstream.notnull(), "downstream_target"] = pg.get_point(
        df.loc[
            pairs.loc[pairs.next_downstream.notnull()].next_downstream
        ].geometry.values.data,
        0,
    )

    pairs["upstream_dist"] = pg.distance(pairs.source, pairs.upstream_target)
    ix = pairs.next_downstream.notnull()
    pairs.loc[ix, "downstream_dist"] = pg.distance(
        pairs.loc[ix].source, pairs.loc[ix].downstream_target
    )

    # this ignores any nan
    pairs["dist"] = pairs[["upstream_dist", "downstream_dist"]].min(axis=1)
    # discard any that are too far (>100m)
    pairs = pairs.loc[pairs.dist <= 100].copy()

    # sort by distance to upstream point of matched flowline; this allows us
    # to sort on those then dedup to calculate a new downstream ID for this
    # source line
    pairs = pairs.sort_values(by=["left", "dist"])

    # set the right value to the next downstream if it is closer
    # this also ignores na
    ix = pairs.downstream_dist < pairs.upstream_dist
    pairs.loc[ix, "right"] = pairs.loc[ix].next_downstream.astype("uint64")

    ids = pairs.groupby("left").right.first()

    if len(ids):
        # save to send to NHD
        pd.DataFrame({"NHDPlusID": ids.index.unique()}).to_csv(
            f"/tmp/{gdb_path.stem}_bad_joins.csv", index=False
        )

        ix = join_df.upstream.isin(ids.index)
        join_df.loc[ix, "downstream"] = join_df.loc[ix].upstream.map(ids)

        print(
            f"Repaired {len(ids):,} joins marked by NHD as terminals but actually "
            "joined to flowlines"
        )

    # set join types to make it easier to track
    join_df["type"] = "internal"  # set default
    # upstream-most origin points
    join_df.loc[join_df.upstream == 0, "type"] = "origin"
    # downstream-most termination points
    join_df.loc[join_df.downstream == 0, "type"] = "terminal"

    ### Filter out coastlines and update joins
    # WARNING: we tried filtering out pipelines (FType == 428).  It doesn't
    # work properly; there are many that go through dams and are thus needed
    # to calculate network connectivity and gain of removing a dam.
print("Filtering out coastlines...") coastline_idx = df.loc[df.FType == 566].index df = df.loc[~df.index.isin(coastline_idx)].copy() print(f"{len(df):,} features after removing coastlines") # remove any joins that have coastlines as upstream # these are themselves coastline segments join_df = join_df.loc[~join_df.upstream.isin(coastline_idx)].copy() # set the downstream to 0 for any that join coastlines # this will enable us to mark these as downstream terminals in # the network analysis later join_df["marine"] = join_df.downstream.isin(coastline_idx) join_df.loc[join_df.marine, "downstream"] = 0 join_df.loc[join_df.marine, "type"] = "terminal" # drop any duplicates (above operation sets some joins to upstream and downstream of 0) join_df = join_df.drop_duplicates(subset=["upstream", "downstream"]) ### Filter out underground connectors ix = df.loc[df.FType == 420].index print("Removing {:,} underground conduits".format(len(ix))) df = df.loc[~df.index.isin(ix)].copy() join_df = remove_joins( join_df, ix, downstream_col="downstream", upstream_col="upstream" ) ### Label loops for easier removal later # WARNING: loops may be very problematic from a network processing standpoint. # Include with caution. print("Identifying loops") df["loop"] = (df.StreamOrde != df.StreamCalc) | (df.FlowDir.isnull()) idx = df.loc[df.loop].index join_df["loop"] = join_df.upstream.isin(idx) | join_df.downstream.isin(idx) ### Add calculated fields # Set our internal master IDs to the original index of the file we start from # Assume that we can always fit into a uint32, which is ~400 million records # and probably bigger than anything we could ever read in df["lineID"] = df.index.values.astype("uint32") + 1 join_df = ( join_df.join(df.lineID.rename("upstream_id"), on="upstream") .join(df.lineID.rename("downstream_id"), on="downstream") .fillna(0) ) for col in ("upstream", "downstream"): join_df[col] = join_df[col].astype("uint64") for col in ("upstream_id", "downstream_id"): join_df[col] = join_df[col].astype("uint32") ### Calculate size classes print("Calculating size class") drainage = df.TotDASqKm df.loc[drainage < 10, "sizeclass"] = "1a" df.loc[(drainage >= 10) & (drainage < 100), "sizeclass"] = "1b" df.loc[(drainage >= 100) & (drainage < 518), "sizeclass"] = "2" df.loc[(drainage >= 518) & (drainage < 2590), "sizeclass"] = "3a" df.loc[(drainage >= 2590) & (drainage < 10000), "sizeclass"] = "3b" df.loc[(drainage >= 10000) & (drainage < 25000), "sizeclass"] = "4" df.loc[drainage >= 25000, "sizeclass"] = "5" # Calculate length and sinuosity print("Calculating length and sinuosity") df["length"] = df.geometry.length.astype("float32") df["sinuosity"] = calculate_sinuosity(df.geometry.values.data).astype("float32") # drop columns not useful for later processing steps df = df.drop(columns=["FlowDir", "StreamCalc"]) # calculate incoming joins (have valid upstream, but not in this HUC4) join_df.loc[(join_df.upstream != 0) & (join_df.upstream_id == 0), "type"] = "huc_in" return df, join_df
### Aggregate waterbodies that are in contact / overlapping each other
waterbodies = from_geofeather(src_dir / region / "waterbodies.feather").set_index(
    "wbID"
)
wb_joins = deserialize_df(src_dir / region / "waterbody_flowline_joins.feather")
print(
    "Read {:,} waterbodies and {:,} flowline / waterbody joins".format(
        len(waterbodies), len(wb_joins)
    )
)

# TODO: remove this on next full rerun of extract_flowlines...
waterbodies = waterbodies.drop(columns=["hash"], errors="ignore")

# Convert multipolygons to single part polygons
idx = pg.get_type_id(waterbodies.geometry) == 6
# idx = waterbodies.loc[waterbodies.geometry.type == "MultiPolygon"].index
waterbodies.loc[idx, "geometry"] = waterbodies.loc[idx].geometry.apply(
    lambda g: pg.get_geometry(g, 0)
)

# raise min size
waterbodies = waterbodies.loc[waterbodies.AreaSqKm >= WATERBODY_MIN_SIZE].copy()
wb_joins = wb_joins.loc[wb_joins.wbID.isin(waterbodies.index)].copy()
# End TODO:

# Drop any waterbodies and waterbody joins to flowlines that are no longer
# present based on above processing of flowlines
wb_joins = wb_joins.loc[wb_joins.lineID.isin(flowlines.index)].copy()
to_drop = ~waterbodies.index.isin(wb_joins.wbID)
print(
    "Dropping {:,} waterbodies that no longer intersect with the flowlines "
    "retained above".format(to_drop.sum())
)
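# For reference when reading the get_type_id() filter above, the pygeos type
# ids are: -1 missing, 0 Point, 1 LineString, 2 LinearRing, 3 Polygon,
# 4 MultiPoint, 5 MultiLineString, 6 MultiPolygon, 7 GeometryCollection.
import pygeos as pg

assert pg.get_type_id(pg.Geometry("MULTIPOLYGON EMPTY")) == 6
assert pg.get_type_id(pg.Geometry("POLYGON EMPTY")) == 3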