def add_distances(network):
    """Add a planar ``distance`` column to the edges of a network.

    Edge geometries are treated as geographic coordinates (EPSG:4326). They
    are reprojected to the UTM zone derived from the first node of the
    network and the length of every reprojected edge is stored in a new
    ``distance`` column.

    Parameters
    ----------
    network : Network
        Object exposing ``nodes`` and ``edges`` tables, each with a
        ``geometry`` column of pygeos geometries. The first node must be a
        point, since its x / y coordinates select the UTM zone.

    Returns
    -------
    Network
        New network holding the original nodes and a copy of the edges with
        the added ``distance`` column; the input edges are left untouched.
    """
    # The CRS is hard-coded: reading it from ``network.edges.crs`` does not
    # work in all cases.
    current_crs = "epsg:4326"

    # An arbitrary point (the first node) decides which UTM zone is used.
    first_node = network.nodes['geometry'].iloc[0]
    lat = pygeom.get_y(first_node)
    lon = pygeom.get_x(first_node)

    # formula below based on: https://gis.stackexchange.com/a/190209/80697
    approximate_crs = "epsg:" + str(
        int(32700 - np.round((45 + lat) / 90, 0) * 100 +
            np.round((183 + lon) / 6, 0)))

    # coordinate round-trip taken from pygeos/issues/95
    geometries = network.edges['geometry']
    coords = pygeos.get_coordinates(geometries)
    transformer = pyproj.Transformer.from_crs(current_crs,
                                              approximate_crs,
                                              always_xy=True)
    new_coords = transformer.transform(coords[:, 0], coords[:, 1])
    result = pygeos.set_coordinates(geometries.copy(),
                                    np.array(new_coords).T)

    edges = network.edges.copy()
    edges['distance'] = pygeos.length(result)
    return Network(nodes=network.nodes, edges=edges)
def _dense_point_array(self, geoms, distance, index):
    """Represent geometries as a dense array of points along their outlines.

    Parameters
    ----------
    geoms : array-like of pygeos geometries
        Lines are used as they are; for any other geometry type (decided
        from the first element) the boundary is used instead.
    distance : float
        Spacing that determines how many points are interpolated on each
        line. Lines not longer than ``distance`` are skipped.
    index : iterable
        Identifier of each geometry, aligned with ``geoms``.

    Returns
    -------
    points : numpy.ndarray
        Array of shape (n, 2) with the coordinates of interpolated points.
    ids : list
        Identifier (taken from ``index``) of the source geometry of every
        point, aligned with ``points``.
    """
    # nothing to densify; also avoids indexing geoms[0] on empty input
    if len(geoms) == 0:
        return np.empty((0, 2)), []

    # type ids 1, 2, 5 are LineString, LinearRing and MultiLineString
    if pygeos.get_type_id(geoms[0]) not in [1, 2, 5]:
        lines = pygeos.boundary(geoms)
    else:
        lines = geoms
    lengths = pygeos.length(lines)

    # collect per-line coordinate chunks and concatenate once at the end
    # instead of re-copying the growing array on every iteration
    chunks = [np.empty((0, 2))]
    ids = []
    for ix, line, length in zip(index, lines, lengths):
        if length > distance:  # some polygons might have collapsed
            pts = pygeos.line_interpolate_point(
                line,
                np.linspace(0.1,
                            length - 0.1,
                            num=int((length - 0.1) // distance)),
            )  # .1 offset to keep a gap between two segments
            chunks.append(pygeos.get_coordinates(pts))
            ids += [ix] * len(pts)

    return np.concatenate(chunks, axis=0), ids
def calculate_sinuosity(geometries):
    """Calculate sinuosity of the line.

    This is the length of the line divided by the distance between the
    endpoints of the line. By definition, it is always >=1.

    Lines whose endpoints coincide (zero straight-line distance) are
    assigned a sinuosity of 1.

    Parameters
    ----------
    geometries : Series or ndarray of pygeos geometries

    Returns
    -------
    Series or ndarray
        sinuosity values (float32); a Series sharing the input index when
        the input is a Series.
    """
    first = pg.get_point(geometries, 0)
    last = pg.get_point(geometries, -1)
    straight_line_distance = pg.distance(first, last)

    # default of 1 covers lines with no straight line distance
    sinuosity = np.ones((len(geometries), ), dtype="float32")

    # if there is no straight line distance there can be no sinuosity
    ix = straight_line_distance > 0

    # Both numerator and denominator must be restricted to the same subset;
    # dividing by the full distance array breaks as soon as any line has a
    # zero straight-line distance.
    # By definition, all values must be at least 1, so clip lower bound.
    sinuosity[ix] = (pg.length(geometries[ix]) /
                     straight_line_distance[ix]).clip(1)

    if isinstance(geometries, pd.Series):
        return pd.Series(sinuosity, index=geometries.index)

    return sinuosity
def find_dam_faces(drains, waterbodies):
    """Extract candidate dam-face segments for drains of small size classes.

    Parameters
    ----------
    drains : GeoDataFrame
        Must provide ``sizeclass``, ``wbID`` and ``geometry`` columns.
    waterbodies : GeoDataFrame
        Waterbody geometries, joined to the drains on ``wbID``.

    Returns
    -------
    GeoDataFrame
        One row per segment with ``drainID``, ``geometry`` and ``width``
        (segment length), plus the non-geometry drain attributes joined on
        ``drainID``.
    """
    # drains on large size classes do not reliably pick up actual dams, so
    # only the small classes are processed
    small_classes = ["1a", "1b", "2"]
    drains = drains.loc[drains.sizeclass.isin(small_classes)]

    # pair every drain with the geometry of its waterbody, as a plain frame
    wb_geometry = waterbodies.geometry.rename("waterbody")
    joined = pd.DataFrame(drains[["geometry", "wbID"]].join(wb_geometry,
                                                            on="wbID"))
    # swap geometry columns for their underlying pygeos arrays
    joined["geometry"] = joined.geometry.values.data
    joined["waterbody"] = joined.waterbody.values.data

    # presumably ``loop`` returns the drain id and face segment of each
    # match; verify against its definition
    ids, segments = loop(joined.waterbody.values, joined.geometry.values,
                         joined.index.values)

    # NOTE: this may have duplicate geometries where there are widely spaced
    # drains on the same long waterbody edge
    faces = gp.GeoDataFrame(
        {
            "drainID": ids,
            "geometry": segments,
            "width": pg.length(segments)
        },
        crs=drains.crs,
    )
    drain_attributes = drains.drop(columns=["geometry"])
    return faces.join(drain_attributes, on="drainID")
def squareness(collection):
    """
    Measure how far each shape departs from a square of the same area.

    The index is close to 0 for highly irregular shapes and to 1.3 for
    circular shapes. It equals 1 for squares.

    .. math::
        \\left(\\frac{4 \\sqrt{A}}{P}\\right)^{2}

    where :math:`A` is the area and :math:`P` is the perimeter.

    Notes
    -----
    Implementation follows :cite:`basaraner2017`.
    """
    shapes = _cast(collection)
    # perimeter of the square that has the same area as the shape
    square_perimeter = 4 * numpy.sqrt(pygeos.area(shapes))
    ratio = square_perimeter / pygeos.length(shapes)
    return ratio**2
def _morphological_tessellation(self,
                                gdf,
                                unique_id,
                                limit,
                                shrink,
                                segment,
                                verbose,
                                check=True):
    """Generate a morphological tessellation from Voronoi cells.

    Objects are optionally shrunk, converted to a dense point array,
    triangulated into a Voronoi diagram, dissolved per ``unique_id`` and
    clipped to ``limit``.

    Parameters
    ----------
    gdf : GeoDataFrame
        Input objects containing the ``unique_id`` column. It is not
        modified; all edits happen on a copy.
    unique_id : str
        Name of the column identifying each object.
    limit : pygeos geometry
        Geometry the tessellation is clipped to; its buffered convex hull
        also provides the outer points that bound the Voronoi diagram.
    shrink : float
        Distance of the inward offset applied to Polygon / MultiPolygon
        objects; ``0`` skips the offset.
    segment : float
        ``distance`` passed to ``_dense_point_array`` for the objects.
    verbose : bool
        Print progress messages when true.
    check : bool, default True
        Run ``_check_result`` on the output.

    Returns
    -------
    GeoDataFrame
        Dissolved and clipped tessellation cells.
    """
    # copy first: the inward offset below assigns through .loc and would
    # otherwise overwrite geometries in the caller's GeoDataFrame
    objects = gdf.copy()

    if shrink != 0:
        if verbose:
            print("Inward offset...")
        mask = objects.type.isin(["Polygon", "MultiPolygon"])
        objects.loc[mask, objects.geometry.name] = objects[mask].buffer(
            -shrink, cap_style=2, join_style=2)

    objects = objects.reset_index(drop=True).explode()
    objects = objects.set_index(unique_id)

    if verbose:
        print("Generating input point array...")
    points, ids = self._dense_point_array(objects.geometry.values.data,
                                          distance=segment,
                                          index=objects.index)

    # ring of points far outside the limit so that cells of real objects
    # stay finite; these points carry the id -1
    hull = pygeos.convex_hull(limit)
    bounds = pygeos.bounds(hull)
    width = bounds[2] - bounds[0]
    leng = bounds[3] - bounds[1]
    hull = pygeos.buffer(hull, 2 * width if width > leng else 2 * leng)

    hull_p, hull_ix = self._dense_point_array(
        [hull], distance=pygeos.length(hull) / 100, index=[0])
    points = np.append(points, hull_p, axis=0)
    ids = ids + ([-1] * len(hull_ix))

    if verbose:
        print("Generating Voronoi diagram...")
    voronoi_diagram = Voronoi(np.array(points))

    if verbose:
        print("Generating GeoDataFrame...")
    regions_gdf = self._regions(voronoi_diagram, unique_id, ids, crs=gdf.crs)

    if verbose:
        print("Dissolving Voronoi polygons...")
    morphological_tessellation = regions_gdf[[unique_id, "geometry"
                                              ]].dissolve(by=unique_id,
                                                          as_index=False)

    morphological_tessellation = gpd.clip(
        morphological_tessellation, gpd.GeoSeries(limit, crs=gdf.crs))

    if check:
        self._check_result(morphological_tessellation,
                           gdf,
                           unique_id=unique_id)

    return morphological_tessellation
def test_length():
    """Check ``pygeos.length`` against known values for the shared fixtures."""
    geometries = [
        point,
        line_string,
        linear_ring,
        polygon,
        polygon_with_hole,
        multi_point,
        multi_polygon,
    ]
    # expected lengths, in the same order as ``geometries``
    expected = [0.0, 2.0, 4.0, 8.0, 48.0, 0.0, 4.4]

    actual = pygeos.length(geometries)

    assert actual.tolist() == expected
def equivalent_rectangular_index(collection):
    """
    Deviation of a polygon from an equivalent rectangle

    .. math::
        \\sqrt{\\frac{A}{A_{MBR}}} \\times \\frac{P_{MBR}}{P}

    where :math:`A` is the area, :math:`A_{MBR}` is the area of minimum
    bounding rotated rectangle, :math:`P` is the perimeter, :math:`P_{MBR}`
    is the perimeter of minimum bounding rotated rectangle.

    Notes
    -----
    Implementation follows :cite:`basaraner2017`.
    """
    shapes = _cast(collection)
    rectangle = pygeos.minimum_rotated_rectangle(shapes)

    area_term = numpy.sqrt(pygeos.area(shapes) / pygeos.area(rectangle))
    perimeter_term = pygeos.length(rectangle) / pygeos.length(shapes)
    return area_term * perimeter_term
def length(data):
    """Return the length of each geometry in ``data``.

    Uses ``pygeos.length`` when ``compat.USE_PYGEOS`` is set; otherwise
    falls back to the generic ``_unary_op`` helper with NaN as null value.
    """
    if not compat.USE_PYGEOS:
        return _unary_op("length", data, null_value=np.nan)
    return pygeos.length(data)
def __init__(self,
             left,
             right,
             heights=None,
             distance=10,
             tick_length=50,
             verbose=True):
    """Measure a profile of each line in ``left`` against ``right``.

    Points are interpolated along every line, a pair of perpendicular ticks
    is cast from each interior point, and the nearest intersection of every
    tick with a geometry of ``right`` is used to derive per-line statistics.

    Parameters
    ----------
    left : GeoDataFrame
        Lines to be profiled.
    right : GeoDataFrame
        Geometries the ticks are intersected with (queried via ``sindex``).
    heights : str, Series or list-like, optional
        Column name in ``right``, or values indexed positionally like
        ``right``. When given, height statistics are computed as well.
    distance : float, default 10
        Spacing used to derive the number of points per line.
    tick_length : float, default 50
        Total length of the pair of ticks cast at each point.
    verbose : bool, default True
        Not used within this method.

    Attributes set
    --------------
    w : Series
        Width: twice the mean of the left and right mean tick distances.
    wd : Series
        Standard deviation of tick distances (0 where undefined).
    o : Series
        Share of ticks without any intersection (1 where undefined).
    h, hd, p : Series
        Only when ``heights`` is given: mean height, standard deviation of
        height, and ``h / w``.
    """
    self.left = left
    self.right = right
    self.distance = distance
    self.tick_length = tick_length

    pygeos_lines = left.geometry.values.data

    list_points = np.empty((0, 2))
    ids = []  # positional id of the source line, one entry per tick
    end_markers = []  # True for the first / last point of a line

    lengths = pygeos.length(pygeos_lines)
    for ix, (line, length) in enumerate(zip(pygeos_lines, lengths)):
        pts = pygeos.line_interpolate_point(
            line, np.linspace(0, length, num=int((length) // distance)))
        list_points = np.append(list_points,
                                pygeos.get_coordinates(pts),
                                axis=0)
        # two ticks are created per point, hence the doubled ids
        if len(pts) > 1:
            ids += [ix] * len(pts) * 2
            markers = [True] + ([False] * (len(pts) - 2)) + [True]
            end_markers += markers
        elif len(pts) == 1:
            end_markers += [True]
            ids += [ix] * 2

    ticks = []
    # ``num`` starts at 1, so list_points[num] is the point after ``pt``
    for num, (pt, end) in enumerate(zip(list_points, end_markers), 1):
        if end:
            # end points get two zero-length ticks to keep the indexing
            # of two ticks per point
            ticks.append([pt, pt])
            ticks.append([pt, pt])
        else:
            angle = self._getAngle(pt, list_points[num])
            line_end_1 = self._getPoint1(pt, angle, tick_length / 2)
            angle = self._getAngle(line_end_1, pt)
            line_end_2 = self._getPoint2(line_end_1, angle, tick_length)
            ticks.append([line_end_1, pt])
            ticks.append([line_end_2, pt])

    ticks = pygeos.linestrings(ticks)

    inp, res = right.sindex.query_bulk(ticks, predicate="intersects")
    intersections = pygeos.intersection(ticks[inp],
                                        right.geometry.values.data[res])
    # tick index // 2 is the index of the point the tick was cast from
    distances = pygeos.distance(intersections,
                                pygeos.points(list_points[inp // 2]))
    # split hits into one group per tick
    # NOTE(review): relies on ``inp`` being grouped by tick - confirm
    inp_uni, inp_cts = np.unique(inp, return_counts=True)
    splitter = np.cumsum(inp_cts)[:-1]
    dist_per_res = np.split(distances, splitter)
    inp_per_res = np.split(res, splitter)

    # NOTE(review): with no intersections at all, np.split yields a single
    # empty array and np.min below raises - confirm whether that can occur
    min_distances = []
    min_inds = []
    for dis, ind in zip(dist_per_res, inp_per_res):
        min_distances.append(np.min(dis))
        min_inds.append(ind[np.argmin(dis)])

    # nearest-hit distance per tick, NaN where a tick hit nothing
    dists = np.zeros((len(ticks), ))
    dists[:] = np.nan
    dists[inp_uni] = min_distances

    if heights is not None:
        # NOTE(review): self.heights is not assigned when a Series is passed
        if isinstance(heights, str):
            heights = self.heights = right[heights]
        elif not isinstance(heights, pd.Series):
            heights = self.heights = pd.Series(heights)

        # positional index in ``right`` of the nearest hit per tick,
        # NaN where a tick hit nothing
        blgs = np.zeros((len(ticks), ))
        blgs[:] = None
        blgs[inp_uni] = min_inds
        do_heights = True
    else:
        do_heights = False

    ids = np.array(ids)
    widths = []
    openness = []
    deviations = []
    heights_list = []
    heights_deviations_list = []

    for i in range(len(left)):
        f = ids == i
        s = dists[f]
        # ticks alternate between the two sides of the line
        lefts = s[::2]
        rights = s[1::2]
        # a side without any hit falls back to half of the tick length
        left_mean = np.nanmean(
            lefts) if ~np.isnan(lefts).all() else tick_length / 2
        right_mean = (np.nanmean(rights)
                      if ~np.isnan(rights).all() else tick_length / 2)
        widths.append(np.mean([left_mean, right_mean]) * 2)
        openness.append(np.isnan(s).sum() / (f).sum())
        deviations.append(np.nanstd(s))

        if do_heights:
            b = blgs[f]
            h = heights.iloc[b[~np.isnan(b)]]
            heights_list.append(h.mean())
            heights_deviations_list.append(h.std())

    self.w = pd.Series(widths, index=left.index)
    self.wd = pd.Series(deviations, index=left.index).fillna(
        0)  # fill for empty intersections
    self.o = pd.Series(openness, index=left.index).fillna(1)

    if do_heights:
        self.h = pd.Series(heights_list, index=left.index).fillna(
            0)  # fill for empty intersections
        self.hd = pd.Series(heights_deviations_list,
                            index=left.index).fillna(
                                0)  # fill for empty intersections
        self.p = self.h / self.w.replace(0,
                                         np.nan)  # replace to avoid np.inf
def find_dam_face_from_waterbody(waterbody, drain_pt):
    """Find straight segments of a waterbody outline that bracket a drain.

    Parameters
    ----------
    waterbody : pygeos Polygon
    drain_pt : pygeos Point

    Returns
    -------
    ndarray
        Line segments of the waterbody exterior that are long enough, close
        to the drain point, and have the drain point in their interior. May
        be empty.
    """
    total_area = pg.area(waterbody)
    ring = pg.get_exterior_ring(pg.normalize(waterbody))
    total_length = pg.length(ring)
    num_pts = pg.get_num_points(ring) - 1  # drop closing coordinate
    vertices = pg.get_point(ring, range(num_pts))

    ### Extract line segments that are no more than 1/3 coordinates of polygon
    # starting from the vertex nearest the drain
    # note: lower numbers are to the right
    nearest_vertex = pg.STRtree(vertices).nearest(drain_pt)[1][0]
    side_width = min(num_pts // 3, MAX_SIDE_PTS)
    left_ix = nearest_vertex + side_width
    right_ix = nearest_vertex - side_width

    # extract these as a left-to-right line, wrapping around either end of
    # the ring when the window runs past it
    pts = vertices[max(right_ix, 0):min(num_pts, left_ix)][::-1]
    if left_ix >= num_pts:
        wrapped_left = vertices[0:left_ix - num_pts][::-1]
        pts = np.append(wrapped_left, pts)

    if right_ix < 0:
        wrapped_right = vertices[num_pts + right_ix:num_pts][::-1]
        pts = np.append(pts, wrapped_right)

    coords = pg.get_coordinates(pts)

    # default: keep every coordinate; refined below when there are enough
    # coordinates to simplify
    keep_coords = coords
    keep_ix = np.arange(len(coords))
    if len(coords) > 2:
        # first run a simplification process to extract the major shape and
        # bends, then run the straight line algorithm
        keep_coords, keep_ix = simplify_vw(
            coords, min(MAX_SIMPLIFY_AREA, total_area / 100))

        if len(keep_coords) > 2:
            simp_ix = keep_ix
            keep_coords, straight_ix = extract_straight_segments(
                keep_coords, max_angle=MAX_STRAIGHT_ANGLE, loops=5)
            keep_ix = simp_ix.take(straight_ix)

    ### Calculate the length of each run and drop any that are not sufficiently long
    run_lengths = segment_length(keep_coords)
    long_enough = (run_lengths >= MIN_DAM_WIDTH) & (
        run_lengths / total_length < MAX_WIDTH_RATIO)

    pairs = np.dstack([keep_ix[:-1][long_enough],
                       keep_ix[1:][long_enough]])[0]

    # ranges are ragged, so each segment is built individually
    segments = np.array(
        [pg.linestrings(coords[start:end + 1]) for start, end in pairs])

    # only keep the segments that are close to the drain
    near_drain = pg.intersects(segments, pg.buffer(drain_pt, MAX_DRAIN_DIST))
    segments = segments[near_drain]

    if not len(segments):
        return segments

    # only keep those where the drain is interior to the line
    pos = pg.line_locate_point(segments, drain_pt)
    seg_lengths = pg.length(segments)
    interior = (pos >= MIN_INTERIOR_DIST) & (
        pos <= (seg_lengths - MIN_INTERIOR_DIST))

    return segments[interior]
def cut_flowlines_at_barriers(flowlines,
                              joins,
                              barriers,
                              next_segment_id=None):
    """Cut flowlines by barriers.

    Barriers within SNAP_ENDPOINT_TOLERANCE of an end of their flowline are
    attached to that end without cutting; all other barriers split their
    flowline into new segments.

    Parameters
    ----------
    flowlines : GeoDataFrame
        ALL flowlines for region.
    joins : DataFrame
        Joins between flowlines (upstream, downstream pairs).
    barriers : GeoDataFrame
        Barriers that will be used to cut flowlines.
    next_segment_id : int, optional
        Used as starting point for IDs of new segments created by cutting
        flowlines. Defaults to one more than the maximum flowline index.

    Returns
    -------
    GeoDataFrame, DataFrame, DataFrame
        updated flowlines, updated joins, barrier joins (upstream /
        downstream flowline ID per barrier)
    """
    start = time()
    print(f"Starting number of segments: {len(flowlines):,}")
    print(f"Cutting in {len(barriers):,} barriers")

    # Our segment ids are ints, so just increment from the last one we had from NHD
    if next_segment_id is None:
        next_segment_id = int(flowlines.index.max() + 1)

    # join barriers to lines and extract those that have segments (via inner join)
    segments = (flowlines[["lineID", "NHDPlusID", "geometry"]].rename(columns={
        "geometry": "flowline"
    }).join(
        barriers[["geometry", "barrierID",
                  "lineID"]].set_index("lineID").rename(
                      columns={"geometry": "barrier"}),
        how="inner",
    ))

    # Calculate the position of each barrier on each segment.
    # Barriers are on upstream or downstream end of segment if they are within
    # SNAP_ENDPOINT_TOLERANCE of the ends.  Otherwise, they are splits
    segments["linepos"] = pg.line_locate_point(segments.flowline.values.data,
                                               segments.barrier.values.data)

    ### Upstream and downstream endpoint barriers
    segments["on_upstream"] = segments.linepos <= SNAP_ENDPOINT_TOLERANCE
    segments["on_downstream"] = (
        segments.linepos >= pg.length(segments.flowline.values.data) -
        SNAP_ENDPOINT_TOLERANCE)

    # if line length is < SNAP_ENDPOINT_TOLERANCE, then barrier could be tagged
    # to both sides, which is incorrect.  Default to on_downstream.
    segments.loc[segments.on_upstream & segments.on_downstream,
                 "on_upstream"] = False

    print(
        f"{segments.on_upstream.sum():,} barriers on upstream point of their segments"
    )
    print(
        f"{segments.on_downstream.sum():,} barriers on downstream point of their segments"
    )

    # Barriers on upstream endpoint:
    # their upstream_id is the upstream_id(s) of their segment from joins,
    # and their downstream_id is the segment they are on.
    # NOTE: a barrier may have multiple upstreams if it occurs at a fork in the network.
    # All terminal upstreams should be already coded as 0 in joins, but just in case
    # we assign N/A to 0.
    upstream_barrier_joins = ((segments.loc[segments.on_upstream][[
        "barrierID", "lineID"
    ]].rename(columns={
        "lineID": "downstream_id"
    }).join(joins.set_index("downstream_id").upstream_id,
            on="downstream_id")).fillna(0).astype("uint64"))

    # Barriers on downstream endpoint:
    # their upstream_id is the segment they are on and their downstream_id is the
    # downstream_id of their segment from the joins.
    # Some downstream_ids may be missing if the barrier is on the downstream-most point of the
    # network (downstream terminal) and further downstream segments were removed due to removing
    # coastline segments.
    downstream_barrier_joins = ((segments.loc[segments.on_downstream][[
        "barrierID", "lineID"
    ]].rename(columns={
        "lineID": "upstream_id"
    }).join(joins.set_index("upstream_id").downstream_id,
            on="upstream_id")).fillna(0).astype("uint64"))

    barrier_joins = upstream_barrier_joins.append(downstream_barrier_joins,
                                                  ignore_index=True,
                                                  sort=False).set_index(
                                                      "barrierID", drop=False)

    ### Split segments have barriers that are not at endpoints
    split_segments = segments.loc[~(segments.on_upstream
                                    | segments.on_downstream)]

    # join in count of barriers that SPLIT this segment
    split_segments = split_segments.join(
        split_segments.groupby(level=0).size().rename("barriers"))

    print(
        f"{(split_segments.barriers == 1).sum():,} segments to cut have one barrier"
    )
    print(
        f"{(split_segments.barriers > 1).sum():,} segments to cut have more than one barrier"
    )

    # ordinate the barriers by their projected distance on the line
    # Order this so we are always moving from upstream end to downstream end
    split_segments = split_segments.rename_axis("idx").sort_values(
        by=["idx", "linepos"], ascending=True)

    # Convert to DataFrame so that geometry cols are arrays of pygeos geometries
    tmp = pd.DataFrame(split_segments.copy())
    tmp.flowline = tmp.flowline.values.data
    tmp.barrier = tmp.barrier.values.data
    # same projection as "linepos" above, recomputed on the pygeos columns
    tmp["pos"] = pg.line_locate_point(tmp.flowline.values, tmp.barrier.values)

    # Group barriers by line so that we can split geometries in one pass
    grouped = (
        tmp[[
            "lineID",
            "NHDPlusID",
            "barrierID",
            "barriers",
            "flowline",
            "barrier",
            "pos",
        ]].sort_values(by=["lineID", "pos"]).groupby("lineID").agg({
            "lineID": "first",
            "NHDPlusID": "first",
            "flowline": "first",
            "barrierID": list,
            "barriers": "first",
            # "barrier": list,  # TODO: remove
            "pos": list,
        }))

    # cut line for all barriers
    # presumably outer_ix indexes into ``grouped`` and inner_ix is the
    # position of each new line within its original line; verify against
    # cut_lines_at_points
    outer_ix, inner_ix, lines = cut_lines_at_points(
        grouped.flowline.apply(lambda x: pg.get_coordinates(x)).values,
        grouped.pos.apply(np.array).values,
    )

    lines = np.asarray(lines)
    # new segments get fresh sequential IDs and inherit the remaining
    # attributes of the flowline they were cut from
    new_flowlines = gp.GeoDataFrame({
        "lineID": (next_segment_id + np.arange(len(outer_ix))).astype("uint32"),
        "origLineID": grouped.index.take(outer_ix),
        "position": inner_ix,
        "geometry": lines,
        "length": pg.length(lines).astype("float32"),
        "sinuosity": calculate_sinuosity(lines).astype("float32"),
    }).join(
        flowlines.drop(
            columns=[
                "geometry",
                "lineID",
                "xmin",
                "ymin",
                "xmax",
                "ymax",
                "length",
                "sinuosity",
            ],
            errors="ignore",
        ),
        on="origLineID",
    )

    # transform new segments to create new joins
    l = new_flowlines.groupby("origLineID").lineID

    # the first new line per original line is the furthest upstream, so use its
    # ID as the new downstream ID for anything that had this origLineID as its downstream
    first = l.first().rename("new_downstream_id")

    # the last new line per original line is the furthest downstream...
    last = l.last().rename("new_upstream_id")

    # Update existing joins with the new lineIDs we created at the upstream or downstream
    # ends of segments we just created
    updated_joins = update_joins(joins,
                                 first,
                                 last,
                                 downstream_col="downstream_id",
                                 upstream_col="upstream_id")

    # also need to update any barrier joins already created for those on endpoints
    barrier_joins = update_joins(
        barrier_joins,
        first,
        last,
        downstream_col="downstream_id",
        upstream_col="upstream_id",
    )

    # For all new interior joins, create upstream & downstream ids per original line
    upstream_side = (new_flowlines.loc[~new_flowlines.lineID.isin(last)][[
        "origLineID", "position", "lineID"
    ]].set_index(["origLineID",
                  "position"]).rename(columns={"lineID": "upstream_id"}))

    downstream_side = new_flowlines.loc[~new_flowlines.lineID.isin(first)][[
        "origLineID", "position", "lineID"
    ]].rename(columns={"lineID": "downstream_id"})
    # shift by one so that each downstream line shares a key with the line
    # immediately upstream of it
    downstream_side.position = downstream_side.position - 1
    downstream_side = downstream_side.set_index(["origLineID", "position"])

    # one row per barrier that split a line, keyed by original line and the
    # barrier's ordinal position on it
    new_joins = (grouped.barrierID.apply(
        pd.Series).stack().astype("uint32").reset_index().rename(columns={
            "lineID": "origLineID",
            "level_1": "position",
            0: "barrierID"
        }).set_index([
            "origLineID", "position"
        ]).join(upstream_side).join(downstream_side).reset_index().join(
            grouped.NHDPlusID.rename("upstream"), on="origLineID"))

    # both sides of an internal join share the NHDPlusID of the original line
    new_joins["downstream"] = new_joins.upstream
    new_joins["type"] = "internal"
    new_joins["marine"] = False

    updated_joins = updated_joins.append(
        new_joins[[
            "upstream", "downstream", "upstream_id", "downstream_id", "type",
            "marine"
        ]],
        ignore_index=True,
        sort=False,
    ).sort_values(["downstream_id", "upstream_id"])

    barrier_joins = (barrier_joins.append(
        new_joins[["barrierID", "upstream_id", "downstream_id"]],
        ignore_index=True,
        sort=False,
    ).set_index("barrierID", drop=False).astype("uint32"))

    # any join that is upstream of a barrier cannot be marine
    updated_joins.loc[
        updated_joins.marine
        & updated_joins.upstream_id.isin(barrier_joins.upstream_id.unique()),
        "marine", ] = False

    # extract flowlines that are not split by barriers and merge in new flowlines
    unsplit_segments = flowlines.loc[~flowlines.index.isin(split_segments.index
                                                           )]

    updated_flowlines = unsplit_segments.append(
        new_flowlines.drop(columns=["origLineID", "position"]),
        ignore_index=True,
        sort=False,
    ).set_index("lineID", drop=False)

    print(f"Done cutting flowlines in {time() - start:.2f}s")

    return updated_flowlines, updated_joins, barrier_joins
def _morphological_tessellation(self,
                                gdf,
                                unique_id,
                                limit,
                                shrink,
                                segment,
                                verbose,
                                check=True):
    """Generate a morphological tessellation from Voronoi cells.

    All geometry is shifted so that the centre of the limit's bounding box
    sits at the origin, processed there, and shifted back before returning.

    Parameters
    ----------
    gdf : GeoDataFrame
        Input objects containing the ``unique_id`` column; processed on a
        copy.
    unique_id : str
        Name of the column identifying each object.
    limit : GeoSeries, GeoDataFrame, shapely geometry or pygeos geometry
        Area the tessellation is clipped to.
    shrink : float
        Distance of the inward offset applied to Polygon / MultiPolygon
        objects; ``0`` skips the offset.
    segment : float
        ``distance`` passed to ``_dense_point_array`` for the objects.
    verbose : bool
        Print progress messages when true.
    check : bool, default True
        Run ``_check_result`` on the output.

    Returns
    -------
    GeoDataFrame
        Dissolved and clipped tessellation cells in original coordinates.
    """
    objects = gdf.copy()

    # reduce whatever was passed as the limit to one pygeos geometry
    if isinstance(limit, (gpd.GeoSeries, gpd.GeoDataFrame)):
        limit = limit.unary_union
    if isinstance(limit, BaseGeometry):
        limit = pygeos.from_shapely(limit)

    # shift everything towards the origin
    bounds = pygeos.bounds(limit)
    centre_x = (bounds[0] + bounds[2]) / 2
    centre_y = (bounds[1] + bounds[3]) / 2
    objects["geometry"] = objects["geometry"].translate(xoff=-centre_x,
                                                        yoff=-centre_y)

    if shrink != 0:
        if verbose:
            print("Inward offset...")
        is_polygonal = objects.type.isin(["Polygon", "MultiPolygon"])
        objects.loc[is_polygonal,
                    "geometry"] = objects[is_polygonal].buffer(-shrink,
                                                               cap_style=2,
                                                               join_style=2)

    objects = objects.reset_index(drop=True).explode().set_index(unique_id)

    if verbose:
        print("Generating input point array...")
    points, ids = self._dense_point_array(objects.geometry.values.data,
                                          distance=segment,
                                          index=objects.index)

    # add the limit buffered by a large distance to eliminate infinity issues
    series = gpd.GeoSeries(limit, crs=gdf.crs).translate(xoff=-centre_x,
                                                         yoff=-centre_y)
    width = bounds[2] - bounds[0]
    leng = bounds[3] - bounds[1]
    buffer_distance = 2 * width if width > leng else 2 * leng
    hull = series.geometry[[0]].buffer(buffer_distance)
    # pygeos bug fix
    if (hull.type == "MultiPolygon").any():
        hull = hull.explode()
    hull_p, hull_ix = self._dense_point_array(
        hull.values.data,
        distance=pygeos.length(limit) / 100,
        index=hull.index)
    points = np.append(points, hull_p, axis=0)
    # outer points carry the id -1
    ids = ids + ([-1] * len(hull_ix))

    if verbose:
        print("Generating Voronoi diagram...")
    voronoi_diagram = Voronoi(np.array(points))

    if verbose:
        print("Generating GeoDataFrame...")
    regions_gdf = self._regions(voronoi_diagram, unique_id, ids, crs=gdf.crs)

    if verbose:
        print("Dissolving Voronoi polygons...")
    cells = regions_gdf[[unique_id, "geometry"]]
    morphological_tessellation = cells.dissolve(by=unique_id, as_index=False)

    morphological_tessellation = gpd.clip(morphological_tessellation, series)

    # shift the result back to the original location
    morphological_tessellation["geometry"] = morphological_tessellation[
        "geometry"].translate(xoff=centre_x, yoff=centre_y)

    if check:
        self._check_result(morphological_tessellation,
                           gdf,
                           unique_id=unique_id)

    return morphological_tessellation
def snap_to_flowlines(df, to_snap):
    """Snap barriers to the nearest flowline within their tolerance.

    Processes one HUC2 region at a time. Results are written into ``df``;
    records that were snapped are removed from ``to_snap``.

    Barriers within SNAP_ENDPOINT_TOLERANCE of an endpoint of their line
    are snapped to that endpoint instead of the closest point on the line.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry"),
        snapping tolerance ("snap_tolerance") and region ("HUC2")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap), where to_snap holds the records still to be snapped
    """
    print("=================\nSnapping to flowlines...")

    for huc2 in sorted(to_snap.HUC2.unique()):
        region_start = time()

        print(f"\n----- {huc2} ------")

        in_huc2 = to_snap.loc[to_snap.HUC2 == huc2].copy()

        flowlines_path = nhd_dir / "clean" / huc2 / "flowlines.feather"
        flowlines = gp.read_feather(
            flowlines_path,
            columns=["geometry", "lineID"],
        ).set_index("lineID")

        print(
            f"HUC {huc2} selected {len(in_huc2):,} barriers in region to snap against {len(flowlines):,} flowlines"
        )

        # nearest flowline per barrier, within each barrier's own tolerance
        barrier_geoms = pd.Series(in_huc2.geometry.values.data,
                                  index=in_huc2.index)
        flowline_geoms = pd.Series(flowlines.geometry.values.data,
                                   index=flowlines.index)
        lines = nearest(barrier_geoms, flowline_geoms,
                        in_huc2.snap_tolerance.values)

        lines = lines.join(in_huc2.geometry)
        lines = lines.join(flowlines.geometry.rename("line"), on="lineID")

        line_geoms = lines.line.values.data

        # project the point to the line to get its distance along the line
        lines["line_pos"] = pg.line_locate_point(line_geoms,
                                                 lines.geometry.values.data)

        # if within tolerance of start point, snap to start
        near_start = lines["line_pos"] <= SNAP_ENDPOINT_TOLERANCE
        lines.loc[near_start, "line_pos"] = 0

        # if within tolerance of endpoint, snap to end
        end = pg.length(line_geoms)
        near_end = lines["line_pos"] >= end - SNAP_ENDPOINT_TOLERANCE
        lines.loc[near_end, "line_pos"] = end[near_end]

        # then interpolate its new coordinates
        lines["geometry"] = pg.line_interpolate_point(line_geoms,
                                                      lines["line_pos"])

        # record results on the master dataset
        ix = lines.index
        df.loc[ix, "snapped"] = True
        df.loc[ix, "geometry"] = lines.geometry
        df.loc[ix, "snap_dist"] = lines.distance
        df.loc[ix, "snap_ref_id"] = lines.lineID
        df.loc[ix, "lineID"] = lines.lineID
        df.loc[ix, "snap_log"] = ndarray_append_strings(
            "snapped: within ",
            to_snap.loc[ix].snap_tolerance,
            "m tolerance of flowline",
        )

        # whatever was snapped here no longer needs snapping
        already_snapped = to_snap.index.isin(ix)
        to_snap = to_snap.loc[~already_snapped].copy()

        print("{:,} barriers snapped in region in {:.2f}s".format(
            len(ix),
            time() - region_start))

    # TODO: flag those that joined to loops

    return df, to_snap
def remove_marine_flowlines(flowlines, joins, marine):
    """Remove flowlines that start in, or mostly lie within, marine areas.

    Joins that terminate in a removed flowline, and joins whose upstream
    flowline ends in a marine area, are flagged as marine.

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
        flowline joins; the ``marine`` column is updated in place before
        joins of removed flowlines are dropped
    marine : GeoDataFrame

    Returns
    -------
    (GeoDataFrame, DataFrame)
        flowlines, joins
    """
    marine_geoms = marine.geometry.values.data

    ### Remove those that start in marine areas
    start_points = pg.get_point(flowlines.geometry.values.data, 0)
    left, right = pg.STRtree(start_points).query_bulk(marine_geoms,
                                                      predicate="intersects")
    ix = flowlines.index.take(np.unique(right))

    print(f"Removing {len(ix):,} flowlines that originate in marine areas")

    # mark any that terminated in those as marine
    joins.loc[joins.downstream_id.isin(ix), "marine"] = True
    flowlines = flowlines.loc[~flowlines.index.isin(ix)].copy()
    joins = remove_joins(joins,
                         ix,
                         downstream_col="downstream_id",
                         upstream_col="upstream_id")

    ### Mark those that end in marine areas as marine
    end_points = pg.get_point(flowlines.geometry.values.data, -1)
    left, right = pg.STRtree(end_points).query_bulk(marine_geoms,
                                                    predicate="intersects")
    ix = flowlines.index.take(np.unique(right))
    joins.loc[joins.upstream_id.isin(ix), "marine"] = True

    ### For any that end in marine but didn't originate there, check the
    # amount of overlap; any that are >= 90% in marine should get cut
    print("Calculating overlap of remaining lines with marine areas")
    ending_in_marine = flowlines.iloc[right]
    tmp = pd.DataFrame({
        "lineID": ending_in_marine.index,
        "geometry": ending_in_marine.geometry.values.data,
        "marine": marine.iloc[left].geometry.values.data,
    })
    tmp["overlap"] = pg.intersection(tmp.geometry, tmp.marine)
    overlap_length = pg.length(tmp.overlap)
    tmp["pct_overlap"] = 100 * overlap_length / pg.length(tmp.geometry)

    mostly_marine = tmp.pct_overlap >= 90
    ix = tmp.loc[mostly_marine].lineID.unique()

    print(f"Removing {len(ix):,} flowlines that mostly overlap marine areas")

    # mark any that terminated in those as marine
    joins.loc[joins.downstream_id.isin(ix), "marine"] = True
    flowlines = flowlines.loc[~flowlines.index.isin(ix)].copy()
    joins = remove_joins(joins,
                         ix,
                         downstream_col="downstream_id",
                         upstream_col="upstream_id")

    return flowlines, joins
def cut_lines_by_waterbodies(flowlines, joins, waterbodies, next_lineID):
    """
    Cut lines by waterbodies.

    1. Finds all intersections between waterbodies and flowlines.
    2. For those that cross but are not completely contained by waterbodies,
       cut them.
    3. Evaluate the cuts, only those that have substantive cuts inside and
       outside are retained as cuts.
    4. Any flowlines that are not contained or crossing waterbodies are
       dropped from the waterbody joins.

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
        flowline joins
    waterbodies : GeoDataFrame
    next_lineID : int
        next lineID; must be greater than all prior lines in region

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame, DataFrame)
        (flowlines, joins, waterbody joins). Flowlines gain a boolean
        ``waterbody`` column; waterbody joins hold unique ``lineID`` /
        ``wbID`` pairs of flowlines that are contained or at least 50%
        within a waterbody.

    Raises
    ------
    ValueError
        If a flowline is contained by more than one waterbody.
    """
    start = time()

    ### Find flowlines that intersect waterbodies
    join_start = time()
    tree = pg.STRtree(flowlines.geometry.values.data)
    # left indexes the waterbodies (query), right the flowlines (tree)
    left, right = tree.query_bulk(waterbodies.geometry.values.data,
                                  predicate="intersects")
    df = pd.DataFrame({
        "lineID": flowlines.index.take(right),
        "flowline": flowlines.geometry.values.data.take(right),
        "wbID": waterbodies.index.take(left),
        "waterbody": waterbodies.geometry.values.data.take(left),
    })
    print(
        f"Found {len(df):,} waterbody / flowline joins in {time() - join_start:.2f}s"
    )

    ### Find those that are completely contained; these don't need further processing
    pg.prepare(df.waterbody.values)

    # find those that are fully contained and do not touch the edge of the waterbody (contains_properly predicate)
    # contains_properly is very fast
    contained_start = time()
    df["contains"] = pg.contains_properly(df.waterbody.values,
                                          df.flowline.values)
    print(
        f"Identified {df.contains.sum():,} flowlines fully within waterbodies in {time() - contained_start:.2f}s"
    )

    # find those that aren't properly contained but are still contained while
    # touching the edge of the waterbody (contains predicate)
    contained_start = time()
    ix = ~df.contains
    tmp = df.loc[ix]
    df.loc[ix, "contains"] = pg.contains(tmp.waterbody, tmp.flowline)
    print(
        f"Identified {df.loc[ix].contains.sum():,} more flowlines contained by waterbodies in {time() - contained_start:.2f}s"
    )

    # Sanity check: flowlines should only ever be contained by one waterbody
    if df.loc[df.contains].groupby("lineID").size().max() > 1:
        raise ValueError(
            "ERROR: one or more lines contained by multiple waterbodies")

    # for any that are not completely contained, find the ones that overlap
    crosses_start = time()
    df["crosses"] = False
    ix = ~df.contains
    tmp = df.loc[ix]
    df.loc[ix, "crosses"] = pg.crosses(tmp.waterbody, tmp.flowline)
    print(
        f"Identified {df.crosses.sum():,} flowlines that cross edge of waterbodies in {time() - crosses_start:.2f}s"
    )

    # discard any that only touch (ones that don't cross or are contained)
    # note that we only cut the ones that cross below; contained ones are left intact
    df = df.loc[df.contains | df.crosses].copy()

    print("Intersecting flowlines and waterbodies...")
    cut_start = time()
    ix = df.crosses
    tmp = df.loc[ix]
    df["geometry"] = df.flowline
    # use intersection to cut flowlines by waterbodies.  Note: this may produce
    # nonlinear (e.g., geom collection) results
    df.loc[ix, "geometry"] = pg.intersection(tmp.flowline, tmp.waterbody)
    # length: part of the flowline inside the waterbody; flength: full flowline
    df["length"] = pg.length(df.geometry)
    df["flength"] = pg.length(df.flowline)

    # Cut lines that are long enough and different enough from the original lines
    df["to_cut"] = False
    tmp = df.loc[df.crosses]
    keep = (tmp.crosses
            & (tmp.length >= CUT_TOLERANCE)
            & ((tmp.flength - tmp.length).abs() >= CUT_TOLERANCE))
    df.loc[keep[keep].index, "to_cut"] = True
    # fraction of the flowline that is inside the waterbody
    df["inside"] = (df.length / df.flength).clip(0, 1)
    print(
        f"Found {df.to_cut.sum():,} segments that need to be cut by flowlines in {time() - cut_start:.2f}s"
    )

    # save all that are completely contained or mostly contained.
    # They must be at least 50% in waterbody to be considered mostly contained.
    # Note: there are some that are mostly outside and we exclude those here.
    # We then update this after cutting
    contained = df.loc[df.inside >= 0.5, ["wbID", "lineID"]].copy()

    ### Cut lines
    if df.to_cut.sum():
        # only work with those to cut from here on out
        df = df.loc[df.to_cut,
                    ["lineID", "flowline", "wbID", "waterbody"]].reset_index(
                        drop=True)

        # save waterbody ids to re-evaluate intersection after cutting
        wbID = df.wbID.unique()

        # extract all intersecting interior rings for these waterbodies
        print("Extracting interior rings for intersected waterbodies")
        wb = waterbodies.loc[waterbodies.index.isin(wbID)]
        # presumably outer_index points at the waterbody each ring belongs
        # to; verify against get_interior_rings
        outer_index, inner_index, rings = get_interior_rings(
            wb.geometry.values.data)
        if len(outer_index):
            # find the pairs of waterbody rings and lines to add
            rings = np.asarray(rings)
            wb_with_rings = wb.index.values.take(outer_index)
            lines_in_wb = df.loc[df.wbID.isin(wb_with_rings)].lineID.unique()
            lines_in_wb = flowlines.loc[flowlines.index.isin(
                lines_in_wb)].geometry
            tree = pg.STRtree(rings)
            # here left indexes the lines (query), right the rings (tree)
            left, right = tree.query_bulk(lines_in_wb.values.data,
                                          predicate="intersects")

            tmp = pd.DataFrame({
                "lineID": lines_in_wb.index.values.take(left),
                "flowline": lines_in_wb.values.data.take(left),
                "wbID": wb_with_rings.take(right),
                "waterbody": rings.take(right),
            })
            df = df.append(tmp, ignore_index=True, sort=False)

        # extract the outer ring for original waterbodies
        # (type id 3 is Polygon; interior rings added above are already rings)
        ix = pg.get_type_id(df.waterbody.values.data) == 3
        df.loc[ix, "waterbody"] = pg.get_exterior_ring(
            df.loc[ix].waterbody.values.data)

        # Calculate all geometric intersections between the flowlines and
        # waterbody rings and drop any that are not points
        # Note: these may be multipoints where line crosses the ring of waterbody
        # multiple times.
        # We ignore any shared edges, etc that result from the intersection; those
        # aren't helpful for cutting the lines
        print("Finding cut points...")
        df["geometry"] = pg.intersection(df.flowline.values,
                                         df.waterbody.values)
        df = explode(
            explode(
                gp.GeoDataFrame(df[["geometry", "lineID", "flowline"]],
                                crs=flowlines.crs))).reset_index()
        # type id 0 is Point
        points = (df.loc[pg.get_type_id(df.geometry.values.data) ==
                         0].set_index("lineID").geometry)

        print("cutting flowlines")
        cut_start = time()
        flowlines, joins = cut_flowlines_at_points(flowlines,
                                                   joins,
                                                   points,
                                                   next_lineID=next_lineID)
        new_flowlines = flowlines.loc[flowlines.new]

        print(
            f"{len(new_flowlines):,} new flowlines created in {time() - cut_start:,.2f}s"
        )

        if len(new_flowlines):
            # remove any flowlines no longer present (they were replaced by cut lines)
            contained = contained.loc[contained.lineID.isin(
                flowlines.loc[~flowlines.new].index.unique())].copy()

            contained_start = time()
            # recalculate overlaps with waterbodies
            print("Recalculating overlaps with waterbodies")
            wb = waterbodies.loc[wbID]
            tree = pg.STRtree(new_flowlines.geometry.values.data)
            left, right = tree.query_bulk(wb.geometry.values.data,
                                          predicate="intersects")

            df = pd.DataFrame({
                "lineID": new_flowlines.index.take(right),
                "flowline": new_flowlines.geometry.values.data.take(right),
                "wbID": wb.index.take(left),
                "waterbody": wb.geometry.values.data.take(left),
            })

            pg.prepare(df.waterbody.values)
            df["contains"] = pg.contains(df.waterbody.values,
                                         df.flowline.values)
            print(
                f"Identified {df.contains.sum():,} more flowlines contained by waterbodies in {time() - contained_start:.2f}s"
            )

            # some aren't perfectly contained, add those that are mostly in
            df["crosses"] = False
            ix = ~df.contains
            tmp = df.loc[ix]
            df.loc[ix, "crosses"] = pg.crosses(tmp.waterbody, tmp.flowline)

            # discard any that only touch (don't cross or are contained)
            df = df.loc[df.contains | df.crosses].copy()

            tmp = df.loc[df.crosses]
            df["geometry"] = df.flowline
            # use intersection to cut flowlines by waterbodies.  Note: this may produce
            # nonlinear (e.g., geom collection) results
            # NOTE(review): ``ix`` was computed before the filter above; it
            # appears to select the same rows as df.crosses on the filtered
            # frame, but only through index alignment - confirm
            df.loc[ix, "geometry"] = pg.intersection(tmp.flowline,
                                                     tmp.waterbody)
            df["length"] = pg.length(df.geometry)
            df["flength"] = pg.length(df.flowline)

            # keep any that are contained or >= 50% in waterbody
            contained = contained.append(
                df.loc[df.contains | ((df.length / df.flength) >= 0.5),
                       ["wbID", "lineID"]],
                ignore_index=True,
            )

        # "new" is the marker column added by cut_flowlines_at_points
        flowlines = flowlines.drop(columns=["new"])

    # make sure that updated joins are unique
    joins = joins.drop_duplicates()

    # make sure that wb_joins is unique
    contained = contained.groupby(by=["lineID", "wbID"]).first().reset_index()

    # set flag for flowlines in waterbodies
    flowlines["waterbody"] = flowlines.index.isin(contained.lineID.unique())

    print("Done evaluating waterbody / flowline overlap in {:.2f}s".format(
        time() - start))

    return flowlines, joins, contained
def cut_flowlines_at_points(flowlines, joins, points, next_lineID):
    """Cut flowlines at points and update joins to reference the new segments.

    Only cut points located at least SNAP_ENDPOINT_TOLERANCE (measured along
    the line) from both the start and the end of their line are used; points
    closer to an endpoint are ignored. Cut positions are sorted in ascending
    order along each line before cutting, so the original ordering of points
    per line is not preserved.

    Parameters
    ----------
    flowlines : GeoDataFrame
        indexed by lineID; must include "geometry" and "length" columns
    joins : DataFrame
        flowline joins
    points : GeoSeries
        points to cut flowlines, must be indexed to join against flowlines;
        one record per singular Point.
    next_lineID : int
        id of next flowline to be created

    Returns
    -------
    (GeoDataFrame, DataFrame)
        Updated flowlines and joins.
        Note: flowlines have a "new" column to identify new flowlines
        created here. The input flowlines frame is not modified.
    """

    df = flowlines.join(points.rename("point"), how="inner")
    df["pos"] = pg.line_locate_point(df.geometry.values.data, df.point.values.data)

    # only keep cut points that are sufficiently interior to the line
    # (i.e., not too close to endpoints)
    ix = (df.pos >= SNAP_ENDPOINT_TOLERANCE) & (
        (df["length"] - df.pos).abs() >= SNAP_ENDPOINT_TOLERANCE
    )

    # sort remaining cut points in ascending order on their lines
    df = df.loc[ix].sort_values(by=["lineID", "pos"])

    # convert to plain DataFrame so that we can extract coords
    grouped = pd.DataFrame(
        df.groupby("lineID").agg({"geometry": "first", "pos": list})
    )
    grouped["geometry"] = grouped.geometry.values.data

    # outer_ix gives, for each new line, the position of its original line
    # within `grouped`; the second return value is not needed here
    outer_ix, _inner_ix, lines = cut_lines_at_points(
        grouped.geometry.apply(lambda x: pg.get_coordinates(x)).values,
        grouped.pos.apply(np.array).values,
    )

    lines = np.asarray(lines)
    new_flowlines = gp.GeoDataFrame(
        {
            "lineID": (next_lineID + np.arange(len(outer_ix))).astype("uint32"),
            "origLineID": grouped.index.take(outer_ix),
            "geometry": lines,
            "length": pg.length(lines).astype("float32"),
            "sinuosity": calculate_sinuosity(lines).astype("float32"),
        }
    ).join(
        # carry over all remaining attributes of the original flowline
        flowlines.drop(
            columns=[
                "geometry",
                "lineID",
                "xmin",
                "ymin",
                "xmax",
                "ymax",
                "length",
                "sinuosity",
            ],
            errors="ignore",
        ),
        on="origLineID",
    )

    ### Update flowline joins
    # transform new lines to create new joins at the upstream / downstream most
    # points of the original line
    line_ids = new_flowlines.groupby("origLineID").lineID

    # the first new line per original line is the furthest upstream, so use its
    # ID as the new downstream ID for anything that had this origLineID as its downstream
    first = line_ids.first().rename("new_downstream_id")

    # the last new line per original line is the furthest downstream...
    last = line_ids.last().rename("new_upstream_id")

    # Update existing joins with the new lineIDs we created at the upstream or downstream
    # ends of segments we just created
    joins = update_joins(
        joins,
        first,
        last,
        downstream_col="downstream_id",
        upstream_col="upstream_id",
    )

    ### Create new line joins for any that weren't inserted above
    # Transform all groups of new line IDs per original lineID
    # into joins structure
    atts = (
        new_flowlines.groupby("origLineID")[["NHDPlusID", "loop", "HUC4"]]
        .first()
        .rename(columns={"NHDPlusID": "upstream"})
    )

    def _consecutive_pairs(ids):
        # (upstream, downstream) pairs of consecutive new line IDs
        return pd.Series(zip(ids[:-1], ids[1:]))

    new_joins = (
        line_ids.apply(_consecutive_pairs)
        .apply(pd.Series)
        .reset_index()
        .rename(columns={0: "upstream_id", 1: "downstream_id"})
        .join(atts, on="origLineID")
    )

    # NHDPlusID is same for both sides
    new_joins["downstream"] = new_joins.upstream
    new_joins["type"] = "internal"
    # new joins do not terminate in marine, so marine should always be false
    new_joins["marine"] = False
    new_joins = new_joins[
        [
            "upstream",
            "downstream",
            "upstream_id",
            "downstream_id",
            "type",
            "loop",
            "marine",
            "HUC4",
        ]
    ]

    # pd.concat instead of DataFrame.append, which was removed in pandas 2.0
    joins = (
        pd.concat([joins, new_joins], ignore_index=True, sort=False)
        .sort_values(["downstream", "upstream", "downstream_id", "upstream_id"])
        .reset_index(drop=True)
    )

    # original flowlines that were cut are replaced by their new segments
    remove_ids = new_flowlines.origLineID.unique()
    retained = flowlines.loc[~flowlines.index.isin(remove_ids)].reset_index()
    # flag on the copy so the caller's frame is not mutated
    retained["new"] = False
    new_flowlines["new"] = True

    flowlines = pd.concat(
        [retained, new_flowlines.drop(columns=["origLineID"])],
        ignore_index=True,
        sort=False,
    ).set_index("lineID")

    return flowlines, joins
layer=network_layer, as_pygeos=True, columns=[NET_COLS]) src_crs = networks.crs networks = networks.rename(columns={ "batNetID": "networkID", "StreamOrde": "streamorder" }).set_index("networkID") # convert to LineStrings networks.geometry = pg.get_geometry(networks.geometry, 0) # project to crs networks.geometry = to_crs(networks.geometry, src_crs, CRS) networks["length"] = pg.length(networks.geometry) networks["miles"] = networks.length * 0.000621371 # sinuosity of each segment networks["sinuosity"] = calculate_sinuosity(networks.geometry) # aggregate up to the network network_length = networks.groupby(level=0)[["length"]].sum() temp_df = networks[["length", "sinuosity"]].join(network_length, rsuffix="_total") # Calculate length-weighted sinuosity wtd_sinuosity = ((temp_df.sinuosity * (temp_df.length / temp_df.length_total)).groupby( level=0).sum().rename("sinuosity")) # convert to miles
def test_length_missing():
    """The length of a missing (None) geometry is NaN."""
    assert np.isnan(pygeos.length(None))
def test_length_empty():
    """The length of the ``Empty`` geometry fixture is NaN."""
    assert np.isnan(pygeos.length(Empty))
def street_profile(streets, buildings, distance=3, tick_length=50):
    """Measure street profile characteristics from perpendicular ticks.

    Points are interpolated along every street line, a pair of ticks (one to
    each side) is generated at every point, and each tick is intersected with
    the buildings. The distance from the point to the nearest building hit by
    the tick is recorded; ticks that hit nothing are recorded as NaN.

    Parameters
    ----------
    streets : GeoDataFrame
        street lines; geometries are read via ``streets.geometry.values.data``
        (presumably a pygeos-backed GeoDataFrame - verify against caller)
    buildings : GeoDataFrame
        building geometries, read the same way
    distance : numeric, default 3
        spacing used to derive the number of points per line
        (``length // distance`` points, evenly spaced from 0 to ``length``)
    tick_length : numeric, default 50
        full length of a tick; half of it is used as the one-sided fallback
        distance when no tick on that side hits a building

    Returns
    -------
    tuple of (list, list, list)
        ``(widths, deviations, openness)``, each with one value per street:
        the mean width, the standard deviation of the tick distances (NaNs
        ignored), and the share of ticks that did not hit any building.
    """
    street_lines = streets.geometry.values.data
    lengths = pygeos.length(street_lines)

    # Collect per-line coordinates and concatenate once; repeatedly calling
    # np.append copies the whole accumulated array on every iteration.
    coord_chunks = [np.empty((0, 2))]
    ids = []
    for ix, (line, length) in enumerate(zip(street_lines, lengths)):
        pts = pygeos.line_interpolate_point(
            line, np.linspace(0, length, num=int(length // distance))
        )
        coord_chunks.append(pygeos.get_coordinates(pts))
        # two ticks (one per side) are generated for every point
        ids.extend([ix] * (2 * len(pts)))
    list_points = np.concatenate(coord_chunks, axis=0)

    def _tick_pair(anchor, angle):
        # build the two ticks that share `anchor` as their second vertex
        end_1 = _getPoint1(anchor, angle, tick_length / 2)
        end_2 = _getPoint2(end_1, _getAngle(end_1, anchor), tick_length)
        return [end_1, anchor], [end_2, anchor]

    n_points = len(list_points)
    ticks = []
    # `num` is 1-based, so list_points[num] is the point following `pt`
    for num, pt in enumerate(list_points, 1):
        # start chainage 0
        if num == 1:
            ticks.extend(_tick_pair(pt, _getAngle(pt, list_points[num])))

        # everything in between (ticks are anchored at the following point)
        if num < n_points - 1:
            ticks.extend(
                _tick_pair(list_points[num], _getAngle(pt, list_points[num]))
            )

        # end chainage
        if num == n_points:
            ticks.extend(_tick_pair(pt, _getAngle(list_points[num - 2], pt)))

    ticks = pygeos.linestrings(ticks)
    building_geoms = buildings.geometry.values.data
    inp, res = pygeos.STRtree(ticks).query_bulk(
        building_geoms, predicate='intersects'
    )
    intersections = pygeos.intersection(ticks[res], building_geoms[inp])
    # ticks 2k and 2k + 1 both belong to point k
    distances = pygeos.distance(intersections, pygeos.points(list_points[res // 2]))

    dists = np.full(len(ticks), np.nan)
    # A tick may hit several buildings, in which case `res` repeats its index.
    # Keep the nearest hit instead of letting the last one overwrite the rest;
    # fmin ignores the initial NaN.
    np.fmin.at(dists, res, distances)

    ids = np.array(ids)
    widths = []
    openness = []
    deviations = []
    for i in range(len(streets)):
        mask = ids == i
        street_dists = dists[mask]
        lefts = street_dists[::2]
        rights = street_dists[1::2]
        # fall back to half of the tick length when a side never hits a building
        left_mean = tick_length / 2 if np.isnan(lefts).all() else np.nanmean(lefts)
        right_mean = (
            tick_length / 2 if np.isnan(rights).all() else np.nanmean(rights)
        )
        widths.append(np.mean([left_mean, right_mean]) * 2)
        openness.append(np.isnan(street_dists).sum() / mask.sum())
        deviations.append(np.nanstd(street_dists))

    return (widths, deviations, openness)
def cut_lines_by_waterbodies(flowlines, joins, waterbodies, wb_joins, out_dir):
    """
    Cut lines by waterbodies.
    1. Intersects all previously intersected flowlines with waterbodies.
    2. For those that cross but are not completely contained by waterbodies, cut them.
    3. Evaluate the cuts, only those that have substantive cuts inside and outside are retained as cuts.
    4. Any flowlines that are not contained or crossing waterbodies are dropped from joins

    Note: invalid waterbody geometries are repaired in place in the
    ``waterbodies`` frame that is passed in, and the "waterbody" flag may be
    set in place on the ``flowlines`` frame that is passed in.

    Parameters
    ----------
    flowlines : GeoDataFrame
        indexed by lineID
    joins : DataFrame
        flowline joins
    waterbodies : GeoDataFrame
        indexed by wbID
    wb_joins : DataFrame
        waterbody flowline joins (lineID, wbID columns)
    out_dir : pathlib.Path
        output directory for writing error files, if needed

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame, GeoDataFrame, DataFrame)
        (flowlines, joins, waterbodies, waterbody joins)
    """

    start = time()

    fl_geom = flowlines.loc[flowlines.index.isin(wb_joins.lineID), ["geometry"]].copy()

    # Many waterbodies have interior polygons (islands); these break the analysis below for cutting lines
    # Extract a new polygon of just their outer boundary
    wb_geom = waterbodies[["geometry"]].copy()
    wb_geom["waterbody"] = pg.polygons(pg.get_exterior_ring(wb_geom.geometry))

    print("Validating waterbodies...")
    ix = ~pg.is_valid(wb_geom.waterbody)
    invalid_count = ix.sum()
    if invalid_count:
        print("{:,} invalid waterbodies found, repairing...".format(invalid_count))

        # Buffer by 0 to fix
        # TODO: may need to do this by a small fraction and simplify instead
        repair_start = time()
        wb_geom.loc[ix, "waterbody"] = pg.buffer(wb_geom.loc[ix].waterbody, 0)
        waterbodies.loc[ix, "geometry"] = wb_geom.loc[ix].waterbody
        print("Repaired geometry in {:.2f}s".format(time() - repair_start))

    # Set indices and create combined geometry object for analysis
    wb_joins = wb_joins.set_index(["lineID", "wbID"])
    geoms = wb_joins.join(fl_geom, how="inner").join(wb_geom.waterbody)

    ### Find contained geometries
    print(
        "Identifying flowlines completely within waterbodies out of {:,} flowline / waterbody combinations...".format(
            len(geoms)
        )
    )
    contained_start = time()
    geoms["inside"] = pg.contains(geoms.waterbody.values, geoms.geometry.values)

    print(
        "Identified {:,} flowlines completely contained by waterbodies in {:.2f}s".format(
            geoms.inside.sum(), time() - contained_start
        )
    )

    # Check for logic errors - no flowline should be completely contained by more than 1 waterbody
    errors = geoms.groupby(level=[0]).inside.sum().astype("uint8") > 1
    # .any() (rather than .max()) so that an empty result is not treated as an error
    if errors.any():
        # this most likely indicates duplicate waterbodies, which should have been resolved before this
        print(
            "ERROR: major logic error - some flowlines claim to be completely contained by multiple waterbodies"
        )
        print(
            "===> error flowlines written to {}/contained_errors.feather".format(
                out_dir
            )
        )
        # select the flagged lineIDs; `errors` itself holds booleans, not ids
        to_geofeather(
            flowlines.loc[
                flowlines.index.isin(errors.loc[errors].index)
            ].reset_index(),
            out_dir / "contained_errors.feather",
            crs=CRS,
        )

    ### Check those that aren't contained to see if they cross
    print("Determining which flowlines actually cross into waterbodies...")
    cross_start = time()
    geoms = geoms.loc[~geoms.inside].copy()
    geoms["crosses"] = pg.crosses(geoms.geometry, geoms.waterbody)

    outside = geoms.loc[~(geoms["crosses"] | geoms.inside)].index

    # keep the ones that cross for further processing
    geoms = geoms.loc[geoms.crosses].copy()

    print(
        "Identified {:,} flowlines completely outside waterbodies and {:,} flowlines that cross waterbody boundaries in {:.2f}s".format(
            len(outside), len(geoms), time() - cross_start
        )
    )

    # Any that do not cross and are not completely within waterbodies should be dropped now
    # Can only drop joins by BOTH lineID and wbID (the index here)
    # Also drop associated waterbodies that no longer have joins
    wb_joins = wb_joins.loc[~wb_joins.index.isin(outside)].copy()

    # FIXME: for closely adjacent waterbodies, these are important to keep
    # Need to cut them by their multiple polys, update their joins, and feed back into following analysis
    # pg.intersection_all might work here

    # check for multiple crossings - these are errors from NHD that we can drop from here
    errors = geoms.groupby(level=0).size() > 1
    if errors.any():
        print(
            "Found {:,} flowlines that cross multiple waterbodies.  These are bad data and will be dropped from waterbody intersection.".format(
                errors.sum()
            )
        )

        # only write the flowlines that are actually flagged
        to_geofeather(
            flowlines.loc[errors.loc[errors].index].reset_index(),
            out_dir / "error_crosses_multiple.feather",
            crs=CRS,
        )

        # completely remove the flowlines from intersections and drop the waterbodies
        wb_joins = wb_joins.loc[
            ~wb_joins.index.get_level_values(0).isin(errors.loc[errors].index)
        ].copy()
        waterbodies = waterbodies.loc[
            waterbodies.index.isin(wb_joins.index.get_level_values(1))
        ].copy()
        geoms = geoms.loc[geoms.index.isin(wb_joins.index)].copy()

    print("Calculating geometric intersection of flowlines and waterbodies...")
    int_start = time()
    geoms = geoms[["geometry", "waterbody"]].join(flowlines.length.rename("origLength"))

    # First, calculate the geometric intersection between the lines and waterbodies
    # WARNING: this intersection may return LineString, MultiLineString, Point, GeometryCollection
    geoms["intersection"] = pg.intersection(geoms.geometry, geoms.waterbody)
    types = pg.get_type_id(geoms.intersection)
    # NOTE: all the points should be captured by the above logic for crosses
    is_point = types.isin([0, 4])
    is_line = types.isin([1, 5])

    others = types[~(is_point | is_line)].unique()
    # GeometryCollection indicates a mess, skip those
    if len(others):
        print(
            "WARNING: Found other types of geometric intersection: {} (n={:,}), these will be dropped".format(
                others, len(types[~(is_point | is_line)])
            )
        )

    # Any that intersect only at a point are OUTSIDE
    outside = geoms.loc[is_point].index  # TODO: confirm this works
    wb_joins = wb_joins.loc[~wb_joins.index.isin(outside)].copy()
    print("Identified {:,} more flowlines outside waterbodies".format(len(outside)))

    # Drop those that are not lines from further analysis
    geoms = geoms.loc[is_line].copy()

    # Inspect amount of overlay - if the intersected length is within 1m of final length, it is completely within
    # if it is near 0, it is completely outside
    geoms["length"] = pg.length(geoms.intersection)
    outside = geoms.length < 1
    inside = (geoms.origLength - geoms.length).abs() < 1

    print(
        "Found {:,} more completely outside, {:,} completely inside".format(
            outside.sum(), inside.sum()
        )
    )

    # drop the ones that are outside
    wb_joins = wb_joins.loc[~wb_joins.index.isin(outside[outside].index)].copy()

    # cut the ones that aren't completely inside or outside
    geoms = geoms.loc[~(inside | outside)].copy()

    print("Done evaluating intersection in {:.2f}s".format(time() - int_start))

    if len(geoms):
        print("Cutting {:,} flowlines ...".format(len(geoms)))
        cut_start = time()
        geoms = geoms[["geometry", "waterbody", "origLength"]]

        # WARNING: difference is not precise, the point of split is not exactly at the intersection between lines
        # but within some tolerance.  This will cause them to fail the contains() test below.
        boundary = pg.boundary(geoms.waterbody)
        geoms["geometry"] = pg.difference(geoms.geometry, boundary)

        errors = ~pg.is_valid(geoms.geometry)
        if errors.any():
            print("WARNING: geometry errors for {:,} cut lines".format(errors.sum()))

        length = pg.length(geoms.geometry)
        errors = (length - geoms.origLength).abs() > 1
        if errors.any():
            print(
                "WARNING: {:,} lines were not completely cut by waterbodies (maybe shared edge?).\nThese will not be cut".format(
                    errors.sum()
                )
            )
            to_geofeather(
                flowlines.loc[
                    errors.loc[errors].index.get_level_values(0).unique()
                ].reset_index(),
                out_dir / "error_incomplete_cut.feather",
                crs=CRS,
            )

            # remove these from the cut geoms and retain their originals
            geoms = geoms.loc[~errors].copy()

        # Explode the multilines into single line segments
        geoms["geometry"] = explode(geoms.geometry)
        geoms = geoms.explode("geometry")

        # mark those parts of the cut lines that are within waterbodies
        # WARNING: this is not capturing all that should be inside after cutting!
        geoms["iswithin"] = pg.contains(geoms.waterbody, geoms.geometry)

        errors = geoms.groupby(level=0).iswithin.max() == False
        if errors.any():
            print(
                "WARNING: {:,} flowlines that cross waterbodies had no parts contained within those waterbodies".format(
                    errors.sum()
                )
            )
            # only write the flowlines that are actually flagged
            to_geofeather(
                flowlines.loc[errors.loc[errors].index].reset_index(),
                out_dir / "error_crosses_but_not_contained.feather",
                crs=CRS,
            )

            # If they cross, assume they are within
            print("Attempting to correct these based on which ones cross")
            ix = geoms.loc[
                geoms.index.get_level_values(0).isin(errors.loc[errors].index)
            ].index
            geoms.loc[ix, "iswithin"] = pg.crosses(
                geoms.loc[ix].geometry, geoms.loc[ix].waterbody
            )

            errors = geoms.groupby(level=0).iswithin.max() == False
            print("{:,} still have no part in a waterbody".format(errors.sum()))

        # calculate total length of within and outside parts
        geoms["length"] = pg.length(geoms.geometry)

        # drop any new segments that are < 1m, these are noise
        print("Dropping {:,} new segments < 1m".format((geoms.length < 1).sum()))
        geoms = geoms.loc[geoms.length >= 1].copy()

        if len(geoms) > 1:
            length = geoms.groupby(["lineID", "wbID", "iswithin"]).agg(
                {"length": "sum", "origLength": "first"}
            )

            # Anything within 1 meter of original length is considered unchanged
            # This is so that we ignore slivers
            length["unchanged"] = (length.origLength - length["length"]).abs() < 1
            unchanged = (
                length[["unchanged"]]
                .reset_index()
                .groupby(["lineID", "wbID"])
                .unchanged.max()
                .rename("max_unchanged")
            )
            unchanged = (
                length.reset_index().set_index(["lineID", "wbID"]).join(unchanged)
            )
            is_within = (
                unchanged.loc[unchanged.max_unchanged]
                .reset_index()
                .set_index(["lineID", "wbID"])
                .iswithin
            )

            # For any that are unchanged and NOT within waterbodies,
            # remove them from wb_joins
            ix = is_within.loc[~is_within].index
            wb_joins = wb_joins.loc[~wb_joins.index.isin(ix)].copy()

            # Remove any that are unchanged from intersection analysis
            geoms = geoms.loc[~geoms.index.isin(is_within.index)].copy()

            print(
                "Created {:,} new flowlines by splitting {:,} flowlines at waterbody edges in {:.2f}".format(
                    len(geoms),
                    len(geoms.index.get_level_values(0).unique()),
                    time() - cut_start,
                )
            )

            if len(geoms) > 1:
                ### These are our final new lines to add
                # remove their lineIDs from flowlines and append
                # replace their outer joins to these ones and add intermediates

                # Join in previous line information from flowlines
                new_lines = (
                    geoms[["geometry", "length", "iswithin"]]
                    .reset_index()
                    .set_index("lineID")
                    .join(flowlines.drop(columns=["geometry", "length", "sinuosity"]))
                    .reset_index()
                    .rename(columns={"lineID": "origLineID", "iswithin": "waterbody"})
                )

                error = (
                    new_lines.groupby("origLineID").wbID.unique().apply(len).max() > 1
                )
                if error:
                    # Watch for errors - if a flowline is cut by multiple waterbodies
                    # there will be problems with our logic for splicing in new lines
                    # also - our intersection logic above is wrong
                    print(
                        "\n========\n MAJOR LOGIC ERROR: multiple waterbodies associated "
                        "with a single flowline that as been cut. \n========\n "
                    )

                # recalculate length and sinuosity
                new_lines["length"] = pg.length(new_lines.geometry).astype("float32")
                new_lines["sinuosity"] = calculate_sinuosity(new_lines.geometry).astype(
                    "float32"
                )

                # calculate new IDS
                next_segment_id = int(flowlines.index.max() + 1)
                new_lines["lineID"] = next_segment_id + new_lines.index
                new_lines.lineID = new_lines.lineID.astype("uint32")

                ### Update waterbody joins
                # remove joins replaced by above
                ix = new_lines.set_index(["origLineID", "wbID"]).index
                wb_joins = wb_joins.loc[~wb_joins.index.isin(ix)].copy()

                # add new joins
                # (pd.concat instead of DataFrame.append, which was removed in pandas 2.0)
                wb_joins = pd.concat(
                    [
                        wb_joins.reset_index(),
                        new_lines.loc[new_lines.waterbody, ["lineID", "wbID"]],
                    ],
                    ignore_index=True,
                    sort=False,
                ).set_index(["lineID", "wbID"])

                ### Update flowline joins
                # transform new lines to create new joins
                line_ids = new_lines.groupby("origLineID").lineID

                # the first new line per original line is the furthest upstream, so use its
                # ID as the new downstream ID for anything that had this origLineID as its downstream
                first = line_ids.first().rename("new_downstream_id")

                # the last new line per original line is the furthest downstream...
                last = line_ids.last().rename("new_upstream_id")

                # Update existing joins with the new lineIDs we created at the upstream or downstream
                # ends of segments we just created
                joins = update_joins(
                    joins,
                    first,
                    last,
                    downstream_col="downstream_id",
                    upstream_col="upstream_id",
                )

                ### Create new line joins for any that weren't inserted above
                # Transform all groups of new line IDs per original lineID, wbID
                # into joins structure
                pairs = lambda a: pd.Series(zip(a[:-1], a[1:]))
                new_joins = (
                    new_lines.groupby(["origLineID", "wbID"])
                    .lineID.apply(pairs)
                    .apply(pd.Series)
                    .reset_index()
                    .rename(columns={0: "upstream_id", 1: "downstream_id"})
                    .join(
                        flowlines[["NHDPlusID", "loop"]].rename(
                            columns={"NHDPlusID": "upstream"}
                        ),
                        on="origLineID",
                    )
                )

                # NHDPlusID is same for both sides
                new_joins["downstream"] = new_joins.upstream
                new_joins["type"] = "internal"
                new_joins = new_joins[
                    [
                        "upstream",
                        "downstream",
                        "upstream_id",
                        "downstream_id",
                        "type",
                        "loop",
                    ]
                ]

                joins = pd.concat(
                    [joins, new_joins], ignore_index=True, sort=False
                ).sort_values(["downstream_id", "upstream_id"])

                ### Update flowlines
                # remove originals now replaced by cut versions here
                flowlines = (
                    pd.concat(
                        [
                            flowlines.loc[
                                ~flowlines.index.isin(new_lines.origLineID)
                            ].reset_index(),
                            new_lines[
                                ["lineID"] + list(flowlines.columns) + ["waterbody"]
                            ],
                        ],
                        ignore_index=True,
                        sort=False,
                    )
                    .sort_values("lineID")
                    .set_index("lineID")
                )

                # End cut geometries

    # Update waterbody bool for other flowlines based on those that completely intersected
    # above
    flowlines.loc[
        flowlines.index.isin(wb_joins.index.get_level_values(0).unique()), "waterbody"
    ] = True
    flowlines.waterbody = flowlines.waterbody.fillna(False)

    ### Update waterbodies and calculate flowline stats
    wb_joins = wb_joins.reset_index()
    stats = (
        wb_joins.join(flowlines.length.rename("flowlineLength"), on="lineID")
        .groupby("wbID")
        .flowlineLength.sum()
        .astype("float32")
    )
    waterbodies = waterbodies.loc[waterbodies.index.isin(wb_joins.wbID)].join(stats)

    print("Done cutting flowlines by waterbodies in {:.2f}s".format(time() - start))

    return flowlines, joins, waterbodies, wb_joins