def test_line_locate_point_empty(normalized):
    assert np.isnan(
        pygeos.line_locate_point(line_string, empty_point, normalized=normalized)
    )
    assert np.isnan(
        pygeos.line_locate_point(empty_line_string, point, normalized=normalized)
    )
def cut_line_at_points(line, cut_points, tolerance=1e-6):
    """Cut a pygeos line geometry at points.
    If there are no interior points, the original line will be returned.

    Parameters
    ----------
    line : pygeos Linestring
    cut_points : list-like of pygeos Points
        will be projected onto the line; those interior to the line will be
        used to cut the line into new segments.
    tolerance : float, optional (default: 1e-6)
        minimum distance from endpoints to consider the points interior
        to the line.

    Returns
    -------
    MultiLineString (or LineString, if unchanged)
    """
    if not pg.get_type_id(line) == 1:
        raise ValueError("line is not a single linestring")

    vertices = pg.get_point(line, range(pg.get_num_points(line)))
    offsets = pg.line_locate_point(line, vertices)
    cut_offsets = pg.line_locate_point(line, cut_points)
    # only keep those that are interior to the line and ignore those very close
    # to endpoints or beyond endpoints
    cut_offsets = cut_offsets[
        (cut_offsets > tolerance) & (cut_offsets < offsets[-1] - tolerance)
    ]

    if len(cut_offsets) == 0:
        # nothing to cut, return original
        return line

    # get coordinates of new vertices from the cut points (interpolated onto the line)
    cut_offsets.sort()

    # add in the last coordinate of the line
    cut_offsets = np.append(cut_offsets, offsets[-1])

    # TODO: convert this to a pygeos ufunc
    coords = pg.get_coordinates(line)
    cut_coords = pg.get_coordinates(pg.line_interpolate_point(line, cut_offsets))

    lines = []
    orig_ix = 0
    for cut_ix in range(len(cut_offsets)):
        offset = cut_offsets[cut_ix]

        segment = []
        if cut_ix > 0:
            segment = [cut_coords[cut_ix - 1]]
        while offsets[orig_ix] < offset:
            segment.append(coords[orig_ix])
            orig_ix += 1

        segment.append(cut_coords[cut_ix])
        lines.append(pg.linestrings(segment))

    return pg.multilinestrings(lines)
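# A minimal usage sketch for cut_line_at_points above (assumes pygeos is
# importable as pg, matching the function; the geometries are illustrative
# assumptions, not data from the original project):
import pygeos as pg

line = pg.linestrings([[0, 0], [10, 0]])
cut_points = [pg.points(2, 0.5), pg.points(7, -0.5)]  # projected onto the line
result = cut_line_at_points(line, cut_points)
# cut at x=2 and x=7 yields a MultiLineString of 3 segments
print(pg.get_num_geometries(result))  # 3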
def project(data, other, normalized=False):
    if compat.USE_PYGEOS:
        try:
            return pygeos.line_locate_point(data, other, normalized=normalized)
        except TypeError:  # support for pygeos<0.9
            return pygeos.line_locate_point(data, other, normalize=normalized)
    else:
        return _binary_op("project", data, other, normalized=normalized)
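# A hedged sketch of the keyword change the wrapper above papers over: pygeos
# renamed line_locate_point's `normalize` argument to `normalized` in 0.9, so
# the try/except keeps both versions working (geometries here are assumptions):
import pygeos

line = pygeos.linestrings([[0, 0], [0, 10]])
pt = pygeos.points(0, 4)
try:
    pos = pygeos.line_locate_point(line, pt, normalized=True)  # pygeos>=0.9
except TypeError:
    pos = pygeos.line_locate_point(line, pt, normalize=True)  # pygeos<0.9
print(pos)  # 0.4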
def test_line_locate_point_invalid_geometry(normalized):
    with pytest.raises(pygeos.GEOSException):
        pygeos.line_locate_point(line_string, line_string, normalized=normalized)

    with pytest.raises(pygeos.GEOSException):
        pygeos.line_locate_point(polygon, point, normalized=normalized)
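# The error behavior the test above checks, in isolation: GEOS raises if the
# first argument is not line-like or the second is not a point (geometries
# here are assumptions, not the test fixtures):
import pygeos

poly = pygeos.box(0, 0, 1, 1)
try:
    pygeos.line_locate_point(poly, pygeos.points(0.5, 0.5))
except pygeos.GEOSException as e:
    print("raised:", e)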
def test_line_locate_point_empty():
    assert np.isnan(pygeos.line_locate_point(line_string, empty_point))
    assert np.isnan(pygeos.line_locate_point(empty_line_string, point))
def test_line_locate_point_none():
    assert np.isnan(pygeos.line_locate_point(line_string, None))
    assert np.isnan(pygeos.line_locate_point(None, point))
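# A small sketch of the NaN behavior the empty/None tests rely on: missing
# (None) or empty inputs yield NaN positions, which callers can mask out
# afterwards (assumes pygeos and numpy are installed):
import numpy as np
import pygeos

line = pygeos.linestrings([[0, 0], [1, 0]])
pts = [pygeos.points(0.5, 0), None]
pos = pygeos.line_locate_point(line, pts)
valid = ~np.isnan(pos)
print(pos[valid])  # [0.5]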
def test_line_locate_point_geom_array2():
    points = pygeos.points([[0, 0], [1, 0]])
    actual = pygeos.line_locate_point(line_string, points)
    np.testing.assert_allclose(actual, [0.0, 1.0])
def test_line_locate_point_geom_array():
    point = pygeos.points(0, 1)
    actual = pygeos.line_locate_point([line_string, linear_ring], point)
    np.testing.assert_allclose(actual, [0.0, 3.0])
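# The two tests above exercise ufunc broadcasting from each side (an array of
# points against one line, and an array of lines against one point); a minimal
# standalone sketch with assumed geometries rather than the test fixtures:
import pygeos

lines = pygeos.linestrings([[[0, 0], [2, 0]], [[0, 0], [0, 4]]])
pt = pygeos.points(0, 1)
print(pygeos.line_locate_point(lines, pt))  # [0. 1.]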
def project(data, other, normalized=False):
    if compat.USE_PYGEOS:
        # NOTE: `normalize` is the pygeos<0.9 keyword; later versions renamed
        # it to `normalized`
        return pygeos.line_locate_point(data, other, normalize=normalized)
    else:
        return _binary_op("project", data, other, normalized=normalized)
def find_dam_face_from_waterbody(waterbody, drain_pt):
    total_area = pg.area(waterbody)
    ring = pg.get_exterior_ring(pg.normalize(waterbody))
    total_length = pg.length(ring)
    num_pts = pg.get_num_points(ring) - 1  # drop closing coordinate
    vertices = pg.get_point(ring, range(num_pts))

    ### Extract line segments that use no more than 1/3 of the polygon's coordinates,
    # starting from the vertex nearest the drain
    # note: lower numbers are to the right
    tree = pg.STRtree(vertices)
    ix = tree.nearest(drain_pt)[1][0]
    side_width = min(num_pts // 3, MAX_SIDE_PTS)
    left_ix = ix + side_width
    right_ix = ix - side_width

    # extract these as a left-to-right line
    pts = vertices[max(right_ix, 0) : min(num_pts, left_ix)][::-1]
    if left_ix >= num_pts:
        pts = np.append(vertices[0 : left_ix - num_pts][::-1], pts)

    if right_ix < 0:
        pts = np.append(pts, vertices[num_pts + right_ix : num_pts][::-1])

    coords = pg.get_coordinates(pts)

    if len(coords) > 2:
        # first run a simplification process to extract the major shape and bends
        # then run the straight line algorithm
        simp_coords, simp_ix = simplify_vw(
            coords, min(MAX_SIMPLIFY_AREA, total_area / 100)
        )

        if len(simp_coords) > 2:
            keep_coords, ix = extract_straight_segments(
                simp_coords, max_angle=MAX_STRAIGHT_ANGLE, loops=5
            )
            keep_ix = simp_ix.take(ix)

        else:
            keep_coords = simp_coords
            keep_ix = simp_ix

    else:
        keep_coords = coords
        keep_ix = np.arange(len(coords))

    ### Calculate the length of each run and drop any that are not sufficiently long
    lengths = segment_length(keep_coords)
    ix = (lengths >= MIN_DAM_WIDTH) & (lengths / total_length < MAX_WIDTH_RATIO)

    pairs = np.dstack([keep_ix[:-1][ix], keep_ix[1:][ix]])[0]

    # since ranges are ragged, we have to do this in a loop instead of vectorized
    segments = []
    for start, end in pairs:
        segments.append(pg.linestrings(coords[start : end + 1]))

    segments = np.array(segments)

    # only keep the segments that are close to the drain
    segments = segments[pg.intersects(segments, pg.buffer(drain_pt, MAX_DRAIN_DIST))]

    if not len(segments):
        return segments

    # only keep those where the drain is interior to the line
    pos = pg.line_locate_point(segments, drain_pt)
    lengths = pg.length(segments)
    ix = (pos >= MIN_INTERIOR_DIST) & (pos <= (lengths - MIN_INTERIOR_DIST))

    return segments[ix]
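# A standalone sketch of the final interior-position filter above: keep only
# lines where the point projects at least MIN_INTERIOR_DIST from both ends.
# The geometries and the 1.0 threshold are assumptions for illustration:
import pygeos as pg

MIN_INTERIOR_DIST = 1.0
segments = pg.linestrings([[[0, 0], [10, 0]], [[0, 0], [0, 0.5]]])
drain_pt = pg.points(4, 0)

pos = pg.line_locate_point(segments, drain_pt)
lengths = pg.length(segments)
ix = (pos >= MIN_INTERIOR_DIST) & (pos <= lengths - MIN_INTERIOR_DIST)
print(segments[ix])  # only the first segment passes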
def snap_to_flowlines(df, to_snap):
    """Snap to nearest flowline, within tolerance.

    Updates df with snapping results, and returns to_snap as the set of dams
    still needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """

    for region, HUC2s in list(REGION_GROUPS.items()):
        region_start = time()

        print("\n----- {} ------\n".format(region))

        print("Reading flowlines...")
        flowlines = from_geofeather(
            nhd_dir / "clean" / region / "flowlines.feather"
        ).set_index("lineID")

        in_region = to_snap.loc[to_snap.HUC2.isin(HUC2s)]
        print(
            "Selected {:,} barriers in region to snap against {:,} flowlines".format(
                len(in_region), len(flowlines)
            )
        )

        if len(in_region) == 0:
            print("No barriers in region to snap")
            continue

        print("Finding nearest flowlines...")
        # TODO: can use near instead of nearest, and persist list of near lineIDs
        # per barrier so that we can construct subnetworks with just those
        lines = nearest(
            in_region.geometry, flowlines.geometry, in_region.snap_tolerance
        )
        lines = lines.join(in_region.geometry).join(
            flowlines.geometry.rename("line"),
            on="lineID",
        )

        # project the point to the line,
        # find out its distance on the line,
        # then interpolate its new coordinates
        lines["geometry"] = pg.line_interpolate_point(
            lines.line, pg.line_locate_point(lines.line, lines.geometry)
        )

        ix = lines.index
        df.loc[ix, "snapped"] = True
        df.loc[ix, "geometry"] = lines.geometry
        df.loc[ix, "snap_dist"] = lines.distance
        df.loc[ix, "snap_ref_id"] = lines.lineID
        df.loc[ix, "lineID"] = lines.lineID
        df.loc[ix, "snap_log"] = ndarray_append_strings(
            "snapped: within ",
            to_snap.loc[ix].snap_tolerance,
            "m tolerance of flowline",
        )

        to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

        print(
            "{:,} barriers snapped in region in {:.2f}s".format(
                len(ix), time() - region_start
            )
        )

    # TODO: flag those that joined to loops

    return df, to_snap
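# The core snap step above in isolation: project each point onto its matched
# line with line_locate_point, then interpolate the snapped coordinate back
# with line_interpolate_point (a minimal sketch; geometries are assumptions):
import pygeos as pg

line = pg.linestrings([[0, 0], [10, 0]])
barrier = pg.points(3, 2)  # off the line by 2m
snapped = pg.line_interpolate_point(line, pg.line_locate_point(line, barrier))
print(snapped)  # POINT (3 0)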
def snap_to_flowlines(df, to_snap):
    """Snap to nearest flowline, within tolerance.

    Updates df with snapping results, and returns to_snap as the set of dams
    still needing to be snapped after this operation.

    If dams are within SNAP_ENDPOINT_TOLERANCE of the endpoints of the line,
    they will be snapped to the endpoint instead of the closest point on the
    line.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """

    print("=================\nSnapping to flowlines...")

    for huc2 in sorted(to_snap.HUC2.unique()):
        region_start = time()

        print(f"\n----- {huc2} ------")
        in_huc2 = to_snap.loc[to_snap.HUC2 == huc2].copy()

        flowlines = gp.read_feather(
            nhd_dir / "clean" / huc2 / "flowlines.feather",
            columns=["geometry", "lineID"],
        ).set_index("lineID")

        print(
            f"HUC {huc2} selected {len(in_huc2):,} barriers in region to snap against {len(flowlines):,} flowlines"
        )

        lines = nearest(
            pd.Series(in_huc2.geometry.values.data, index=in_huc2.index),
            pd.Series(flowlines.geometry.values.data, index=flowlines.index),
            in_huc2.snap_tolerance.values,
        )
        lines = lines.join(in_huc2.geometry).join(
            flowlines.geometry.rename("line"),
            on="lineID",
        )

        # project the point to the line and find out its distance on the line
        lines["line_pos"] = pg.line_locate_point(
            lines.line.values.data, lines.geometry.values.data
        )

        # if within tolerance of start point, snap to start
        ix = lines["line_pos"] <= SNAP_ENDPOINT_TOLERANCE
        lines.loc[ix, "line_pos"] = 0

        # if within tolerance of endpoint, snap to end
        end = pg.length(lines.line.values.data)
        ix = lines["line_pos"] >= end - SNAP_ENDPOINT_TOLERANCE
        lines.loc[ix, "line_pos"] = end[ix]

        # then interpolate its new coordinates
        lines["geometry"] = pg.line_interpolate_point(
            lines.line.values.data, lines["line_pos"]
        )

        ix = lines.index
        df.loc[ix, "snapped"] = True
        df.loc[ix, "geometry"] = lines.geometry
        df.loc[ix, "snap_dist"] = lines.distance
        df.loc[ix, "snap_ref_id"] = lines.lineID
        df.loc[ix, "lineID"] = lines.lineID
        df.loc[ix, "snap_log"] = ndarray_append_strings(
            "snapped: within ",
            to_snap.loc[ix].snap_tolerance,
            "m tolerance of flowline",
        )

        to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

        print(
            "{:,} barriers snapped in region in {:.2f}s".format(
                len(ix), time() - region_start
            )
        )

    # TODO: flag those that joined to loops

    return df, to_snap
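# The endpoint clamping above in isolation: positions within
# SNAP_ENDPOINT_TOLERANCE of either end are clamped to 0 or the line length
# before interpolating (a minimal numpy sketch with assumed values):
import numpy as np
import pygeos as pg

SNAP_ENDPOINT_TOLERANCE = 1.0
line = pg.linestrings([[0, 0], [10, 0]])
pos = pg.line_locate_point(line, pg.points([[0.4, 1], [9.8, 1], [5, 1]]))

length = pg.length(line)
pos = np.where(pos <= SNAP_ENDPOINT_TOLERANCE, 0, pos)
pos = np.where(pos >= length - SNAP_ENDPOINT_TOLERANCE, length, pos)
print(pos)  # [ 0. 10.  5.]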
def test_line_locate_point_none(normalized):
    assert np.isnan(
        pygeos.line_locate_point(line_string, None, normalized=normalized)
    )
    assert np.isnan(
        pygeos.line_locate_point(None, point, normalized=normalized)
    )
def cut_flowlines_at_points(flowlines, joins, points, next_lineID):
    """General method for cutting flowlines at points and updating joins.

    Only points that are at least SNAP_ENDPOINT_TOLERANCE from the endpoints
    of a line are used to cut it.

    Lines are cut starting at the upstream end; the original ordering of
    points per line is not preserved.

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
        flowline joins
    points : GeoSeries
        points to cut flowlines, must be indexed to join against flowlines;
        one record per singular Point.
    next_lineID : int
        id of next flowline to be created

    Returns
    -------
    (GeoDataFrame, DataFrame)
        Updated flowlines and joins.
        Note: flowlines have a "new" column to identify new flowlines created here.
    """

    df = flowlines.join(points.rename("point"), how="inner")
    df["pos"] = pg.line_locate_point(df.geometry.values.data, df.point.values.data)

    # only keep cut points that are sufficiently interior to the line
    # (i.e., not too close to endpoints)
    ix = (df.pos >= SNAP_ENDPOINT_TOLERANCE) & (
        (df["length"] - df.pos).abs() >= SNAP_ENDPOINT_TOLERANCE
    )

    # sort remaining cut points in ascending order on their lines
    df = df.loc[ix].sort_values(by=["lineID", "pos"])

    # convert to plain DataFrame so that we can extract coords
    grouped = pd.DataFrame(
        df.groupby("lineID").agg({"geometry": "first", "pos": list})
    )
    grouped["geometry"] = grouped.geometry.values.data

    outer_ix, inner_ix, lines = cut_lines_at_points(
        grouped.geometry.apply(lambda x: pg.get_coordinates(x)).values,
        grouped.pos.apply(np.array).values,
    )

    lines = np.asarray(lines)
    new_flowlines = gp.GeoDataFrame(
        {
            "lineID": (next_lineID + np.arange(len(outer_ix))).astype("uint32"),
            "origLineID": grouped.index.take(outer_ix),
            "geometry": lines,
            "length": pg.length(lines).astype("float32"),
            "sinuosity": calculate_sinuosity(lines).astype("float32"),
        }
    ).join(
        flowlines.drop(
            columns=[
                "geometry",
                "lineID",
                "xmin",
                "ymin",
                "xmax",
                "ymax",
                "length",
                "sinuosity",
            ],
            errors="ignore",
        ),
        on="origLineID",
    )

    ### Update flowline joins
    # transform new lines to create new joins at the upstream / downstream most
    # points of the original line
    l = new_flowlines.groupby("origLineID").lineID
    # the first new line per original line is the furthest upstream, so use its
    # ID as the new downstream ID for anything that had this origLineID as its downstream
    first = l.first().rename("new_downstream_id")
    # the last new line per original line is the furthest downstream, so use its
    # ID as the new upstream ID for anything that had this origLineID as its upstream
    last = l.last().rename("new_upstream_id")

    # Update existing joins with the new lineIDs we created at the upstream or
    # downstream ends of segments we just created
    joins = update_joins(
        joins,
        first,
        last,
        downstream_col="downstream_id",
        upstream_col="upstream_id",
    )

    ### Create new line joins for any that weren't inserted above
    # Transform all groups of new line IDs per original lineID
    # into joins structure
    atts = (
        new_flowlines.groupby("origLineID")[["NHDPlusID", "loop", "HUC4"]]
        .first()
        .rename(columns={"NHDPlusID": "upstream"})
    )

    # function to make upstream / downstream side of join
    pairs = lambda a: pd.Series(zip(a[:-1], a[1:]))
    new_joins = (
        l.apply(pairs)
        .apply(pd.Series)
        .reset_index()
        .rename(columns={0: "upstream_id", 1: "downstream_id"})
        .join(atts, on="origLineID")
    )

    # NHDPlusID is the same for both sides
    new_joins["downstream"] = new_joins.upstream
    new_joins["type"] = "internal"
    # new joins do not terminate in marine, so marine should always be false
    new_joins["marine"] = False
    new_joins = new_joins[
        [
            "upstream",
            "downstream",
            "upstream_id",
            "downstream_id",
            "type",
            "loop",
            "marine",
            "HUC4",
        ]
    ]

    joins = (
        joins.append(new_joins, ignore_index=True, sort=False)
        .sort_values(["downstream", "upstream", "downstream_id", "upstream_id"])
        .reset_index(drop=True)
    )

    remove_ids = new_flowlines.origLineID.unique()
    flowlines["new"] = False
    new_flowlines["new"] = True
    flowlines = (
        flowlines.loc[~flowlines.index.isin(remove_ids)]
        .reset_index()
        .append(
            new_flowlines.drop(columns=["origLineID"]),
            ignore_index=True,
            sort=False,
        )
        .set_index("lineID")
    )

    return flowlines, joins
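# The join-pair construction above in isolation: consecutive new lineIDs per
# original line become (upstream_id, downstream_id) pairs (a minimal pandas
# sketch; the IDs are assumptions, not real data):
import pandas as pd

new_flowlines = pd.DataFrame(
    {"origLineID": [10, 10, 10, 20, 20], "lineID": [101, 102, 103, 104, 105]}
)
l = new_flowlines.groupby("origLineID").lineID
pairs = lambda a: pd.Series(zip(a[:-1], a[1:]))
print(l.apply(pairs).tolist())  # [(101, 102), (102, 103), (104, 105)]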
def cut_flowlines_at_barriers(flowlines, joins, barriers, next_segment_id=None):
    """Cut flowlines by barriers.

    Parameters
    ----------
    flowlines : GeoDataFrame
        ALL flowlines for region.
    barriers : GeoDataFrame
        Barriers that will be used to cut flowlines.
    joins : DataFrame
        Joins between flowlines (upstream, downstream pairs).
    next_segment_id : int, optional
        Used as starting point for IDs of new segments created by cutting flowlines.

    Returns
    -------
    GeoDataFrame, DataFrame, DataFrame
        updated flowlines, updated joins, barrier joins (upstream / downstream
        flowline ID per barrier)
    """

    start = time()
    print(f"Starting number of segments: {len(flowlines):,}")
    print(f"Cutting in {len(barriers):,} barriers")

    # Our segment ids are ints, so just increment from the last one we had from NHD
    if next_segment_id is None:
        next_segment_id = int(flowlines.index.max() + 1)

    # join barriers to lines and extract those that have segments (via inner join)
    segments = (
        flowlines[["lineID", "NHDPlusID", "geometry"]]
        .rename(columns={"geometry": "flowline"})
        .join(
            barriers[["geometry", "barrierID", "lineID"]]
            .set_index("lineID")
            .rename(columns={"geometry": "barrier"}),
            how="inner",
        )
    )

    # Calculate the position of each barrier on each segment.
    # Barriers are on the upstream or downstream end of a segment if they are
    # within SNAP_ENDPOINT_TOLERANCE of the ends.  Otherwise, they are splits.
    segments["linepos"] = pg.line_locate_point(
        segments.flowline.values.data, segments.barrier.values.data
    )

    ### Upstream and downstream endpoint barriers
    segments["on_upstream"] = segments.linepos <= SNAP_ENDPOINT_TOLERANCE
    segments["on_downstream"] = (
        segments.linepos
        >= pg.length(segments.flowline.values.data) - SNAP_ENDPOINT_TOLERANCE
    )

    # if line length is < SNAP_ENDPOINT_TOLERANCE, then the barrier could be
    # tagged to both sides, which is incorrect.  Default to on_downstream.
    segments.loc[segments.on_upstream & segments.on_downstream, "on_upstream"] = False

    print(
        f"{segments.on_upstream.sum():,} barriers on upstream point of their segments"
    )
    print(
        f"{segments.on_downstream.sum():,} barriers on downstream point of their segments"
    )

    # Barriers on upstream endpoint:
    # their upstream_id is the upstream_id(s) of their segment from joins,
    # and their downstream_id is the segment they are on.
    # NOTE: a barrier may have multiple upstreams if it occurs at a fork in the network.
    # All terminal upstreams should be already coded as 0 in joins, but just in case
    # we assign N/A to 0.
    upstream_barrier_joins = (
        (
            segments.loc[segments.on_upstream][["barrierID", "lineID"]]
            .rename(columns={"lineID": "downstream_id"})
            .join(joins.set_index("downstream_id").upstream_id, on="downstream_id")
        )
        .fillna(0)
        .astype("uint64")
    )

    # Barriers on downstream endpoint:
    # their upstream_id is the segment they are on and their downstream_id is the
    # downstream_id of their segment from the joins.
    # Some downstream_ids may be missing if the barrier is on the downstream-most
    # point of the network (downstream terminal) and further downstream segments
    # were removed due to removing coastline segments.
    downstream_barrier_joins = (
        (
            segments.loc[segments.on_downstream][["barrierID", "lineID"]]
            .rename(columns={"lineID": "upstream_id"})
            .join(joins.set_index("upstream_id").downstream_id, on="upstream_id")
        )
        .fillna(0)
        .astype("uint64")
    )

    barrier_joins = upstream_barrier_joins.append(
        downstream_barrier_joins, ignore_index=True, sort=False
    ).set_index("barrierID", drop=False)

    ### Split segments have barriers that are not at endpoints
    split_segments = segments.loc[~(segments.on_upstream | segments.on_downstream)]
    # join in count of barriers that SPLIT this segment
    split_segments = split_segments.join(
        split_segments.groupby(level=0).size().rename("barriers")
    )

    print(f"{(split_segments.barriers == 1).sum():,} segments to cut have one barrier")
    print(
        f"{(split_segments.barriers > 1).sum():,} segments to cut have more than one barrier"
    )

    # order the barriers by their projected distance on the line
    # so we are always moving from the upstream end to the downstream end
    split_segments = split_segments.rename_axis("idx").sort_values(
        by=["idx", "linepos"], ascending=True
    )

    # Convert to DataFrame so that geometry cols are arrays of pygeos geometries
    tmp = pd.DataFrame(split_segments.copy())
    tmp.flowline = tmp.flowline.values.data
    tmp.barrier = tmp.barrier.values.data
    tmp["pos"] = pg.line_locate_point(tmp.flowline.values, tmp.barrier.values)

    # Group barriers by line so that we can split geometries in one pass
    grouped = (
        tmp[
            [
                "lineID",
                "NHDPlusID",
                "barrierID",
                "barriers",
                "flowline",
                "barrier",
                "pos",
            ]
        ]
        .sort_values(by=["lineID", "pos"])
        .groupby("lineID")
        .agg(
            {
                "lineID": "first",
                "NHDPlusID": "first",
                "flowline": "first",
                "barrierID": list,
                "barriers": "first",
                # "barrier": list,  # TODO: remove
                "pos": list,
            }
        )
    )

    # cut lines for all barriers
    outer_ix, inner_ix, lines = cut_lines_at_points(
        grouped.flowline.apply(lambda x: pg.get_coordinates(x)).values,
        grouped.pos.apply(np.array).values,
    )

    lines = np.asarray(lines)
    new_flowlines = gp.GeoDataFrame(
        {
            "lineID": (next_segment_id + np.arange(len(outer_ix))).astype("uint32"),
            "origLineID": grouped.index.take(outer_ix),
            "position": inner_ix,
            "geometry": lines,
            "length": pg.length(lines).astype("float32"),
            "sinuosity": calculate_sinuosity(lines).astype("float32"),
        }
    ).join(
        flowlines.drop(
            columns=[
                "geometry",
                "lineID",
                "xmin",
                "ymin",
                "xmax",
                "ymax",
                "length",
                "sinuosity",
            ],
            errors="ignore",
        ),
        on="origLineID",
    )

    # transform new segments to create new joins
    l = new_flowlines.groupby("origLineID").lineID
    # the first new line per original line is the furthest upstream, so use its
    # ID as the new downstream ID for anything that had this origLineID as its downstream
    first = l.first().rename("new_downstream_id")
    # the last new line per original line is the furthest downstream, so use its
    # ID as the new upstream ID for anything that had this origLineID as its upstream
    last = l.last().rename("new_upstream_id")

    # Update existing joins with the new lineIDs we created at the upstream or
    # downstream ends of segments we just created
    updated_joins = update_joins(
        joins, first, last, downstream_col="downstream_id", upstream_col="upstream_id"
    )

    # also need to update any barrier joins already created for those on endpoints
    barrier_joins = update_joins(
        barrier_joins,
        first,
        last,
        downstream_col="downstream_id",
        upstream_col="upstream_id",
    )

    # For all new interior joins, create upstream & downstream ids per original line
    upstream_side = (
        new_flowlines.loc[~new_flowlines.lineID.isin(last)][
            ["origLineID", "position", "lineID"]
        ]
        .set_index(["origLineID", "position"])
        .rename(columns={"lineID": "upstream_id"})
    )

    downstream_side = new_flowlines.loc[~new_flowlines.lineID.isin(first)][
        ["origLineID", "position", "lineID"]
    ].rename(columns={"lineID": "downstream_id"})
    downstream_side.position = downstream_side.position - 1
    downstream_side = downstream_side.set_index(["origLineID", "position"])

    new_joins = (
        grouped.barrierID.apply(pd.Series)
        .stack()
        .astype("uint32")
        .reset_index()
        .rename(
            columns={"lineID": "origLineID", "level_1": "position", 0: "barrierID"}
        )
        .set_index(["origLineID", "position"])
        .join(upstream_side)
        .join(downstream_side)
        .reset_index()
        .join(grouped.NHDPlusID.rename("upstream"), on="origLineID")
    )
    new_joins["downstream"] = new_joins.upstream
    new_joins["type"] = "internal"
    new_joins["marine"] = False

    updated_joins = updated_joins.append(
        new_joins[
            [
                "upstream",
                "downstream",
                "upstream_id",
                "downstream_id",
                "type",
                "marine",
            ]
        ],
        ignore_index=True,
        sort=False,
    ).sort_values(["downstream_id", "upstream_id"])

    barrier_joins = (
        barrier_joins.append(
            new_joins[["barrierID", "upstream_id", "downstream_id"]],
            ignore_index=True,
            sort=False,
        )
        .set_index("barrierID", drop=False)
        .astype("uint32")
    )

    # any join that is upstream of a barrier cannot be marine
    updated_joins.loc[
        updated_joins.marine
        & updated_joins.upstream_id.isin(barrier_joins.upstream_id.unique()),
        "marine",
    ] = False

    # extract flowlines that are not split by barriers and merge in new flowlines
    unsplit_segments = flowlines.loc[~flowlines.index.isin(split_segments.index)]
    updated_flowlines = unsplit_segments.append(
        new_flowlines.drop(columns=["origLineID", "position"]),
        ignore_index=True,
        sort=False,
    ).set_index("lineID", drop=False)

    print(f"Done cutting flowlines in {time() - start:.2f}s")
    return updated_flowlines, updated_joins, barrier_joins
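# The per-line grouping performed before cutting above, in isolation: barriers
# are sorted by projected position and collected into one list per lineID so
# each line is split in a single pass (a minimal pandas sketch; the IDs and
# positions are assumptions):
import pandas as pd

tmp = pd.DataFrame(
    {"lineID": [7, 7, 9], "barrierID": [3, 1, 2], "pos": [80.0, 20.0, 55.0]}
)
grouped = (
    tmp.sort_values(by=["lineID", "pos"])
    .groupby("lineID")
    .agg({"barrierID": list, "pos": list})
)
print(grouped)
# lineID 7 -> barriers [1, 3] at pos [20.0, 80.0]; lineID 9 -> [2] at [55.0]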