def test_set_precision_intersection():
    """Operations should use the most precise precision grid size of the inputs"""
    left = pygeos.normalize(pygeos.box(0, 0, 0.9, 0.9))
    right = pygeos.normalize(pygeos.box(0.75, 0, 1.75, 0.75))

    # Neither input carries a precision grid yet, so the result reports 0.
    unset_result = pygeos.intersection(left, right)
    assert pygeos.get_precision(unset_result) == 0

    # GEOS will use and keep the most precise precision grid size
    left = pygeos.set_precision(left, 0.5)
    right = pygeos.set_precision(right, 1)
    result = pygeos.intersection(left, right)

    assert pygeos.get_precision(result) == 0.5
    assert pygeos.equals(result, pygeos.Geometry("LINESTRING (1 1, 1 0)"))
def test_intersection_nan():
    """Only the last pair (point, point) has two real geometries.

    Every other pair contains at least one ``np.nan`` / ``None`` entry and
    the test expects ``is_empty`` to hold for those results.
    """
    left = np.array([point, np.nan, np.nan, point, None, None, point])
    right = np.array([np.nan, point, np.nan, None, point, None, point])

    actual = pygeos.intersection(left, right)

    *missing, last = actual
    assert pygeos.equals(last, point)
    assert pygeos.is_empty(actual[:-1]).all()
def create_final_od_grid(df,height_div):
    """Build a grid of populated cell centroids covering the geometry in ``df``.

    The cell height is ``sqrt(area / height_div)`` of the first geometry in
    ``df``. A grid over the bounding box of ``df`` is clipped to ``df.geometry``,
    the population per cell is summed with ``zonal_stats`` and only cells with a
    population sum above 100 are kept.

    Relies on the module-level names ``world_pop`` (raster passed to
    ``zonal_stats``) and ``GID_0`` (written to the ``GID_0`` column).

    Args:
        df: frame with a ``geometry`` column of pygeos geometries
            (it is passed straight to ``pygeos.area``).
        height_div: divisor applied to the area before taking the square root;
            a larger value gives a smaller cell height.

    Returns:
        GeoDataFrame with columns ``geometry`` (cell centroid), ``tot_pop``,
        ``GID_0`` and ``grid_height``, indexed from 0.
    """
    # only the first row's area is used to derive the cell height
    height = numpy.sqrt(pygeos.area(df.geometry)/height_div).values[0]
    grid = pd.DataFrame(create_grid(create_bbox(df),height),columns=['geometry'])

    #clip grid of bbox to grid of the actual spatial exterior of the country
    clip_grid = pygeos.intersection(grid,df.geometry)
    clip_grid = clip_grid.loc[~pygeos.is_empty(clip_grid.geometry)]

    # turn to shapely geometries again for zonal stats
    # (round trip through WKB; `loads` is presumably shapely's WKB loader -- TODO confirm)
    clip_grid.geometry = pygeos.to_wkb(clip_grid.geometry)
    clip_grid.geometry = clip_grid.geometry.apply(loads)
    clip_grid = gpd.GeoDataFrame(clip_grid)

    # get total population per grid cell
    clip_grid['tot_pop'] = clip_grid.geometry.apply(lambda x: zonal_stats(x,world_pop,stats="sum"))
    # zonal_stats returns a list with one dict per geometry; unwrap the 'sum' entry
    clip_grid['tot_pop'] = clip_grid['tot_pop'].apply(lambda x: x[0]['sum'])

    # remove cells in the grid that have no population data
    # (missing values are dropped first, then the > 100 threshold is applied)
    clip_grid = clip_grid.loc[~pd.isna(clip_grid.tot_pop)]
    clip_grid = clip_grid.loc[clip_grid.tot_pop > 100]
    clip_grid.reset_index(inplace=True,drop=True)

    # from here on each cell is represented by its centroid only
    clip_grid.geometry = clip_grid.geometry.centroid
    clip_grid['GID_0'] = GID_0
    clip_grid['grid_height'] = height
    return clip_grid
def naive_compute_iou_matrix(sorted_detections, ground_truths):
    """Build the IoU matrix by visiting every detection / ground truth pair.

    Args:
        sorted_detections (ndarray, list) : A ndarray of detections stored as:

            * Bounding boxes for a given class where each row is a detection stored as:
              ``[BoundingBox, confidence]``
            * Polygons for a given class where each row is a detection stored as:
              ``[Polygon, confidence]``
            * Points for a given class where each row is a detection stored as:
              ``[Point, confidence]``

        ground_truths (ndarray,list) : A ndarray of ground truth stored as:

            * Bounding boxes for a given class where each row is a ground truth stored as:
              ``[BoundingBox]``
            * Polygons for a given class where each row is a ground truth stored as:
              ``[Polygon]``
            * Points for a given class where each row is a ground truth stored as:
              ``[Point]``

    Returns:
        ndarray : An IoU matrix (#detections, #ground truth)

    """
    # Rows follow the (sorted) detections, columns follow the ground truths.
    n_detections = sorted_detections.shape[0]
    n_ground_truths = len(ground_truths)
    iou = np.zeros((n_detections, n_ground_truths))

    # Plain double loop: one IoU value per (detection, ground truth) pair.
    for col, truth_row in enumerate(ground_truths):
        for row, detection_row in enumerate(sorted_detections):
            overlap = area(intersection(detection_row[0], truth_row[0]))
            combined = area(union(detection_row[0], truth_row[0]))
            iou[row, col] = overlap / combined

    return iou
def _tess(
    self,
    ix,
    enclosure,
    buildings,
    query_inp,
    query_res,
    threshold,
    unique_id,
    **kwargs,
):
    """Tessellate a single enclosure.

    Buildings matched to enclosure ``ix`` (via ``query_inp`` / ``query_res``)
    are kept when the area of their intersection with the enclosure exceeds
    ``threshold`` times their own area. With more than one such building the
    enclosure is split by ``self._morphological_tessellation``; otherwise the
    enclosure polygon itself is returned as a single-row GeoDataFrame with
    ``unique_id`` set to ``None``.
    """
    enclosure_geom = enclosure.values.data[ix]
    candidates = buildings.iloc[query_res[query_inp == ix]]

    candidate_geoms = candidates.geometry.values.data
    overlap_area = pygeos.area(pygeos.intersection(candidate_geoms, enclosure_geom))
    required_area = pygeos.area(candidates.geometry.values.data) * threshold
    within = candidates[overlap_area > required_area]

    # Zero or one building: the whole enclosure is the only cell.
    if not len(within) > 1:
        return gpd.GeoDataFrame(
            {self.enclosure_id: ix, unique_id: None},
            geometry=[enclosure_geom],
            index=[0],
        )

    cells = self._morphological_tessellation(
        within,
        unique_id,
        enclosure_geom,
        shrink=self.shrink,
        segment=self.segment,
        verbose=False,
        check=False,
    )
    cells[self.enclosure_id] = ix
    return cells
def intersection(left, right):
    """Intersect the geometries from the left with the right.

    New, intersected geometries are stored in "geometry_right".

    Uses spatial index operations for faster operations.  Wholly contained
    geometries from right are copied intact, only those that intersect but are
    not wholly contained are intersected.

    Parameters
    ----------
    left : GeoDataFrame
    right : GeoDataFrame

    Returns
    -------
    DataFrame
        output geometries are in "geometry_right"; the index of ``left`` is
        preserved (rows repeat once per matching right geometry)
    """
    left_series = pd.Series(left.geometry.values.data, index=left.index)
    right_series = pd.Series(right.geometry.values.data, index=right.index)

    intersects = sjoin_geometry(left_series, right_series, predicate="intersects")

    if not len(intersects):
        # empty dataframe with correct columns
        return left.join(intersects, how="inner").join(
            right, on="index_right", rsuffix="_right"
        )

    # find the subset that are wholly contained
    contains = sjoin_geometry(
        left_series.loc[intersects.index.unique()],
        right_series.loc[intersects.unique()],
        predicate="contains",
    )

    # any geometries that are completely contained can be copied intact
    out = left.join(contains, how="inner").join(
        right, on="index_right", rsuffix="_right"
    )

    # the remainder need to be intersected
    # NOTE(review): isin() compares right-side ids only, not (left, right)
    # pairs, so a right geometry contained by one left geometry is skipped for
    # every other left geometry it merely intersects -- confirm this is intended.
    rest = intersects.loc[~intersects.isin(contains)]
    rest = left.join(rest, how="inner").join(
        right, on="index_right", rsuffix="_right"
    )
    # have to add .values to prevent conversion to None
    rest["geometry_right"] = gp.GeoSeries(
        pg.intersection(rest.geometry.values.data, rest.geometry_right.values.data)
    ).values

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent call
    return pd.concat([out, rest], ignore_index=False)
def _overlay(a, b, return_indices=False):
    """
    Compute geometries from overlaying a onto b

    An STRtree built on ``a`` is bulk-queried with ``b``; each matching pair is
    intersected. With ``return_indices=True`` the result is the tuple
    ``(indices into a, indices into b, intersections)``, otherwise only the
    intersections are returned.
    """
    index = pygeos.STRtree(a)
    # query_bulk yields the query (b) positions first, then the tree (a) positions
    b_hits, a_hits = index.query_bulk(b)
    pieces = pygeos.intersection(a[a_hits], b[b_hits])
    return (a_hits, b_hits, pieces) if return_indices else pieces
def trim_similarity_matrix(self, similarity_matrix, detections, ground_truths, label_mean_area=None):
    r"""Compute an array containing the indices in columns of similarity passing the first trimming.

    Here a detection/ground truth pair is kept if the detection
    :class:`~playground_metrics.utils.geometry_utils.geometry.Point` is within the ground truth
    :class:`~playground_metrics.utils.geometry_utils.geometry.BoundingBox` or
    :class:`~playground_metrics.utils.geometry_utils.geometry.Polygon`

    Concretely, every position of ``similarity_matrix`` that is not ``-inf`` is a candidate, and a
    candidate is dropped when the intersection of its detection and ground truth geometries is empty.

    Args:
        similarity_matrix: The similarity matrix between detections and ground truths : dimension
            (#detection, #gt)
        detections (ndarray, list) : A ndarray of detections stored as:

            * Bounding boxes for a given class where each row is a detection stored as:
              ``[BoundingBox, confidence]``
            * Polygons for a given class where each row is a detection stored as:
              ``[Polygon, confidence]``
            * Points for a given class where each row is a detection stored as:
              ``[Point, confidence]``

        ground_truths (ndarray,list) : A ndarray of ground truth stored as:

            * Bounding boxes for a given class where each row is a ground truth stored as:
              ``[BoundingBox]``
            * Polygons for a given class where each row is a ground truth stored as:
              ``[Polygon]``

        label_mean_area (float) : Optional, default to ``None``. The mean area for each label in the dataset.
            Not used by this implementation.

    Returns:
        ndarray: An array of dimension (2, N) where each column is a tuple (detection, ground truth) describing
        a potential match. To be more precise, each match-tuple in the array corresponds to a position in the
        similarity matrix which will be used by the match algorithm to compute the final match.

    """
    # ``np.Inf`` was removed in NumPy 2.0; ``np.inf`` is the supported spelling.
    # The mask and its non-zero positions are computed once and reused.
    candidate_indices = np.nonzero(similarity_matrix != -np.inf)
    potential = np.stack(candidate_indices)
    # Order the candidate columns by detection (row) index.
    potential = potential[:, np.argsort(candidate_indices[0])]

    trim = []
    for i in range(potential.shape[1]):
        r, c = potential[:, i]
        # A pair whose geometries do not intersect cannot be a match.
        if np.all(is_empty(intersection(detections[r, 0], ground_truths[c, 0]))):
            trim.append(i)

    return np.delete(potential, trim, axis=1)
def country_grid_gdp_filled(trans_network, country, data_path, rough_grid_split=100, from_main_graph=False):
    """Build a grid over a transport network, clipped to a country, with GDP per cell.

    The cell height is ``sqrt(envelope_area / rough_grid_split)`` where the
    envelope is taken around all node geometries. The grid is clipped to the
    country outline read from ``<data_path>/GADM/gadm36_levels.gpkg`` (layer 0,
    rows whose ``GID_0`` equals ``country``), and only cells with a positive
    GDP value are kept.

    Args:
        trans_network: either the node table itself (``from_main_graph=True``)
            or an object exposing it as ``.nodes``. In the latter case the node
            geometries are first converted with ``convert_crs``.
        country: ``GID_0`` code used to select the country outline.
        data_path: base directory holding the ``GADM`` folder; it is also
            forwarded to ``get_gdp_values``.
        rough_grid_split (int, optional): divisor of the envelope area used to
            derive the cell height; roughly the number of cells over the
            envelope. Defaults to 100.
        from_main_graph (bool, optional): whether ``trans_network`` already is
            the node table. Defaults to False.

    Returns:
        DataFrame with the clipped cell ``geometry`` plus the columns
        ``centroid``, ``km2``, ``gdp`` and ``gdp_area`` (``gdp / km2``), and an
        ``index`` column holding the pre-filter row labels.
    """
    if from_main_graph:
        node_df = trans_network.copy()
    else:
        node_df = trans_network.nodes.copy()
        # convert_crs also returns the CRS it picked; it is not needed here
        node_df.geometry, _approximate_crs = convert_crs(node_df)

    # Shared by both branches: envelope of all node geometries -> cell height
    envelop = pygeos.envelope(pygeos.multilinestrings(node_df.geometry.values))
    height = np.sqrt(pygeos.area(envelop) / rough_grid_split)

    gdf_admin = pd.DataFrame(create_grid(create_bbox(node_df), height),
                             columns=['geometry'])

    # load data and convert to pygeos
    country_shape = gpd.read_file(os.path.join(data_path, 'GADM',
                                               'gadm36_levels.gpkg'),
                                  layer=0)
    country_shape = pd.DataFrame(
        country_shape.loc[country_shape.GID_0 == country])
    country_shape.geometry = pygeos.from_shapely(country_shape.geometry)

    # clip the grid to the country outline and drop cells that fall outside
    gdf_admin = pygeos.intersection(gdf_admin, country_shape.geometry)
    gdf_admin = gdf_admin.loc[~pygeos.is_empty(gdf_admin.geometry)]

    gdf_admin['centroid'] = pygeos.centroid(gdf_admin.geometry)
    gdf_admin['km2'] = area(gdf_admin)
    gdf_admin['gdp'] = get_gdp_values(gdf_admin, data_path)
    gdf_admin = gdf_admin.loc[gdf_admin.gdp > 0].reset_index()
    gdf_admin['gdp_area'] = gdf_admin.gdp / gdf_admin['km2']

    return gdf_admin
def create_district_boundaries(
        census_data: pd.DataFrame,
        *,
        clip_to: pygeos.Geometry = None) -> gpd.GeoDataFrame:
    """Estimates district boundaries from group locations.

    District polygons are derived from the de-duplicated Group points by
    ``merge_to_districts`` (Voronoi diagram method), computed in British
    National Grid coordinates and returned in WGS84.

    Args:
        census_data: Dataframe with census data
        clip_to: Optional area to clip results to. Must be in WGS84 projection.

    Returns:
        GeoDataFrame with district IDs and polygons

    Todo:
        Spatial transforms add 20x overhead, but buffering relies on them to work. Fix.

    """
    # Records with valid postcodes, one row per distinct (lat, long) location
    has_valid_postcode = census_data[scout_census.column_labels.VALID_POSTCODE]
    all_locations = (
        census_data.loc[has_valid_postcode, ["D_ID", "lat", "long"]]
        .drop_duplicates(subset=["lat", "long"])
        .reset_index(drop=True)
    )

    # Point geometries in WGS84, built from the lat / long columns
    wgs84_points = gpd.points_from_xy(all_locations["long"], all_locations["lat"], crs=constants.WGS_84)

    # Re-project to OS36 (British National Grid): x-y coordinates in metres
    # instead of (long, lat), so everything below works in metres.
    bng_points = wgs84_points.to_crs(epsg=constants.BNG).data

    merged = merge_to_districts(all_locations["D_ID"], bng_points)
    districts = gpd.GeoSeries(merged, crs=constants.BNG).to_crs(epsg=constants.WGS_84)

    if clip_to is not None:
        clipped = pygeos.intersection(districts.geometry.array.data, clip_to)
        districts.geometry.array.data = clipped

    # District IDs are the index; expose them as a regular column
    with_ids = districts.reset_index()
    return with_ids[["geometry", "D_ID"]]
def naive_compute_point_in_box_distance_similarity_matrix(sorted_detections, ground_truths):
    """Build a centroid-distance similarity for every detection / ground truth pair.

    The similarity is ``1 - distance`` between the two centroids. Pairs whose
    detection centroid does not intersect the ground truth geometry get an
    infinite distance, i.e. a similarity of ``-inf``.

    Args:
        sorted_detections (ndarray, list) : A ndarray of detections stored as:

            * Bounding boxes for a given class where each row is a detection stored as:
              ``[BoundingBox, confidence]``
            * Polygons for a given class where each row is a detection stored as:
              ``[Polygon, confidence]``
            * Points for a given class where each row is a detection stored as:
              ``[Point, confidence]``

        ground_truths (ndarray,list) : A ndarray of ground truth stored as:

            * Bounding boxes for a given class where each row is a ground truth stored as:
              ``[BoundingBox]``
            * Polygons for a given class where each row is a ground truth stored as:
              ``[Polygon]``
            * Points for a given class where each row is a ground truth stored as:
              ``[Point]``

    Returns:
        ndarray : An similarity matrix (#detections, #ground truth)

    """
    # Distance matrix: one row per (sorted) detection, one column per ground truth
    n_rows = sorted_detections.shape[0]
    n_cols = len(ground_truths)
    distance_matrix = np.zeros((n_rows, n_cols))

    # First pass: centroid-to-centroid distance for every pair
    for col, truth_row in enumerate(ground_truths):
        for row, detection_row in enumerate(sorted_detections):
            detection_centre = centroid(detection_row[0])
            truth_centre = centroid(truth_row[0])
            distance_matrix[row, col] = distance(detection_centre, truth_centre)

    # Second pass: pairs whose detection centroid misses the ground truth are pushed to infinity
    for row, col in np.ndindex(distance_matrix.shape):
        detection_centre = centroid(sorted_detections[row, 0])
        if is_empty(intersection(detection_centre, ground_truths[col, 0])):
            distance_matrix[row, col] = np.inf

    return 1 - distance_matrix
def _extend_line(coords, target, tolerance, snap=True):
    """
    Extends a line geometry to snap on the target within a tolerance.

    With ``snap=True`` the line's last segment is extrapolated by ``tolerance``
    and, if the extrapolation hits ``target``, the hit closest to the line's
    last coordinate is appended; without a hit ``coords`` is returned as is.
    With ``snap=False`` the extrapolated end point is appended unconditionally.
    """
    # Last segment as a flat array, whether coords is flat or (n, 2)
    last_segment = coords[-4:] if len(coords.shape) == 1 else coords[-2:].flatten()

    if not snap:
        end_point = _get_extrapolated_line(last_segment, tolerance, point=True)
        return np.vstack([coords, end_point])

    extension = _get_extrapolated_line(last_segment, tolerance)
    hit_rows = target.sindex.query(extension, predicate="intersects")
    hits = pygeos.intersection(target.iloc[hit_rows].geometry.values.data, extension)

    if not hits.size > 0:
        return coords

    if len(hits) > 1:
        # Several candidates: take the first one at minimal distance from the line end
        gaps = [pygeos.distance(hit, pygeos.points(coords[-1])) for hit in hits]
        nearest = min(range(len(gaps)), key=lambda position: gaps[position])
        snapped = pygeos.get_coordinates(hits[nearest])
    else:
        snapped = pygeos.get_coordinates(hits[0])

    flat = np.append(coords, snapped)
    return np.reshape(flat, (int(len(flat) / 2), 2))
def time_intersection(self):
    """Run ``pygeos.intersection`` on ``self.points`` against ``self.polygon``.

    The result is discarded. Presumably a timing benchmark picked up through
    the ``time_`` prefix -- TODO confirm against the benchmark harness.
    """
    pygeos.intersection(self.points, self.polygon)
def time_intersection(self):
    """Run ``pygeos.intersection`` on ``self.left`` against ``self.right``.

    The result is discarded. Presumably a timing benchmark picked up through
    the ``time_`` prefix -- TODO confirm against the benchmark harness.
    """
    pygeos.intersection(self.left, self.right)
def time_intersection_prec2(self):
    """Run ``pygeos.intersection`` on ``self.left`` / ``self.right`` with ``grid_size=2``.

    The result is discarded. Presumably a timing benchmark for the
    fixed-precision code path -- TODO confirm against the benchmark harness.
    """
    pygeos.intersection(self.left, self.right, grid_size=2)
def _intersect_percent(tile, dataset_geoms):
    """Return the overlap percent.

    For every geometry in ``dataset_geoms`` the area of its intersection with
    ``tile`` is divided by the area of ``tile``. The values are ratios (they
    are not multiplied by 100) and are returned as a list in input order.
    """
    inter_areas = area(intersection(tile, dataset_geoms))
    # The tile area is the same for every geometry, so compute it only once.
    tile_area = area(tile)
    return [inter_area / tile_area for inter_area in inter_areas]
def _intersect_area_on_chunk(geoms1, geoms2):
    """Return the area of the element-wise intersection of two geometry chunks."""
    # pygeos is imported inside the function, as in the original; presumably so
    # the function stays self-contained when sent to workers -- TODO confirm.
    import pygeos

    overlaps = pygeos.intersection(geoms1, geoms2)
    return pygeos.area(overlaps)
# Write the admin boundaries both as JSON and as feather
write_dataframe(admin_df, boundaries_dir / "na_admin1.json")
admin_df.to_feather(boundaries_dir / "na_admin1.feather")

### Process species ranges
print("Processing species ranges...")

# create lookup of species scientific name to code
sci_name_lut = {value["SNAME"]: key for key, value in SPECIES.items()}

range_df = read_dataframe("data/boundaries/src/species_ranges.shp")

# split hoary bat into Hawaiian vs mainland
laci = range_df.loc[range_df.SCI_NAME == "Lasiurus cinereus"]
Hawaii = pg.box(*HAWAII_BOUNDS)
haba = laci.copy()
# add new geometry for haba: the part of the hoary bat range inside the Hawaii box
haba.geometry = pg.intersection(laci.geometry.values.data, Hawaii)
haba.SCI_NAME = SPECIES["haba"]["SNAME"]
haba.COMMON_NAM = SPECIES["haba"]["CNAME"]
# NOTE(review): DataFrame.append was removed in pandas 2.0; this line needs
# pd.concat([range_df, haba], ignore_index=True, sort=False) on newer pandas.
range_df = range_df.append(haba, ignore_index=True, sort=False)

# clip out Hawaii from laci (the remaining rows keep everything outside the box)
range_df.loc[range_df.SCI_NAME == "Lasiurus cinereus", "geometry"] = pg.difference(laci.geometry.values.data, Hawaii)

# add in alias of Myotis melanorhinus to Myotis ciliolabrum
sci_name_lut["Myotis melanorhinus"] = "myci"

# translate scientific names to species codes
range_df["species"] = range_df.SCI_NAME.map(sci_name_lut)

# dissolve on species: one row per species code
range_df = range_df.dissolve(by="species").reset_index()
def __init__(self, left, right, heights=None, distance=10, tick_length=50, verbose=True):
    """Measure width, openness and (optionally) height profiles along lines.

    Points are sampled along every line in ``left``; at each sampled point two
    ticks are built and intersected with the geometries in ``right``. The
    distance from the sampled point to the nearest intersection on each tick
    drives the per-line statistics.

    Parameters
    ----------
    left : GeoDataFrame
        lines to be profiled (their geometries are measured with
        ``pygeos.length`` and sampled along their length)
    right : GeoDataFrame
        geometries the ticks are intersected with; must expose ``sindex``
    heights : str, list-like or pd.Series, optional
        heights of the ``right`` geometries; a ``str`` is looked up as a column
        of ``right``, any other non-Series value is wrapped in a ``pd.Series``
    distance : int or float
        divisor of the line length giving the number of sampled points
        (``int(length // distance)`` points spread evenly from start to end)
    tick_length : int or float
        length of the tick construction; also the fallback width
        (``tick_length / 2`` per side) when a side has no intersection
    verbose : bool
        accepted but not used in this method

    Attributes set
    --------------
    w : mean width per line
    wd : standard deviation of the tick distances per line (NaN filled with 0)
    o : share of ticks without any intersection per line (NaN filled with 1)
    h, hd, p : mean height, height deviation and ``h / w`` ratio; only set when
        ``heights`` is given
    heights : only set when ``heights`` was a ``str`` or a non-Series value
    """
    self.left = left
    self.right = right
    self.distance = distance
    self.tick_length = tick_length

    pygeos_lines = left.geometry.values.data

    list_points = np.empty((0, 2))  # coordinates of all sampled points, all lines stacked
    ids = []  # positional line id, repeated twice per sampled point (one per tick)
    end_markers = []  # True for the first / last sampled point of a line

    lengths = pygeos.length(pygeos_lines)
    for ix, (line, length) in enumerate(zip(pygeos_lines, lengths)):

        pts = pygeos.line_interpolate_point(
            line, np.linspace(0, length, num=int((length) // distance)))
        list_points = np.append(list_points,
                                pygeos.get_coordinates(pts),
                                axis=0)
        if len(pts) > 1:
            ids += [ix] * len(pts) * 2
            markers = [True] + ([False] * (len(pts) - 2)) + [True]
            end_markers += markers
        elif len(pts) == 1:
            end_markers += [True]
            ids += [ix] * 2

    ticks = []
    # enumerate starts at 1 so that list_points[num] is the NEXT sampled point
    for num, (pt, end) in enumerate(zip(list_points, end_markers), 1):
        if end:
            # end points get two degenerate (zero-length) ticks as placeholders
            ticks.append([pt, pt])
            ticks.append([pt, pt])
        else:
            # two ticks per point, one towards each of the computed tick ends
            # (geometry of the ends is defined by _getAngle/_getPoint1/_getPoint2)
            angle = self._getAngle(pt, list_points[num])
            line_end_1 = self._getPoint1(pt, angle, tick_length / 2)
            angle = self._getAngle(line_end_1, pt)
            line_end_2 = self._getPoint2(line_end_1, angle, tick_length)
            ticks.append([line_end_1, pt])
            ticks.append([line_end_2, pt])

    ticks = pygeos.linestrings(ticks)

    # inp: tick positions, res: positions of the intersecting `right` geometries
    inp, res = right.sindex.query_bulk(ticks, predicate="intersects")
    intersections = pygeos.intersection(ticks[inp],
                                        right.geometry.values.data[res])
    # two ticks per sampled point, hence inp // 2 maps a tick back to its point
    distances = pygeos.distance(intersections,
                                pygeos.points(list_points[inp // 2]))
    # group the hits per tick (inp comes grouped, so split at the cumulative counts)
    inp_uni, inp_cts = np.unique(inp, return_counts=True)
    splitter = np.cumsum(inp_cts)[:-1]
    dist_per_res = np.split(distances, splitter)
    inp_per_res = np.split(res, splitter)

    # nearest hit (distance and `right` position) per tick
    min_distances = []
    min_inds = []
    for dis, ind in zip(dist_per_res, inp_per_res):
        min_distances.append(np.min(dis))
        min_inds.append(ind[np.argmin(dis)])

    # NaN marks ticks without any intersection
    dists = np.zeros((len(ticks), ))
    dists[:] = np.nan
    dists[inp_uni] = min_distances

    if heights is not None:
        if isinstance(heights, str):
            heights = self.heights = right[heights]
        elif not isinstance(heights, pd.Series):
            heights = self.heights = pd.Series(heights)

        # position of the nearest `right` geometry per tick, NaN where none was hit
        blgs = np.zeros((len(ticks), ))
        blgs[:] = None
        blgs[inp_uni] = min_inds
        do_heights = True
    else:
        do_heights = False

    ids = np.array(ids)
    widths = []
    openness = []
    deviations = []
    heights_list = []
    heights_deviations_list = []

    for i in range(len(left)):
        f = ids == i  # ticks belonging to line i
        s = dists[f]
        # ticks alternate between the two sides of the line
        lefts = s[::2]
        rights = s[1::2]
        # a side without any hit falls back to half the tick length
        left_mean = np.nanmean(
            lefts) if ~np.isnan(lefts).all() else tick_length / 2
        right_mean = (np.nanmean(rights)
                      if ~np.isnan(rights).all() else tick_length / 2)
        widths.append(np.mean([left_mean, right_mean]) * 2)
        openness.append(np.isnan(s).sum() / (f).sum())
        deviations.append(np.nanstd(s))

        if do_heights:
            b = blgs[f]
            h = heights.iloc[b[~np.isnan(b)]]
            heights_list.append(h.mean())
            heights_deviations_list.append(h.std())

    self.w = pd.Series(widths, index=left.index)
    self.wd = pd.Series(deviations, index=left.index).fillna(
        0)  # fill for empty intersections
    self.o = pd.Series(openness, index=left.index).fillna(1)

    if do_heights:
        self.h = pd.Series(heights_list, index=left.index).fillna(
            0)  # fill for empty intersections
        self.hd = pd.Series(heights_deviations_list,
                            index=left.index).fillna(
                                0)  # fill for empty intersections
        self.p = self.h / self.w.replace(0,
                                         np.nan)  # replace to avoid np.inf
def summarize_by_areas(df, state, rank_only=False):
    """Calculate acres by value and area-weighted value for each CHAT field in fields.

    Note: if ``df`` has an unnamed index, the index is renamed to "index" in
    place before it is reset into a column.

    Parameters
    ----------
    df : GeoDataFrame
        area(s) of interest
    state : str, one of ['ok', 'tx']
        selects the ``{state}chat.feather`` file and the ``{state}chat`` entry
        of ``INPUTS``
    rank_only : bool (default False)
        if True, will only calculate areas for CHAT Rank

    Returns
    -------
    DataFrame or None
        columns for total_acres, analysis_acres, chat_acres, the area-weighted
        average per field (bare field name) and ``<field>_<n>`` columns holding
        the acres per value of each field.  ``None`` is returned when no area
        of interest overlaps CHAT with a non-zero area.
    """

    if not df.index.name:
        df.index.name = "index"
    index_name = df.index.name

    df = df.reset_index()

    chat_df = gp.read_feather(chat_dir / f"{state}chat.feather")

    fields = ["chatrank"]
    if not rank_only:
        fields += [e["id"] for e in INPUTS[f"{state}chat"]["indicators"]]

    print("Intersecting with CHAT...")
    # intersected geometries end up in the "geometry_right" column
    chat_df = intersection(df, chat_df)
    chat_df["acres"] = pg.area(chat_df.geometry_right.values.data) * M2_ACRES
    chat_df = chat_df.loc[chat_df.acres > 0].copy()

    if not len(chat_df):
        return None

    # total acres of each area of interest that overlaps CHAT at all
    total_acres = df.loc[df[index_name].isin(chat_df[index_name])].set_index(index_name)
    total_acres["total_acres"] = pg.area(total_acres.geometry.values.data) * M2_ACRES

    results = pd.DataFrame(
        chat_df.groupby(index_name).acres.sum().rename("chat_acres")
    ).join(total_acres[["total_acres"]], how="left")

    # intersect edge units with SE input areas to determine areas outside
    # (edge units: CHAT covers more than 1 acre less than the unit's total area)
    edge_df = explode(
        df.loc[
            df[index_name].isin(
                results.loc[(results.chat_acres < results.total_acres - 1)].index
            )
        ].copy()[[index_name, "geometry"]]
    )

    print("Intersecting with input areas, this may take a while...")
    input_df = gp.read_feather(input_filename).reset_index(drop=True)

    # this is inverted because input_df performs better if prepared (left side)
    # note: we don't do intersection() here because of topology errors
    left = pd.Series(input_df.geometry.values.data, index=input_df.index)
    right = pd.Series(edge_df.geometry.values.data, index=edge_df.index)
    intersects = sjoin_geometry(left, right, predicate="intersects")

    tmp = input_df.loc[intersects.index.unique()]
    # have to make valid first or fails with topology errors
    tmp.geometry = pg.make_valid(tmp.geometry.values.data)
    # clip to general area, otherwise intersection takes a way long time
    clip_box = pg.box(*pg.total_bounds(edge_df.geometry.values.data))
    tmp.geometry = pg.intersection(tmp.geometry.values.data, clip_box)

    tmp = tmp.join(intersects, how="inner").join(
        edge_df, on="index_right", rsuffix="_right"
    )
    tmp.geometry_right = pg.intersection(
        tmp.geometry.values.data, tmp.geometry_right.values.data
    )
    tmp["acres"] = pg.area(tmp.geometry_right.values.data) * M2_ACRES
    analysis_acres = (
        tmp.groupby(index_name)
        .acres.sum()
        .round(ACRES_PRECISION)
        .rename("analysis_acres")
    )

    # join analysis acres back to results
    results = results.join(analysis_acres)
    # units that were not edge units are taken to lie fully inside the analysis area
    results.loc[results.analysis_acres.isnull(), "analysis_acres"] = results.total_acres

    area_results = dict()
    avg_results = dict()
    for field in fields:
        # Note: values are categorical, so this will add 0 area values for each category
        grouped = (
            chat_df.groupby([index_name, field])
            .acres.sum()
            .fillna(0)
            .round(ACRES_PRECISION)
            .reset_index()
        )

        # create an array of [<acres for value 0>, <acres for value 1>, ...]
        area_results[field] = grouped.groupby(index_name).acres.apply(np.array)

        # exclude nodata to calculate area-weighted average
        values = grouped.loc[grouped[field] > 0].set_index(index_name)
        # NOTE: rebinds total_acres; the frame built above is no longer needed here
        total_acres = values.groupby(level=0).acres.sum().rename("total")
        values = values.join(total_acres)
        values["wtd_value"] = (values.acres / values.total) * values[field].astype(
            "uint8"
        )

        avg_results[field] = values.groupby(level=0).wtd_value.sum().round(1)

    area_results = pd.DataFrame(area_results)
    avg_results = pd.DataFrame(avg_results)

    results = results.join(avg_results).fillna(0)

    for field in fields:
        # convert areas array to columns
        s = area_results[field].apply(pd.Series)
        s.columns = [f"{field}_{c}" for c in s.columns]
        # drop any that are all 0; these are not present
        s = s.drop(columns=s.columns[s.max() == 0].tolist())
        results = results.join(s)

    return results
# Spatially join dams to flowlines: one row per (damID, lineID) pair, carrying
# both the dam geometry and the flowline geometry (as "flowline").
dams = (
    pd.DataFrame(
        sjoin_geometry(
            pd.Series(dams.geometry.values.data, index=dams.index),
            pd.Series(flowlines.geometry.values.data, flowlines.index),
        ).rename("lineID")
    )
    .reset_index()
    .join(dams.geometry, on="damID")
    .join(flowlines.geometry.rename("flowline"), on="lineID")
).reset_index(drop=True)
print(f"Found {len(dams):,} joins in {time() - join_start:.2f}s")

print("Extracting intersecting flowlines...")
# Only keep the joins for lines that significantly cross (have a line / multiline and not a point)
clipped = pg.intersection(dams.geometry.values.data, dams.flowline.values.data)
t = pg.get_type_id(clipped)
# type id 1 / 5: the line and multiline results referred to above
dams = dams.loc[(t == 1) | (t == 5)].copy()

# find all joins for lines that start or end at these dams
j = find_joins(
    joins,
    dams.lineID.unique(),
    downstream_col="downstream_id",
    upstream_col="upstream_id",
)

def find_upstreams(ids):
    if len(ids) == 1:
        return ids
def test_intersection():
    """Two overlapping boxes intersect in the box spanning their shared corner."""
    first = pygeos.box(0, 0, 10, 10)
    second = pygeos.box(5, 5, 20, 20)

    overlap = pygeos.intersection(first, second)

    assert pygeos.equals(overlap, pygeos.box(5, 5, 10, 10))
import numpy as np
from pygeos import box, area, intersection

# Five boxes shifted along x (xmin 0..4, xmax 10..14) and five shifted along y
polygons_x = box(range(5), 0, range(10, 15), 10)
polygons_y = box(0, range(5), 10, range(10, 15))

# Adding a new axis on opposite sides pairs every x-box with every y-box, so
# this evaluates the intersection area for all 5 x 5 combinations at once.
area(intersection(polygons_x[:, np.newaxis], polygons_y[np.newaxis, :]))
def cut_lines_by_waterbodies(flowlines, joins, waterbodies, wb_joins, out_dir): """ Cut lines by waterbodies. 1. Intersects all previously intersected flowlines with waterbodies. 2. For those that cross but are not completely contained by waterbodies, cut them. 3. Evaluate the cuts, only those that have substantive cuts inside and outside are retained as cuts. 4. Any flowlines that are not contained or crossing waterbodies are dropped from joins Parameters ---------- flowlines : GeoDataFrame joins : DataFrame flowline joins waterbodies : GeoDataFrame wb_joins : DataFrame waterbody flowline joins outdir : pathlib.Path output directory for writing error files, if needed Returns ------- tuple of (GeoDataFrame, DataFrame, GeoDataFrame, DataFrame) (flowlines, joins, waterbodies, waterbody joins) """ start = time() fl_geom = flowlines.loc[flowlines.index.isin(wb_joins.lineID), ["geometry"]].copy() # Many waterbodies have interior polygons (islands); these break the analysis below for cutting lines # Extract a new polygon of just their outer boundary wb_geom = waterbodies[["geometry"]].copy() wb_geom["waterbody"] = pg.polygons(pg.get_exterior_ring(wb_geom.geometry)) print("Validating waterbodies...") ix = ~pg.is_valid(wb_geom.waterbody) invalid_count = ix.sum() if invalid_count: print("{:,} invalid waterbodies found, repairing...".format(invalid_count)) # Buffer by 0 to fix # TODO: may need to do this by a small fraction and simplify instead repair_start = time() wb_geom.loc[ix, "waterbody"] = pg.buffer(wb_geom.loc[ix].waterbody, 0) waterbodies.loc[ix, "geometry"] = wb_geom.loc[ix].waterbody print("Repaired geometry in {:.2f}s".format(time() - repair_start)) # Set indices and create combined geometry object for analysis wb_joins = wb_joins.set_index(["lineID", "wbID"]) geoms = wb_joins.join(fl_geom, how="inner").join(wb_geom.waterbody) ### Find contained geometries print( "Identifying flowlines completely within waterbodies out of {:,} flowline / waterbody 
combinations...".format( len(geoms) ) ) contained_start = time() geoms["inside"] = pg.contains(geoms.waterbody.values, geoms.geometry.values) print( "Identified {:,} flowlines completely contained by waterbodies in {:.2f}s".format( geoms.inside.sum(), time() - contained_start ) ) # Check for logic errors - no flowline should be completely contained by more than 1 waterbody errors = geoms.groupby(level=[0]).inside.sum().astype("uint8") > 1 if errors.max(): # this most likely indicates duplicate waterbodies, which should have been resolved before this print( "ERROR: major logic error - some flowlines claim to be completely contained by multiple waterbodies" ) print( "===> error flowlines written to {}/contained_errors.feather".format( out_dir ) ) to_geofeather( flowlines.loc[flowlines.index.isin(errors)], out_dir / "contained_errors.feather", crs=CRS, ) ### Check those that aren't contained to see if they cross print("Determining which flowlines actually cross into waterbodies...") cross_start = time() geoms = geoms.loc[~geoms.inside].copy() geoms["crosses"] = pg.crosses(geoms.geometry, geoms.waterbody) outside = geoms.loc[~(geoms["crosses"] | geoms.inside)].index # keep the ones that cross for further processing geoms = geoms.loc[geoms.crosses].copy() print( "Identified {:,} flowlines completely outside waterbodies and {:,} flowlines that cross waterbody boundaries in {:.2f}s".format( len(outside), len(geoms), time() - cross_start ) ) # Any that do not cross and are not completely within waterbodies should be dropped now # Can only drop joins by BOTH lineID and wbID (the index here) # Also drop associated waterbodies that no longer have joins wb_joins = wb_joins.loc[~wb_joins.index.isin(outside)].copy() # FIXME: for closely adjacent waterbodies, these are important to keep # Need to cut them by their multiple polys, update their joins, and feed back into following analysis # pg.intersection_all might work here # check for multiple crossings - these are errors from 
NHD that we can drop from here errors = geoms.groupby(level=0).size() > 1 if errors.max(): print( "Found {:,} flowlines that cross multiple waterbodies. These are bad data and will be dropped from waterbody intersection.".format( errors.sum() ) ) to_geofeather( flowlines.loc[errors.index].reset_index(), out_dir / "error_crosses_multiple.feather", crs=CRS, ) # completely remove the flowlines from intersections and drop the waterbodies wb_joins = wb_joins.loc[ ~wb_joins.index.get_level_values(0).isin(errors.loc[errors].index) ].copy() waterbodies = waterbodies.loc[ waterbodies.index.isin(wb_joins.index.get_level_values(1)) ].copy() geoms = geoms.loc[geoms.index.isin(wb_joins.index)].copy() print("Calculating geometric intersection of flowlines and waterbodies...") int_start = time() geoms = geoms[["geometry", "waterbody"]].join(flowlines.length.rename("origLength")) # First, calculate the geometric intersection between the lines and waterbodies # WARNING: this intersection may return LineString, MultiLineString, Point, GeometryCollection geoms["intersection"] = pg.intersection(geoms.geometry, geoms.waterbody) types = pg.get_type_id(geoms.intersection) # NOTE: all the points should be captured by the above logic for crosses is_point = types.isin([0, 4]) is_line = types.isin([1, 5]) others = types[~(is_point | is_line)].unique() # GeometryCollection indicates a mess, skip those if len(others): print( "WARNING: Found other types of geometric intersection: {} (n={:,}), these will be dropped".format( others, len(types[~(is_point | is_line)]) ) ) # Any that intersect only at a point are OUTSIDE outside = geoms.loc[is_point].index # TODO: confirm this works wb_joins = wb_joins.loc[~wb_joins.index.isin(outside)].copy() print("Identified {:,} more flowlines outside waterbodies".format(len(outside))) # Drop those that are not lines from further analysis geoms = geoms.loc[is_line].copy() # Inspect amount of overlay - if the intersected length is within 1m of final length, it is 
completely within # if it is near 0, it is completely outside geoms["length"] = pg.length(geoms.intersection) outside = geoms.length < 1 inside = (geoms.origLength - geoms.length).abs() < 1 print( "Found {:,} more completely outside, {:,} completely inside".format( outside.sum(), inside.sum() ) ) # drop the ones that are outside wb_joins = wb_joins.loc[~wb_joins.index.isin(outside[outside].index)].copy() # cut the ones that aren't completely inside or outside geoms = geoms.loc[~(inside | outside)].copy() print("Done evaluating intersection in {:.2f}s".format(time() - int_start)) if len(geoms): print("Cutting {:,} flowlines ...".format(len(geoms))) cut_start = time() geoms = geoms[["geometry", "waterbody", "origLength"]] # WARNING: difference is not precise, the point of split is not exactly at the intersection between lines # but within some tolerance. This will cause them to fail the contains() test below. boundary = pg.boundary(geoms.waterbody) geoms["geometry"] = pg.difference(geoms.geometry, boundary) errors = ~pg.is_valid(geoms.geometry) if errors.max(): print("WARNING: geometry errors for {:,} cut lines".format(errors.sum())) length = pg.length(geoms.geometry) errors = (length - geoms.origLength).abs() > 1 if errors.max(): print( "WARNING: {:,} lines were not completely cut by waterbodies (maybe shared edge?).\nThese will not be cut".format( errors.sum() ) ) to_geofeather( flowlines.loc[ errors.loc[errors].index.get_level_values(0).unique() ].reset_index(), out_dir / "error_incomplete_cut.feather", crs=CRS, ) # remove these from the cut geoms and retain their originals geoms = geoms.loc[~errors].copy() # Explode the multilines into single line segments geoms["geometry"] = explode(geoms.geometry) geoms = geoms.explode("geometry") # mark those parts of the cut lines that are within waterbodies # WARNING: this is not capturing all that should be inside after cutting! 
geoms["iswithin"] = pg.contains(geoms.waterbody, geoms.geometry) errors = geoms.groupby(level=0).iswithin.max() == False if errors.max(): print( "WARNING: {:,} flowlines that cross waterbodies had no parts contained within those waterbodies".format( errors.sum() ) ) to_geofeather( flowlines.loc[errors.index].reset_index(), out_dir / "error_crosses_but_not_contained.feather", crs=CRS, ) # If they cross, assume they are within print("Attempting to correct these based on which ones cross") ix = geoms.loc[ geoms.index.get_level_values(0).isin(errors.loc[errors].index) ].index geoms.loc[ix, "iswithin"] = pg.crosses( geoms.loc[ix].geometry, geoms.loc[ix].waterbody ) errors = geoms.groupby(level=0).iswithin.max() == False print("{:,} still have no part in a waterbody".format(errors.sum())) # calculate total length of within and outside parts geoms["length"] = pg.length(geoms.geometry) # drop any new segments that are < 1m, these are noise print("Dropping {:,} new segments < 1m".format((geoms.length < 1).sum())) geoms = geoms.loc[geoms.length >= 1].copy() if len(geoms) > 1: length = geoms.groupby(["lineID", "wbID", "iswithin"]).agg( {"length": "sum", "origLength": "first"} ) # Anything within 1 meter of original length is considered unchanged # This is so that we ignore slivers length["unchanged"] = (length.origLength - length["length"]).abs() < 1 unchanged = ( length[["unchanged"]] .reset_index() .groupby(["lineID", "wbID"]) .unchanged.max() .rename("max_unchanged") ) unchanged = ( length.reset_index().set_index(["lineID", "wbID"]).join(unchanged) ) is_within = ( unchanged.loc[unchanged.max_unchanged] .reset_index() .set_index(["lineID", "wbID"]) .iswithin ) # For any that are unchanged and NOT within waterbodies, # remove them from wb_joins ix = is_within.loc[~is_within].index wb_joins = wb_joins.loc[~wb_joins.index.isin(ix)].copy() # Remove any that are unchanged from intersection analysis geoms = geoms.loc[~geoms.index.isin(is_within.index)].copy() print( "Created {:,} 
new flowlines by splitting {:,} flowlines at waterbody edges in {:.2f}".format( len(geoms), len(geoms.index.get_level_values(0).unique()), time() - cut_start, ) ) if len(geoms) > 1: ### These are our final new lines to add # remove their lineIDs from flowlines and append # replace their outer joins to these ones and add intermediates # Join in previous line information from flowlines new_lines = ( geoms[["geometry", "length", "iswithin"]] .reset_index() .set_index("lineID") .join(flowlines.drop(columns=["geometry", "length", "sinuosity"])) .reset_index() .rename(columns={"lineID": "origLineID", "iswithin": "waterbody"}) ) error = ( new_lines.groupby("origLineID").wbID.unique().apply(len).max() > 1 ) if error: # Watch for errors - if a flowline is cut by multiple waterbodies # there will be problems with our logic for splicing in new lines # also - our intersection logic above is wrong print( """\n========\n MAJOR LOGIC ERROR: multiple waterbodies associated with a single flowline that as been cut. 
\n========\n """ ) # recalculate length and sinuosity new_lines["length"] = pg.length(new_lines.geometry).astype("float32") new_lines["sinuosity"] = calculate_sinuosity(new_lines.geometry).astype( "float32" ) # calculate new IDS next_segment_id = int(flowlines.index.max() + 1) new_lines["lineID"] = next_segment_id + new_lines.index new_lines.lineID = new_lines.lineID.astype("uint32") ### Update waterbody joins # remove joins replaced by above ix = new_lines.set_index(["origLineID", "wbID"]).index wb_joins = wb_joins.loc[~wb_joins.index.isin(ix)].copy() # add new joins wb_joins = ( wb_joins.reset_index() .append( new_lines.loc[new_lines.waterbody, ["lineID", "wbID"]], ignore_index=True, sort=False, ) .set_index(["lineID", "wbID"]) ) ### Update flowline joins # transform new lines to create new joins l = new_lines.groupby("origLineID").lineID # the first new line per original line is the furthest upstream, so use its # ID as the new downstream ID for anything that had this origLineID as its downstream first = l.first().rename("new_downstream_id") # the last new line per original line is the furthest downstream... 
last = l.last().rename("new_upstream_id") # Update existing joins with the new lineIDs we created at the upstream or downstream # ends of segments we just created joins = update_joins( joins, first, last, downstream_col="downstream_id", upstream_col="upstream_id", ) ### Create new line joins for any that weren't inserted above # Transform all groups of new line IDs per original lineID, wbID # into joins structure pairs = lambda a: pd.Series(zip(a[:-1], a[1:])) new_joins = ( new_lines.groupby(["origLineID", "wbID"]) .lineID.apply(pairs) .apply(pd.Series) .reset_index() .rename(columns={0: "upstream_id", 1: "downstream_id"}) .join( flowlines[["NHDPlusID", "loop"]].rename( columns={"NHDPlusID": "upstream"} ), on="origLineID", ) ) # NHDPlusID is same for both sides new_joins["downstream"] = new_joins.upstream new_joins["type"] = "internal" new_joins = new_joins[ [ "upstream", "downstream", "upstream_id", "downstream_id", "type", "loop", ] ] joins = joins.append( new_joins, ignore_index=True, sort=False ).sort_values(["downstream_id", "upstream_id"]) ### Update flowlines # remove originals now replaced by cut versions here flowlines = ( flowlines.loc[~flowlines.index.isin(new_lines.origLineID)] .reset_index() .append( new_lines[["lineID"] + list(flowlines.columns) + ["waterbody"]], ignore_index=True, sort=False, ) .sort_values("lineID") .set_index("lineID") ) # End cut geometries # Update waterbody bool for other flowlines based on those that completely intersected # above flowlines.loc[ flowlines.index.isin(wb_joins.index.get_level_values(0).unique()), "waterbody" ] = True flowlines.waterbody = flowlines.waterbody.fillna(False) ### Update waterbodies and calculate flowline stats wb_joins = wb_joins.reset_index() stats = ( wb_joins.join(flowlines.length.rename("flowlineLength"), on="lineID") .groupby("wbID") .flowlineLength.sum() .astype("float32") ) waterbodies = waterbodies.loc[waterbodies.index.isin(wb_joins.wbID)].join(stats) print("Done cutting flowlines by 
waterbodies in {:.2f}s".format(time() - start)) return flowlines, joins, waterbodies, wb_joins
def cut_lines_by_waterbodies(flowlines, joins, waterbodies, next_lineID):
    """
    Cut lines by waterbodies.

    1. Finds all intersections between waterbodies and flowlines.
    2. For those that cross but are not completely contained by waterbodies, cut them.
    3. Evaluate the cuts, only those that have substantive cuts inside and outside are retained as cuts.
    4. Any flowlines that are not contained or crossing waterbodies are dropped from wb_joins

    Parameters
    ----------
    flowlines : GeoDataFrame
        indexed by lineID
    joins : DataFrame
        flowline joins
    waterbodies : GeoDataFrame
        indexed by wbID
    next_lineID : int
        next lineID; must be greater than all prior lines in region

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame, DataFrame)
        (flowlines, joins, waterbody joins).  The waterbody joins hold one row
        per unique (lineID, wbID) pair for flowlines that are contained by, or
        at least 50% within, a waterbody.  ``flowlines`` gains a boolean
        ``waterbody`` column flagging lines present in those joins.

    Raises
    ------
    ValueError
        If any flowline is contained by more than one waterbody.
    """

    start = time()

    ### Find flowlines that intersect waterbodies
    join_start = time()
    tree = pg.STRtree(flowlines.geometry.values.data)
    left, right = tree.query_bulk(
        waterbodies.geometry.values.data, predicate="intersects"
    )
    df = pd.DataFrame(
        {
            "lineID": flowlines.index.take(right),
            "flowline": flowlines.geometry.values.data.take(right),
            "wbID": waterbodies.index.take(left),
            "waterbody": waterbodies.geometry.values.data.take(left),
        }
    )
    print(
        f"Found {len(df):,} waterbody / flowline joins in {time() - join_start:.2f}s"
    )

    ### Find those that are completely contained; these don't need further processing
    pg.prepare(df.waterbody.values)

    # find those that are fully contained and do not touch the edge of the waterbody (contains_properly predicate)
    # contains_properly is very fast
    contained_start = time()
    df["contains"] = pg.contains_properly(df.waterbody.values, df.flowline.values)
    print(
        f"Identified {df.contains.sum():,} flowlines fully within waterbodies in {time() - contained_start:.2f}s"
    )

    # find those that aren't fully contained by contained and touch the edge of waterbody (contains predicate)
    contained_start = time()
    ix = ~df.contains
    tmp = df.loc[ix]
    df.loc[ix, "contains"] = pg.contains(tmp.waterbody, tmp.flowline)
    print(
        f"Identified {df.loc[ix].contains.sum():,} more flowlines contained by waterbodies in {time() - contained_start:.2f}s"
    )

    # Sanity check: flowlines should only ever be contained by one waterbody
    if df.loc[df.contains].groupby("lineID").size().max() > 1:
        raise ValueError("ERROR: one or more lines contained by multiple waterbodies")

    # for any that are not completely contained, find the ones that overlap
    crosses_start = time()
    df["crosses"] = False
    ix = ~df.contains
    tmp = df.loc[ix]
    df.loc[ix, "crosses"] = pg.crosses(tmp.waterbody, tmp.flowline)
    print(
        f"Identified {df.crosses.sum():,} flowlines that cross edge of waterbodies in {time() - crosses_start:.2f}s"
    )

    # discard any that only touch (ones that don't cross or are contained)
    # note that we only cut the ones that cross below; contained ones are left intact
    df = df.loc[df.contains | df.crosses].copy()

    print("Intersecting flowlines and waterbodies...")
    cut_start = time()
    ix = df.crosses
    tmp = df.loc[ix]
    df["geometry"] = df.flowline
    # use intersection to cut flowlines by waterbodies.  Note: this may produce
    # nonlinear (e.g., geom collection) results
    df.loc[ix, "geometry"] = pg.intersection(tmp.flowline, tmp.waterbody)
    df["length"] = pg.length(df.geometry)
    df["flength"] = pg.length(df.flowline)

    # Cut lines that are long enough and different enough from the original lines
    df["to_cut"] = False
    tmp = df.loc[df.crosses]
    keep = (
        tmp.crosses
        & (tmp.length >= CUT_TOLERANCE)
        & ((tmp.flength - tmp.length).abs() >= CUT_TOLERANCE)
    )
    df.loc[keep[keep].index, "to_cut"] = True

    # fraction of each flowline's length that falls inside the waterbody
    df["inside"] = (df.length / df.flength).clip(0, 1)
    print(
        f"Found {df.to_cut.sum():,} segments that need to be cut by flowlines in {time() - cut_start:.2f}s"
    )

    # save all that are completely contained or mostly contained.
    # They must be at least 50% in waterbody to be considered mostly contained.
    # Note: there are some that are mostly outside and we exclude those here.
    # We then update this after cutting
    contained = df.loc[df.inside >= 0.5, ["wbID", "lineID"]].copy()

    ### Cut lines
    if df.to_cut.sum():
        # only work with those to cut from here on out
        df = df.loc[df.to_cut, ["lineID", "flowline", "wbID", "waterbody"]].reset_index(
            drop=True
        )

        # save waterbody ids to re-evaluate intersection after cutting
        wbID = df.wbID.unique()

        # extract all intersecting interior rings for these waterbodies
        print("Extracting interior rings for intersected waterbodies")
        wb = waterbodies.loc[waterbodies.index.isin(wbID)]
        outer_index, _, rings = get_interior_rings(wb.geometry.values.data)

        if len(outer_index):
            # find the pairs of waterbody rings and lines to add
            rings = np.asarray(rings)
            wb_with_rings = wb.index.values.take(outer_index)
            lines_in_wb = df.loc[df.wbID.isin(wb_with_rings)].lineID.unique()
            lines_in_wb = flowlines.loc[flowlines.index.isin(lines_in_wb)].geometry
            tree = pg.STRtree(rings)
            left, right = tree.query_bulk(
                lines_in_wb.values.data, predicate="intersects"
            )
            tmp = pd.DataFrame(
                {
                    "lineID": lines_in_wb.index.values.take(left),
                    "flowline": lines_in_wb.values.data.take(left),
                    "wbID": wb_with_rings.take(right),
                    "waterbody": rings.take(right),
                }
            )
            # pd.concat instead of DataFrame.append (removed in pandas 2.0)
            df = pd.concat([df, tmp], ignore_index=True, sort=False)

        # extract the outer ring for original waterbodies
        # (type id 3 is Polygon; interior rings added above are already rings)
        ix = pg.get_type_id(df.waterbody.values.data) == 3
        df.loc[ix, "waterbody"] = pg.get_exterior_ring(
            df.loc[ix].waterbody.values.data
        )

        # Calculate all geometric intersections between the flowlines and
        # waterbody rings and drop any that are not points
        # Note: these may be multipoints where line crosses the ring of waterbody
        # multiple times.
        # We ignore any shared edges, etc that result from the intersection; those
        # aren't helpful for cutting the lines
        print("Finding cut points...")
        df["geometry"] = pg.intersection(df.flowline.values, df.waterbody.values)
        # NOTE(review): explode is applied twice, presumably to flatten nested
        # collections down to single-part geometries - confirm against `explode`
        df = explode(
            explode(
                gp.GeoDataFrame(
                    df[["geometry", "lineID", "flowline"]], crs=flowlines.crs
                )
            )
        ).reset_index()
        # type id 0 is Point
        points = (
            df.loc[pg.get_type_id(df.geometry.values.data) == 0]
            .set_index("lineID")
            .geometry
        )

        print("cutting flowlines")
        cut_start = time()
        flowlines, joins = cut_flowlines_at_points(
            flowlines, joins, points, next_lineID=next_lineID
        )
        new_flowlines = flowlines.loc[flowlines.new]
        print(
            f"{len(new_flowlines):,} new flowlines created in {time() - cut_start:,.2f}s"
        )

        if len(new_flowlines):
            # remove any flowlines no longer present (they were replaced by cut lines)
            contained = contained.loc[
                contained.lineID.isin(flowlines.loc[~flowlines.new].index.unique())
            ].copy()

            contained_start = time()
            # recalculate overlaps with waterbodies
            print("Recalculating overlaps with waterbodies")
            wb = waterbodies.loc[wbID]
            tree = pg.STRtree(new_flowlines.geometry.values.data)
            left, right = tree.query_bulk(
                wb.geometry.values.data, predicate="intersects"
            )
            df = pd.DataFrame(
                {
                    "lineID": new_flowlines.index.take(right),
                    "flowline": new_flowlines.geometry.values.data.take(right),
                    "wbID": wb.index.take(left),
                    "waterbody": wb.geometry.values.data.take(left),
                }
            )
            pg.prepare(df.waterbody.values)
            df["contains"] = pg.contains(df.waterbody.values, df.flowline.values)
            print(
                f"Identified {df.contains.sum():,} more flowlines contained by waterbodies in {time() - contained_start:.2f}s"
            )

            # some aren't perfectly contained, add those that are mostly in
            df["crosses"] = False
            ix = ~df.contains
            tmp = df.loc[ix]
            df.loc[ix, "crosses"] = pg.crosses(tmp.waterbody, tmp.flowline)

            # discard any that only touch (don't cross or are contained)
            df = df.loc[df.contains | df.crosses].copy()

            # recompute the mask on the filtered frame rather than reusing the
            # pre-filter `~df.contains` mask, so mask and values share one index
            ix = df.crosses
            tmp = df.loc[ix]
            df["geometry"] = df.flowline
            # use intersection to cut flowlines by waterbodies.  Note: this may produce
            # nonlinear (e.g., geom collection) results
            df.loc[ix, "geometry"] = pg.intersection(tmp.flowline, tmp.waterbody)
            df["length"] = pg.length(df.geometry)
            df["flength"] = pg.length(df.flowline)

            # keep any that are contained or >= 50% in waterbody
            # (pd.concat instead of DataFrame.append, removed in pandas 2.0)
            contained = pd.concat(
                [
                    contained,
                    df.loc[
                        df.contains | ((df.length / df.flength) >= 0.5),
                        ["wbID", "lineID"],
                    ],
                ],
                ignore_index=True,
                sort=False,
            )

        # "new" is only a temporary marker for the cut step
        flowlines = flowlines.drop(columns=["new"])

    # make sure that updated joins are unique
    joins = joins.drop_duplicates()

    # make sure that wb_joins is unique
    contained = contained.groupby(by=["lineID", "wbID"]).first().reset_index()

    # set flag for flowlines in waterbodies
    flowlines["waterbody"] = flowlines.index.isin(contained.lineID.unique())

    print("Done evaluating waterbody / flowline overlap in {:.2f}s".format(
        time() - start))

    return flowlines, joins, contained
huc4_df = huc4_df.loc[huc4_df.HUC2.isin(huc2)].to_crs(CRS).reset_index( drop=True) # Extract HUC4s that intersect tree = pg.STRtree(huc4_df.geometry.values.data) ix = tree.query(bnd, predicate="intersects") huc4_df = huc4_df.iloc[ix].reset_index(drop=True) # Drop any that are at the edges only and have little overlap tree = pg.STRtree(huc4_df.geometry.values.data) contains_ix = tree.query(bnd, predicate="contains") edge_ix = np.setdiff1d(np.arange(len(huc4_df)), contains_ix) # clip geometries by bnd edge_df = huc4_df.iloc[edge_ix].reset_index(drop=True) edge_df["clipped"] = pg.intersection(bnd, edge_df.geometry.values.data) edge_df["overlap_pct"] = (100 * pg.area(edge_df.clipped.values.data) / pg.area(edge_df.geometry.values.data)) # keep areas that overlap by >= 1% huc4 = np.unique( np.append(huc4_df.iloc[contains_ix].HUC4, edge_df.loc[edge_df.overlap_pct >= 1].HUC4)) huc4_df = huc4_df.loc[huc4_df.HUC4.isin(huc4)].reset_index(drop=True) write_dataframe(huc4_df, out_dir / "huc4.gpkg") huc4_df.to_feather(out_dir / "huc4.feather") # repeat for SARP tree = pg.STRtree(huc4_df.geometry.values.data) intersects_ix = tree.query(sarp_bnd, predicate="intersects") contains_ix = tree.query(sarp_bnd, predicate="contains")
def time_clip_by_box(self):
    """Benchmark ``pygeos.intersection`` of ``self.polygon`` with ``self.boxes``."""
    polygon, boxes = self.polygon, self.boxes
    pygeos.intersection(polygon, boxes)
# select out those within the SA boundary print("Selecting HUC12s in region...") tree = pg.STRtree(huc12.geometry.values.data) ix = tree.query(bnd, predicate="intersects") huc12 = huc12.iloc[ix].copy().reset_index(drop=True) huc12["acres"] = (pg.area(huc12.geometry.values.data) * M2_ACRES).round().astype("uint") # for those at the edge, only keep the ones with > 50% in the extent tree = pg.STRtree(huc12.geometry.values.data) contains_ix = tree.query(bnd, predicate="contains") edge_ix = np.setdiff1d(huc12.index, contains_ix) overlap = pg.area( pg.intersection(huc12.iloc[edge_ix].geometry.values.data, bnd)) / pg.area( huc12.iloc[edge_ix].geometry.values.data) keep_ix = np.append(contains_ix, edge_ix[overlap >= 0.5]) keep_ix.sort() huc12 = huc12.iloc[keep_ix].copy() huc12_wgs84 = huc12.to_crs(GEO_CRS) huc12 = huc12.join(huc12_wgs84.bounds) huc12.to_feather(analysis_dir / "huc12.feather") write_dataframe(huc12, bnd_dir / "huc12.gpkg", driver="GPKG") ### Marine units (already in EPSG:5070) print("Reading marine blocks...") marine = read_dataframe(src_dir / "summary_units/marine_blocks_prj.shp")[[
def enclosures(primary_barriers,
               limit=None,
               additional_barriers=None,
               enclosure_id="eID"):
    """
    Generate enclosures based on passed barriers.

    Enclosures are areas enclosed from all sides by at least one type of
    a barrier. Barriers are typically roads, railways, natural features
    like rivers and other water bodies or coastline. Enclosures are a
    result of polygonization of the ``primary_barrier`` and ``limit`` and its
    subdivision based on additional_barriers.

    Parameters
    ----------
    primary_barriers : GeoDataFrame, GeoSeries
        GeoDataFrame or GeoSeries containing primary barriers.
        (Multi)LineString geometry is expected.
    limit : GeoDataFrame, GeoSeries (default None)
        GeoDataFrame or GeoSeries containing external limit of enclosures,
        i.e. the area which gets partitioned. If None is passed,
        the internal area of ``primary_barriers`` will be used.
        If any geometry is a (Multi)Polygon, the boundary of ``limit`` is used.
    additional_barriers : list of GeoDataFrame or GeoSeries (default None)
        List of GeoDataFrames or GeoSeries containing additional barriers.
        (Multi)LineString geometry is expected.
    enclosure_id : str (default 'eID')
        name of the enclosure_id (to be created).

    Returns
    -------
    enclosures : GeoDataFrame
       GeoDataFrame containing enclosure geometries and enclosure_id

    Raises
    ------
    TypeError
        If ``additional_barriers`` is given but is not a list.

    Examples
    --------
    >>> enclosures = mm.enclosures(streets, admin_boundary, [railway, rivers])

    """
    if limit is not None:
        if limit.geom_type.isin(["Polygon", "MultiPolygon"]).any():
            # polygonization works on lines, so use the outline of the limit
            limit = limit.boundary
        barriers = pd.concat([primary_barriers.geometry, limit.geometry])
    else:
        barriers = primary_barriers

    unioned = barriers.unary_union
    polygons = polygonize(unioned)
    enclosures = gpd.GeoSeries(list(polygons), crs=primary_barriers.crs)

    if additional_barriers is not None:
        if not isinstance(additional_barriers, list):
            raise TypeError(
                "`additional_barriers` expects a list of GeoDataFrames or GeoSeries. "
                f"Got {type(additional_barriers)}.")
        additional = pd.concat([gdf.geometry for gdf in additional_barriers])

        # inp: positions in `additional`; res: positions of intersected enclosures
        inp, res = enclosures.sindex.query_bulk(additional.geometry,
                                                predicate="intersects")
        unique = np.unique(res)

        new = []

        # subdivide every enclosure hit by at least one additional barrier
        for i in unique:
            poly = enclosures.values.data[i]  # get enclosure polygon
            crossing = inp[res == i]  # get relevant additional barriers
            buf = pygeos.buffer(poly, 0.01)  # to avoid floating point errors
            crossing_ins = pygeos.intersection(
                buf, additional.values.data[crossing]
            )  # keeping only parts of additional barriers within polygon
            union = pygeos.union_all(
                np.append(crossing_ins, pygeos.boundary(poly)))  # union
            polygons = np.array(list(polygonize(
                _pygeos_to_shapely(union))))  # polygonize
            within = pygeos.covered_by(
                pygeos.from_shapely(polygons),
                buf)  # keep only those within original polygon
            new += list(polygons[within])

        # replace the subdivided enclosures with their parts
        # (pd.concat instead of Series.append, which was removed in pandas 2.0)
        final_enclosures = (pd.concat(
            [gpd.GeoSeries(enclosures).drop(unique),
             gpd.GeoSeries(new)]).reset_index(drop=True)).set_crs(
                 primary_barriers.crs)

        return gpd.GeoDataFrame({enclosure_id: range(len(final_enclosures))},
                                geometry=final_enclosures)

    return gpd.GeoDataFrame({enclosure_id: range(len(enclosures))},
                            geometry=enclosures)
def remove_marine_flowlines(flowlines, joins, marine):
    """Drop flowlines that begin in, or lie mostly inside, marine areas.

    Two passes are made.  First, every flowline whose first point intersects
    a marine polygon is removed.  Then, among the remaining flowlines whose
    last point intersects a marine polygon, those with >= 90% of their length
    overlapping that polygon are removed as well.

    The ``marine`` column of ``joins`` is set to True for joins whose
    ``downstream_id`` is a removed flowline and for joins whose
    ``upstream_id`` is a flowline that ends in a marine polygon.

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
    marine : GeoDataFrame

    Returns
    -------
    (GeoDataFrame, DataFrame)
        flowlines, joins
    """

    def _drop_lines(lines, line_joins, line_ids):
        # anything that flowed into a removed line now terminates in marine
        line_joins.loc[line_joins.downstream_id.isin(line_ids), "marine"] = True
        remaining = lines.loc[~lines.index.isin(line_ids)].copy()
        line_joins = remove_joins(
            line_joins,
            line_ids,
            downstream_col="downstream_id",
            upstream_col="upstream_id",
        )
        return remaining, line_joins

    marine_geoms = marine.geometry.values.data

    # Pass 1: flowlines whose first point falls in a marine polygon
    start_points = pg.get_point(flowlines.geometry.values.data, 0)
    _, line_pos = pg.STRtree(start_points).query_bulk(
        marine_geoms, predicate="intersects"
    )
    starts_in_marine = flowlines.index.take(np.unique(line_pos))
    print(f"Removing {len(starts_in_marine):,} flowlines that originate in marine areas")
    flowlines, joins = _drop_lines(flowlines, joins, starts_in_marine)

    # Pass 2: flowlines whose last point falls in a marine polygon
    end_points = pg.get_point(flowlines.geometry.values.data, -1)
    marine_pos, line_pos = pg.STRtree(end_points).query_bulk(
        marine_geoms, predicate="intersects"
    )
    ends_in_marine = flowlines.index.take(np.unique(line_pos))
    joins.loc[joins.upstream_id.isin(ends_in_marine), "marine"] = True

    # Of those, measure how much of each line overlaps its marine polygon;
    # lines that are >= 90% marine are removed
    print("Calculating overlap of remaining lines with marine areas")
    candidates = flowlines.iloc[line_pos]
    pairs = pd.DataFrame(
        {
            "lineID": candidates.index,
            "geometry": candidates.geometry.values.data,
            "marine": marine.iloc[marine_pos].geometry.values.data,
        }
    )
    pairs["overlap"] = pg.intersection(pairs.geometry, pairs.marine)
    pairs["pct_overlap"] = 100 * pg.length(pairs.overlap) / pg.length(pairs.geometry)
    mostly_marine = pairs.loc[pairs.pct_overlap >= 90].lineID.unique()
    print(f"Removing {len(mostly_marine):,} flowlines that mostly overlap marine areas")
    flowlines, joins = _drop_lines(flowlines, joins, mostly_marine)

    return flowlines, joins