def annotate_vector(
    vector_path: str,
    raster_paths: list,
    labels: list = None,
    drop_na: bool = True,
) -> gpd.GeoDataFrame:
    """Annotate a point-format vector file with pixel values from rasters.

    Args:
        vector_path: path to a vector file (shp, geojson, etc)
        raster_paths: raster paths to extract pixel values from
        labels: band name labels. should match the total number of bands
            across all raster_paths
        drop_na: drop all records with no-data values

    Returns:
        gdf: GeoDataFrame annotated with the pixel values from each raster
    """
    # standardize the raster list and band labels
    raster_paths = to_iterable(raster_paths)
    labels = format_band_labels(raster_paths, labels)

    # read the vector and sample each raster at its point locations
    vector_gdf = gpd.read_file(vector_path)
    annotations = annotate_geoseries(vector_gdf.geometry, raster_paths, labels, drop_na)

    # append the sampled values as new columns, dropping the duplicate geometry
    annotations_only = annotations.drop(["geometry"], axis=1, errors="ignore")
    gdf = pd.concat([vector_gdf, annotations_only], axis=1)

    return gdf
def annotate(
    points: Union[str, gpd.GeoSeries, gpd.GeoDataFrame],
    raster_paths: Union[str, list],
    labels: list = None,
    drop_na: bool = True,
):
    """Read raster values for each point in a vector and append as new columns.

    Args:
        points: path to a point-format vector, OR GeoDataFrame with point
            locations, OR GeoSeries (e.g., gdf['geometry']) with point locations
        raster_paths: raster paths to extract pixel values from.
        labels: band name labels. number of labels should match the total
            number of bands across all raster_paths.
        drop_na: drop all records with no-data values.

    Returns:
        gdf: GeoDataFrame annotated with the pixel values from each raster

    Raises:
        TypeError: if `points` is not a file path, GeoDataFrame, or GeoSeries.
    """
    # standardize the raster list and band labels
    raster_paths = to_iterable(raster_paths)
    labels = format_band_labels(raster_paths, labels)

    # dispatch on the points dtype, using guard clauses for each supported type
    if isinstance(points, gpd.GeoSeries):
        return annotate_geoseries(
            points,
            raster_paths,
            labels=labels,
            drop_na=drop_na,
        )

    if isinstance(points, (gpd.GeoDataFrame, pd.DataFrame)):
        annotated = annotate_geoseries(
            points.geometry,
            raster_paths,
            labels=labels,
            drop_na=drop_na,
        )
        # append annotations to the input dataframe
        return pd.concat(
            [points, annotated.drop(["geometry"], axis=1, errors="ignore")],
            axis=1,
        )

    if os.path.isfile(points):
        return annotate_vector(points, raster_paths, labels=labels, drop_na=drop_na)

    raise TypeError(
        "points arg must be a valid path, GeoDataFrame, or GeoSeries")
def xy_to_geoseries(x: Union[float, list, np.ndarray],
                    y: Union[float, list, np.ndarray],
                    crs: CRSType = "epsg:4326") -> gpd.GeoSeries:
    """Converts x/y data into a geopandas geoseries.

    Args:
        x: 1-D array-like of x location values
        y: 1-D array-like of y location values
        crs: coordinate reference system. accepts pyproj.CRS / rio.crs.CRS
            objects or anything allowed by pyproj.CRS.from_user_input().
            defaults to WGS84 lat/lon ("epsg:4326"). Fixed: the previous
            default "epsg:4236" was a typo referencing an unrelated CRS
            (Hu Tzu Shan 1950).

    Returns:
        gs: Point geometry geoseries
    """
    # handle single x/y location values by promoting scalars to iterables
    x = to_iterable(x)
    y = to_iterable(y)

    # pair coordinates element-wise into Point geometries
    points = [Point(x, y) for x, y in zip(x, y)]
    gs = gpd.GeoSeries(points, crs=crs)

    return gs
def zonal_stats(
    polygons: Union[gpd.GeoSeries, gpd.GeoDataFrame],
    raster_paths: list,
    labels: list = None,
    all_touched: bool = True,
    mean: bool = True,
    stdv: bool = True,
    min: bool = False,
    max: bool = False,
    count: bool = False,
    sum: bool = False,
    skew: bool = False,
    kurtosis: bool = False,
    mode: bool = False,
    all: bool = False,
    percentiles: list = [],
) -> gpd.GeoDataFrame:
    """Compute raster summary stats for each polygon in a GeoSeries or GeoDataFrame.

    Args:
        polygons: GeoSeries or GeoDataFrame with polygon geometries.
        raster_paths: list of paths to rasters to summarize
        labels: band labels. must match the total number of bands for all raster_paths.
        all_touched: include all pixels that touch a polygon.
            set to False to only include pixels whose centers intersect the polygon
        mean, min, max, count, sum, stdv, skew, kurtosis, mode:
            set to True to compute these stats
        all: compute all of the above stats
        percentiles: list of 0-100 percentile ranges to compute

    Returns:
        GeoDataFrame with zonal stats for each raster band in new columns.
            If `polygons` is a GeoDataFrame, the zonal stats columns are
            appended to the original input.
    """
    # NOTE(review): several parameters (min/max/count/sum/mode/all) shadow python
    # builtins; they are part of the public interface so they are left unchanged.
    # NOTE(review): `percentiles: list = []` is a mutable default argument; it is
    # not mutated here, but confirm get_raster_stats_methods() does not mutate it.
    # NOTE(review): `all_touched` is documented but never referenced in this body —
    # verify whether read_raster_from_polygon() is supposed to receive it.

    # format the input geometries: validate, then keep only valid polygon records
    validate_gpd(polygons)
    valid_idx = validate_polygons(polygons)
    polygons = polygons.iloc[valid_idx]
    is_df = isinstance(polygons, gpd.GeoDataFrame)
    polys = polygons.geometry if is_df else polygons

    # format the input labels (one label per band across all rasters)
    raster_paths = to_iterable(raster_paths)
    labels = format_band_labels(raster_paths, labels)

    # get the bands and indexes for each covariate raster
    nbands, band_idx = get_raster_band_indexes(raster_paths)

    # get the stats methods to compute for each feature
    stats_methods = get_raster_stats_methods(
        mean=mean,
        min=min,
        max=max,
        count=count,
        sum=sum,
        stdv=stdv,
        skew=skew,
        kurtosis=kurtosis,
        mode=mode,
        percentiles=percentiles,
        all=all,
    )

    # create dataframes for each raster and concatenate at the end
    raster_dfs = []

    # run zonal stats raster-by-raster (instead of iterating first over geometries)
    for r, raster in tqdm(enumerate(raster_paths),
                          total=len(raster_paths),
                          desc="Raster",
                          **tqdm_opts):

        # format the band labels: slice this raster's band labels from the full list
        band_labels = labels[band_idx[r]:band_idx[r + 1]]
        n_raster_bands = band_idx[r + 1] - band_idx[r]
        # one column-label list per stat method, e.g. "B1_mean", "B2_mean", ...
        stats_labels = []
        for method in stats_methods:
            stats_labels.append(
                [f"{band}_{method.name}" for band in band_labels])

        # open the raster for reading
        with rio.open(raster, "r") as src:

            # reproject the polygon data as necessary
            # (the reprojected `polys` persists into later raster iterations)
            if not crs_match(polys.crs, src.crs):
                polys = polys.to_crs(src.crs)

            # create output arrays to store each stat's output
            # (each method may declare its own dtype; fall back to the raster's)
            stats_arrays = []
            for method in stats_methods:
                dtype = method.dtype or src.dtypes[0]
                stats_arrays.append(
                    np.zeros((len(polys), n_raster_bands), dtype=dtype))

            # iterate over each geometry to read data and compute stats
            for p, poly in tqdm(enumerate(polys),
                                total=len(polys),
                                desc="Polygon",
                                leave=False,
                                **tqdm_opts):
                data = read_raster_from_polygon(src, poly)
                for method, array in zip(stats_methods, stats_arrays):
                    array[p, :] = method.reduce(data)

            # convert each stat's array into dataframes and merge them together
            # (the comprehension's `labels` shadows the outer band-label list)
            stats_dfs = [
                pd.DataFrame(array, columns=labels)
                for array, labels in zip(stats_arrays, stats_labels)
            ]
            raster_dfs.append(pd.concat(stats_dfs, axis=1))

    # merge the outputs from each raster
    # NOTE(review): raster_dfs carry a fresh RangeIndex while `polygons` keeps its
    # (possibly filtered) original index — if validate_polygons() dropped records,
    # this axis=1 concat may misalign rows. confirm index handling upstream.
    if is_df:
        merged = gpd.GeoDataFrame(pd.concat([polygons] + raster_dfs, axis=1),
                                  crs=polygons.crs)
    else:
        merged = gpd.GeoDataFrame(pd.concat(raster_dfs, axis=1),
                                  geometry=polygons,
                                  crs=polygons.crs)

    return merged
def apply_model_to_rasters(
    model: BaseEstimator,
    raster_paths: list,
    output_path: str,
    resampling: rio.enums.Enum = rio.enums.Resampling.average,
    count: int = 1,
    dtype: str = "float32",
    nodata: float = -9999,
    driver: str = "GTiff",
    compress: str = "deflate",
    bigtiff: bool = True,
    template_idx: int = 0,
    windowed: bool = True,
    predict_proba: bool = False,
    ignore_sklearn: bool = True,
    **kwargs,
) -> None:
    """Applies a trained model to a list of raster datasets.

    The list and band order of the rasters must match the order of the
    covariates used to train the model. It reads each dataset block-by-block,
    applies the model, and writes gridded predictions. If the raster datasets
    are not consistent (different extents, resolutions, etc.), it will
    re-project the data on the fly, with the grid size, extent and projection
    based on a 'template' raster.

    Args:
        model: object with a model.predict() function
        raster_paths: raster paths of covariates to apply the model to
        output_path: path to the output file to create
        resampling: resampling algorithm to apply to on-the-fly reprojection
            from rasterio.enums.Resampling
        count: number of bands in the prediction output
        dtype: the output raster data type
        nodata: output nodata value
        driver: output raster format
            from rasterio.drivers.raster_driver_extensions()
        compress: compression to apply to the output file
        bigtiff: specify the output file as a bigtiff (for rasters > 2GB)
        template_idx: index of the raster file to use as a template.
            template_idx=0 sets the first raster as template
        windowed: apply the model using windowed read/write
            slower, but more memory efficient
        predict_proba: use model.predict_proba() instead of model.predict()
        ignore_sklearn: silence sklearn warning messages
        **kwargs: additional keywords to pass to model.predict()
            For MaxentModels, this would include transform="logistic"

    Returns:
        None: saves model predictions to disk.
    """
    # make sure the raster_paths are iterable
    raster_paths = to_iterable(raster_paths)

    # get and set template parameters
    windows, dst_profile = create_output_raster_profile(
        raster_paths,
        template_idx,
        count=count,
        windowed=windowed,
        nodata=nodata,
        compress=compress,
        driver=driver,
        bigtiff=bigtiff,
    )

    # get the bands and indexes for each covariate raster
    nbands, band_idx = get_raster_band_indexes(raster_paths)

    # check whether the raster paths are aligned to determine how the data are read
    aligned = check_raster_alignment(raster_paths)

    # set a dummy nodata variable if none is set
    # (actual nodata reads handled by rasterio's src.read(masked=True) method)
    nodata = nodata or 0

    # turn off sklearn warnings
    if ignore_sklearn:
        warnings.filterwarnings("ignore", category=UserWarning)

    # open all rasters to read from later. keep a handle on the raw datasets so
    # they can be closed explicitly (previously they were opened and leaked)
    datasets = [rio.open(raster_path) for raster_path in raster_paths]
    srcs = datasets

    try:
        # use warped VRT reads to align all rasters pixel-pixel if not aligned
        if not aligned:
            vrt_options = {
                "resampling": resampling,
                "transform": dst_profile["transform"],
                "crs": dst_profile["crs"],
                "height": dst_profile["height"],
                "width": dst_profile["width"],
            }
            srcs = [rio.vrt.WarpedVRT(src, **vrt_options) for src in datasets]

        # read and reproject blocks from each data source and write predictions to disk
        with rio.open(output_path, "w", **dst_profile) as dst:
            for window in tqdm(windows, desc="Window", **tqdm_opts):

                # create stacked arrays to handle multi-raster, multi-band inputs
                # that may have different nodata locations
                covariates = np.zeros((nbands, window.height, window.width),
                                      dtype=np.float32)
                nodata_idx = np.ones_like(covariates, dtype=bool)

                try:
                    for i, src in enumerate(srcs):
                        data = src.read(window=window, masked=True)
                        covariates[band_idx[i]:band_idx[i + 1]] = data
                        nodata_idx[band_idx[i]:band_idx[i + 1]] = data.mask

                        # skip blocks full of no-data
                        if data.mask.all():
                            raise NoDataException()

                    predictions = apply_model_to_array(
                        model,
                        covariates,
                        nodata,
                        nodata_idx,
                        count=count,
                        dtype=dtype,
                        predict_proba=predict_proba,
                        **kwargs,
                    )
                    dst.write(predictions, window=window)

                except NoDataException:
                    continue

    finally:
        # close the VRT wrappers (if any) and the underlying datasets so file
        # handles are released even if prediction or writing raises
        if srcs is not datasets:
            for vrt in srcs:
                vrt.close()
        for ds in datasets:
            ds.close()
def annotate_geoseries(points: gpd.GeoSeries,
                       raster_paths: list,
                       labels: list = None,
                       drop_na: bool = True,
                       dtype: str = None) -> gpd.GeoDataFrame:
    """Reads and stores pixel values from rasters using point locations.

    Args:
        points: GeoSeries with point locations.
        raster_paths: rasters to extract pixel values from.
        labels: band labels. must match the total number of bands for all raster_paths.
        drop_na: drop records with no-data values in any raster.
        dtype: output column data type. uses the first raster's dtype by default.

    Returns:
        gdf: GeoDataFrame annotated with the pixel values from each raster
    """
    # format the inputs
    raster_paths = to_iterable(raster_paths)
    labels = format_band_labels(raster_paths, labels)

    # get the dataset dimensions
    n_rasters = len(raster_paths)
    n_points = len(points)

    # create arrays and flags for updating
    raster_values = []
    valid_idxs = []
    nodata_flag = False

    # annotate each point with the pixel values for each raster
    for raster_idx, raster_path in tqdm(enumerate(raster_paths),
                                        desc="Raster",
                                        total=n_rasters,
                                        **tqdm_opts):
        with rio.open(raster_path, "r") as src:

            # reproject points to match the raster. rebind locally instead of
            # inplace=True so the caller's GeoSeries is not mutated (and because
            # GeoSeries.to_crs does not support `inplace` in modern geopandas)
            if not crs_match(points.crs, src.crs):
                points = points.to_crs(src.crs)

            # use the first raster's dtype for the output array if not set
            if raster_idx == 0 and dtype is None:
                dtype = src.dtypes[0]

            # get the x/y coordinates to sample for each point
            xys = [(point.x, point.y) for point in points]

            # read each pixel value
            samples = src.sample(xys, masked=False)

            # assign to an output array
            outarr = np.zeros((n_points, src.count), dtype=dtype)
            for idx, sample in enumerate(samples):
                outarr[idx] = sample

            # identify nodata points to remove later
            # (only band 1 is checked, matching the original behavior)
            if drop_na and src.nodata is not None:
                nodata_flag = True
                valid_idxs.append(outarr[:, 0] != src.nodata)

            raster_values.append(outarr)

    # merge the arrays from each raster
    values = np.concatenate(raster_values, axis=1, dtype=dtype)

    # drop records containing nodata: a point must be valid in *every* raster
    # (logical AND). fixes the previous np.max(), which was a logical OR and
    # kept points that were nodata in some rasters but valid in others
    if nodata_flag:
        valid = np.all(valid_idxs, axis=0)
        values = values[valid, :]
        points = points.iloc[valid]
        points.index = range(valid.sum())

    # convert to a geodataframe
    gdf = gpd.GeoDataFrame(values, geometry=points.geometry, columns=labels)

    return gdf