Esempio n. 1
0
def annotate_vector(
    vector_path: str,
    raster_paths: list,
    labels: list = None,
    drop_na: bool = True,
) -> gpd.GeoDataFrame:
    """Sample raster pixel values at the point locations stored in a vector file.

    The vector file is read into a GeoDataFrame, its geometries are passed to
    `annotate_geoseries`, and the sampled values are appended to the vector's
    attribute table as new columns.

    Args:
        vector_path: path to a vector file (shp, geojson, etc)
        raster_paths: raster paths to extract pixel values from
        labels: band name labels. should match the total number of bands
            across all raster_paths
        drop_na: forwarded to `annotate_geoseries`, which drops records with
            no-data values

    Returns:
        GeoDataFrame holding the vector's original columns plus one column
        per raster band.
    """
    # normalize the raster list and resolve the band labels before sampling
    rasters = to_iterable(raster_paths)
    band_labels = format_band_labels(rasters, labels)

    vector = gpd.read_file(vector_path)
    sampled = annotate_geoseries(vector.geometry, rasters, band_labels, drop_na)

    # the vector table already carries the geometry column, so only the
    # sampled pixel values are appended
    # NOTE(review): the concat aligns on index labels; if records were dropped
    # by annotate_geoseries the two frames may no longer line up -- confirm
    pixel_columns = sampled.drop(["geometry"], axis=1, errors="ignore")
    return pd.concat([vector, pixel_columns], axis=1)
Esempio n. 2
0
def annotate(
    points: Union[str, gpd.GeoSeries, gpd.GeoDataFrame],
    raster_paths: Union[str, list],
    labels: list = None,
    drop_na: bool = True,
):
    """Sample raster values at point locations, dispatching on the input type.

    Args:
        points: one of
            - a GeoSeries with point locations (e.g., gdf['geometry']),
            - a GeoDataFrame (or DataFrame exposing `.geometry`) with point
              locations, or
            - a path to a point-format vector file.
        raster_paths: raster paths to extract pixel values from.
        labels: band name labels. number of labels should match the
            total number of bands across all raster_paths.
        drop_na: drop all records with no-data values.

    Returns:
        GeoDataFrame annotated with the pixel values from each raster. For
        dataframe input, the new columns are appended to the input columns.

    Raises:
        TypeError: if `points` is not a GeoSeries, a dataframe, or a path to
            an existing file.
    """
    # normalize the raster list and resolve the band labels once, up front
    rasters = to_iterable(raster_paths)
    band_labels = format_band_labels(rasters, labels)

    # a bare series of geometries: the sampled frame is the result
    if isinstance(points, gpd.GeoSeries):
        return annotate_geoseries(
            points,
            rasters,
            labels=band_labels,
            drop_na=drop_na,
        )

    # tabular input: sample at its geometries, then append to the input table
    if isinstance(points, (gpd.GeoDataFrame, pd.DataFrame)):
        sampled = annotate_geoseries(
            points.geometry,
            rasters,
            labels=band_labels,
            drop_na=drop_na,
        )
        pixel_columns = sampled.drop(["geometry"], axis=1, errors="ignore")
        return pd.concat([points, pixel_columns], axis=1)

    # anything else has to be a path to an existing vector file
    if os.path.isfile(points):
        return annotate_vector(
            points,
            rasters,
            labels=band_labels,
            drop_na=drop_na,
        )

    raise TypeError(
        "points arg must be a valid path, GeoDataFrame, or GeoSeries")
Esempio n. 3
0
def xy_to_geoseries(x: Union[float, list, np.ndarray],
                    y: Union[float, list, np.ndarray],
                    crs: CRSType = "epsg:4326") -> gpd.GeoSeries:
    """Converts x/y data into a geopandas geoseries.

    Args:
        x: 1-D array-like of x location values (or a single value)
        y: 1-D array-like of y location values (or a single value)
        crs: coordinate reference system. accepts pyproj.CRS / rio.crs.CRS objects
            or anything allowed by pyproj.CRS.from_user_input().
            Defaults to WGS84 longitude/latitude (EPSG:4326).

    Returns:
        gs: Point geometry geoseries, one point per x/y pair
    """
    # handle single x/y location values
    xs = to_iterable(x)
    ys = to_iterable(y)

    # distinct loop names so the input sequences are not rebound mid-iteration
    points = [Point(x_i, y_i) for x_i, y_i in zip(xs, ys)]
    gs = gpd.GeoSeries(points, crs=crs)

    return gs
Esempio n. 4
0
def zonal_stats(
    polygons: Union[gpd.GeoSeries, gpd.GeoDataFrame],
    raster_paths: list,
    labels: list = None,
    all_touched: bool = True,
    mean: bool = True,
    stdv: bool = True,
    min: bool = False,
    max: bool = False,
    count: bool = False,
    sum: bool = False,
    skew: bool = False,
    kurtosis: bool = False,
    mode: bool = False,
    all: bool = False,
    percentiles: list = None,
) -> gpd.GeoDataFrame:
    """Compute raster summary stats for each polygon in a GeoSeries or GeoDataFrame.

    Rasters are processed one at a time; for each raster every polygon is read
    and reduced by each requested statistic. The per-raster results are then
    joined column-wise, keyed on the index of the (validated) input polygons.

    Args:
        polygons: GeoSeries or GeoDataFrame with polygon geometries.
        raster_paths: list of paths to rasters to summarize
        labels: band labels. must match the total number of bands for all raster_paths.
        all_touched: include all pixels that touch a polygon.
            set to False to only include pixels whose centers intersect the polygon
        mean, min, max, count, sum, stdv, skew, kurtosis, mode:
            set to True to compute these stats
        all: compute all of the above stats
        percentiles: list of 0-100 percentile ranges to compute.
            None (the default) is treated as an empty list.

    Returns:
        GeoDataFrame with zonal stats for each raster band in new columns
            named "{band label}_{stat name}". If `polygons` is a GeoDataFrame,
            the zonal stats columns are appended to the original input.
    """
    # NOTE: the `min`, `max`, `sum` and `all` parameters shadow the builtins
    # of the same name inside this function, so those builtins are not used.

    # NOTE(review): `all_touched` is accepted but not forwarded to any call in
    # this function -- confirm whether read_raster_from_polygon should get it.

    # build a fresh list per call instead of sharing one mutable default
    if percentiles is None:
        percentiles = []

    # format the input geometries
    validate_gpd(polygons)
    valid_idx = validate_polygons(polygons)
    polygons = polygons.iloc[valid_idx]
    is_df = isinstance(polygons, gpd.GeoDataFrame)
    polys = polygons.geometry if is_df else polygons

    # format the input labels
    raster_paths = to_iterable(raster_paths)
    labels = format_band_labels(raster_paths, labels)

    # get the bands and indexes for each covariate raster
    nbands, band_idx = get_raster_band_indexes(raster_paths)

    # get the stats methods to compute for each feature
    stats_methods = get_raster_stats_methods(
        mean=mean,
        min=min,
        max=max,
        count=count,
        sum=sum,
        stdv=stdv,
        skew=skew,
        kurtosis=kurtosis,
        mode=mode,
        percentiles=percentiles,
        all=all,
    )

    # create dataframes for each raster and concatenate at the end
    raster_dfs = []

    # run zonal stats raster-by-raster (instead of iterating first over geometries)
    for r, raster in tqdm(enumerate(raster_paths),
                          total=len(raster_paths),
                          desc="Raster",
                          **tqdm_opts):

        # slice out this raster's band labels and build one column name
        # per (stat, band) pair
        band_labels = labels[band_idx[r]:band_idx[r + 1]]
        n_raster_bands = band_idx[r + 1] - band_idx[r]
        stats_labels = [
            [f"{band}_{method.name}" for band in band_labels]
            for method in stats_methods
        ]

        # open the raster for reading
        with rio.open(raster, "r") as src:

            # reproject the polygon data as necessary; only the working copy
            # is reprojected, the output keeps the input geometries
            if not crs_match(polys.crs, src.crs):
                polys = polys.to_crs(src.crs)

            # create output arrays to store each stat's output; a stat without
            # its own dtype falls back to the dtype of the raster's first band
            stats_arrays = []
            for method in stats_methods:
                dtype = method.dtype or src.dtypes[0]
                stats_arrays.append(
                    np.zeros((len(polys), n_raster_bands), dtype=dtype))

            # iterate over each geometry to read data and compute stats
            for p, poly in tqdm(enumerate(polys),
                                total=len(polys),
                                desc="Polygon",
                                leave=False,
                                **tqdm_opts):
                data = read_raster_from_polygon(src, poly)
                for method, array in zip(stats_methods, stats_arrays):
                    array[p, :] = method.reduce(data)

        # convert each stat's array into dataframes and merge them together.
        # the frames reuse the polygon index: rows were filled in polygon
        # order, and the column-wise concat / geometry assignment below align
        # on index labels, which may not be 0..n-1 after the validity filter
        stats_dfs = [
            pd.DataFrame(array, columns=columns, index=polygons.index)
            for array, columns in zip(stats_arrays, stats_labels)
        ]
        raster_dfs.append(pd.concat(stats_dfs, axis=1))

    # merge the outputs from each raster
    if is_df:
        merged = gpd.GeoDataFrame(pd.concat([polygons] + raster_dfs, axis=1),
                                  crs=polygons.crs)
    else:
        merged = gpd.GeoDataFrame(pd.concat(raster_dfs, axis=1),
                                  geometry=polygons,
                                  crs=polygons.crs)

    return merged
Esempio n. 5
0
def apply_model_to_rasters(
    model: BaseEstimator,
    raster_paths: list,
    output_path: str,
    resampling: rio.enums.Enum = rio.enums.Resampling.average,
    count: int = 1,
    dtype: str = "float32",
    nodata: float = -9999,
    driver: str = "GTiff",
    compress: str = "deflate",
    bigtiff: bool = True,
    template_idx: int = 0,
    windowed: bool = True,
    predict_proba: bool = False,
    ignore_sklearn: bool = True,
    **kwargs,
) -> None:
    """Applies a trained model to a list of raster datasets.

    The list and band order of the rasters must match the order of the covariates
    used to train the model. It reads each dataset block-by-block, applies
    the model, and writes gridded predictions. If the raster datasets are not
    consistent (different extents, resolutions, etc.), it will re-project the data
    on the fly, with the grid size, extent and projection based on a 'template'
    raster.

    All input datasets (and any warped VRTs wrapping them) are closed when the
    function returns or raises.

    Args:
        model: object with a model.predict() function
        raster_paths: raster paths of covariates to apply the model to
        output_path: path to the output file to create
        resampling: resampling algorithm to apply to on-the-fly reprojection
            from rasterio.enums.Resampling
        count: number of bands in the prediction output
        dtype: the output raster data type
        nodata: output nodata value
        driver: output raster format
            from rasterio.drivers.raster_driver_extensions()
        compress: compression to apply to the output file
        bigtiff: specify the output file as a bigtiff (for rasters > 2GB)
        template_idx: index of the raster file to use as a template.
            template_idx=0 sets the first raster as template
        windowed: apply the model using windowed read/write
            slower, but more memory efficient
        predict_proba: use model.predict_proba() instead of model.predict()
        ignore_sklearn: silence sklearn warning messages
        **kwargs: additional keywords to pass to model.predict()
            For MaxentModels, this would include transform="logistic"

    Returns:
        None: saves model predictions to disk.
    """
    # local import: the file's import block is not visible from this section
    from contextlib import ExitStack

    # make sure the raster_paths are iterable
    raster_paths = to_iterable(raster_paths)

    # get and set template parameters
    windows, dst_profile = create_output_raster_profile(
        raster_paths,
        template_idx,
        count=count,
        windowed=windowed,
        nodata=nodata,
        compress=compress,
        driver=driver,
        bigtiff=bigtiff,
    )

    # get the bands and indexes for each covariate raster
    nbands, band_idx = get_raster_band_indexes(raster_paths)

    # check whether the raster paths are aligned to determine how the data are read
    aligned = check_raster_alignment(raster_paths)

    # set a dummy nodata variable if none is set
    # (actual nodata reads handled by rasterio's src.read(masked=True) method)
    nodata = nodata or 0

    # turn off sklearn warnings
    # NOTE: this installs a process-wide filter for every UserWarning, and the
    # filter stays in place after this function returns
    if ignore_sklearn:
        warnings.filterwarnings("ignore", category=UserWarning)

    # the exit stack owns every input dataset so that all of them are closed
    # on exit, including when reading, predicting or writing raises
    with ExitStack() as stack:

        # open all rasters to read from later
        srcs = [
            stack.enter_context(rio.open(raster_path))
            for raster_path in raster_paths
        ]

        # use warped VRT reads to align all rasters pixel-pixel if not aligned
        if not aligned:
            vrt_options = {
                "resampling": resampling,
                "transform": dst_profile["transform"],
                "crs": dst_profile["crs"],
                "height": dst_profile["height"],
                "width": dst_profile["width"],
            }
            # registered after the datasets they wrap, so the stack closes the
            # VRTs first (last-in, first-out)
            srcs = [
                stack.enter_context(rio.vrt.WarpedVRT(src, **vrt_options))
                for src in srcs
            ]

        # read and reproject blocks from each data source and write predictions to disk
        with rio.open(output_path, "w", **dst_profile) as dst:
            for window in tqdm(windows, desc="Window", **tqdm_opts):

                # create stacked arrays to handle multi-raster, multi-band inputs
                # that may have different nodata locations
                covariates = np.zeros((nbands, window.height, window.width),
                                      dtype=np.float32)
                nodata_idx = np.ones_like(covariates, dtype=bool)

                try:
                    for i, src in enumerate(srcs):
                        data = src.read(window=window, masked=True)
                        covariates[band_idx[i]:band_idx[i + 1]] = data
                        nodata_idx[band_idx[i]:band_idx[i + 1]] = data.mask

                        # skip blocks full of no-data: one fully-masked source
                        # is enough to leave this window unwritten
                        if data.mask.all():
                            raise NoDataException()

                    predictions = apply_model_to_array(
                        model,
                        covariates,
                        nodata,
                        nodata_idx,
                        count=count,
                        dtype=dtype,
                        predict_proba=predict_proba,
                        **kwargs,
                    )
                    dst.write(predictions, window=window)

                except NoDataException:
                    continue
Esempio n. 6
0
def annotate_geoseries(points: gpd.GeoSeries,
                       raster_paths: list,
                       labels: list = None,
                       drop_na: bool = True,
                       dtype: str = None) -> gpd.GeoDataFrame:
    """Reads and stores pixel values from rasters using point locations.

    The input `points` object is never modified; reprojection and index
    changes are applied to local copies.

    Args:
        points: GeoSeries with point locations.
        raster_paths: rasters to extract pixel values from.
        labels: band labels. must match the total number of bands for all raster_paths.
        drop_na: drop records with no-data values. a record is dropped when the
            first band of any raster that defines a nodata value equals that
            value at the point's location.
        dtype: output column data type. uses the first raster's dtype by default.

    Returns:
        gdf: GeoDataFrame annotated with the pixel values from each raster,
            with one column per band and a 0..n-1 index. geometries are in the
            CRS the points were last reprojected to (or their input CRS if no
            reprojection was needed).
    """
    # format the inputs
    raster_paths = to_iterable(raster_paths)
    labels = format_band_labels(raster_paths, labels)

    # get the dataset dimensions
    n_rasters = len(raster_paths)
    n_points = len(points)

    # per-raster sampled values and per-raster validity masks
    raster_values = []
    valid_idxs = []

    # annotate each point with the pixel values for each raster
    for raster_idx, raster_path in tqdm(enumerate(raster_paths),
                                        desc="Raster",
                                        total=n_rasters,
                                        **tqdm_opts):
        with rio.open(raster_path, "r") as src:

            # reproject points to match the raster. GeoSeries.to_crs returns a
            # new series, so rebind the local name instead of asking for an
            # in-place update; this also leaves the caller's data untouched
            if not crs_match(points.crs, src.crs):
                points = points.to_crs(src.crs)

            # use the first rasters dtype for the output array if not set
            if raster_idx == 0 and dtype is None:
                dtype = src.dtypes[0]

            # collect the x/y coordinate pair of every point
            xys = [(point.x, point.y) for point in points]

            # read each pixel value
            samples = src.sample(xys, masked=False)

            # assign to an output array, one row per point and one column per band
            outarr = np.zeros((n_points, src.count), dtype=dtype)
            for idx, sample in enumerate(samples):
                outarr[idx] = sample

            # identify nodata points to remove later
            # (only the first band is compared against the nodata value)
            if drop_na and src.nodata is not None:
                valid_idxs.append(outarr[:, 0] != src.nodata)

            raster_values.append(outarr)

    # merge the arrays from each raster
    values = np.concatenate(raster_values, axis=1, dtype=dtype)

    # `values` is positional, so give the geometries a matching 0..n-1 index;
    # otherwise the GeoDataFrame would pair them up by index label
    points = points.reset_index(drop=True)

    if valid_idxs:
        # keep a record only if it is valid in every raster that was checked
        valid = np.all(valid_idxs, axis=0)
        values = values[valid, :]
        points = points.iloc[valid].reset_index(drop=True)

    # convert to a geodataframe
    gdf = gpd.GeoDataFrame(values, geometry=points.geometry, columns=labels)

    return gdf