Exemple #1
0
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    masked=False,
    mask_and_scale=False,
    variable=None,
    group=None,
    default_name=None,
    **open_kwargs,
):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    You can generate 2D coordinates from the file's attributes with::

        from affine import Affine
        da = xr.open_rasterio('path_to_file.tif')
        transform = Affine.from_gdal(*da.attrs['transform'])
        nx, ny = da.sizes['x'], da.sizes['y']
        x, y = np.meshgrid(np.arange(nx)+0.5, np.arange(ny)+0.5) * transform


    Parameters
    ----------
    filename: str, rasterio.DatasetReader, or rasterio.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates: bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks: int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it used to load the new
        DataArray into a dask array. Chunks can also be set to
        ``True`` or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size")``.
    cache: bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock: False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is
        used to avoid issues with concurrent access to the same file when using
        dask's multithreaded backend.
    masked: bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    mask_and_scale: bool, optional
        Lazily scale (using the `scales` and `offsets` from rasterio) and mask.
        If the _Unsigned attribute is present treat integer arrays as unsigned.
    variable: str or list or tuple, optional
        Variable name or names to use to filter loading.
    group: str or list or tuple, optional
        Group name or names to use to filter loading.
    default_name: str, optional
        The name of the data array if none exists. Default is None.
    **open_kwargs: kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    :obj:`xarray.Dataset` | :obj:`xarray.DataArray` | List[:obj:`xarray.Dataset`]:
        The newly created dataset(s).
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates
    masked = masked or mask_and_scale
    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            src_crs=vrt.src_crs.to_string(),
            crs=vrt.crs.to_string(),
            resampling=vrt.resampling,
            tolerance=vrt.tolerance,
            src_nodata=vrt.src_nodata,
            nodata=vrt.nodata,
            width=vrt.width,
            height=vrt.height,
            src_transform=vrt.src_transform,
            transform=vrt.transform,
            dtype=vrt.working_dtype,
            warp_extras=vrt.warp_extras,
        )

    if lock is None:
        lock = RASTERIO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)
    with warnings.catch_warnings(record=True) as rio_warnings:
        manager = CachingFileManager(rasterio.open,
                                     filename,
                                     lock=lock,
                                     mode="r",
                                     kwargs=open_kwargs)
        riods = manager.acquire()
        captured_warnings = rio_warnings.copy()
    # raise the NotGeoreferencedWarning if applicable
    for rio_warning in captured_warnings:
        if not riods.subdatasets or not isinstance(rio_warning.message,
                                                   NotGeoreferencedWarning):
            warnings.warn(str(rio_warning.message), type(rio_warning.message))

    # open the subdatasets if they exist
    if riods.subdatasets:
        return _load_subdatasets(
            riods=riods,
            group=group,
            variable=variable,
            parse_coordinates=parse_coordinates,
            chunks=chunks,
            cache=cache,
            lock=lock,
            masked=masked,
            mask_and_scale=mask_and_scale,
        )

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")

    # parse tags & load alternate coords
    attrs = _get_rasterio_attrs(riods=riods)
    coords = _load_netcdf_1d_coords(riods.tags())
    _parse_driver_tags(riods=riods, attrs=attrs, coords=coords)
    for coord in coords:
        if f"NETCDF_DIM_{coord}" in attrs:
            coord_name = coord
            attrs.pop(f"NETCDF_DIM_{coord}")
            break
    else:
        coord_name = "band"
        coords[coord_name] = np.asarray(riods.indexes)

    # Get geospatial coordinates
    transform = _rio_transform(riods)
    if parse_coordinates and transform.is_rectilinear:
        # 1d coordinates
        coords.update(
            affine_to_coords(riods.transform, riods.width, riods.height))
    elif parse_coordinates:
        # 2d coordinates
        warnings.warn(
            "The file coordinates' transformation isn't "
            "rectilinear: xarray won't parse the coordinates "
            "in this case. Set `parse_coordinates=False` to "
            "suppress this warning.",
            RuntimeWarning,
            stacklevel=3,
        )

    unsigned = False
    encoding = {}
    if mask_and_scale and "_Unsigned" in attrs:
        unsigned = variables.pop_to(attrs, encoding, "_Unsigned") == "true"

    da_name = attrs.pop("NETCDF_VARNAME", default_name)
    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(
            manager,
            lock,
            name=da_name,
            vrt_params=vrt_params,
            masked=masked,
            mask_and_scale=mask_and_scale,
            unsigned=unsigned,
        ))

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    result = DataArray(data=data,
                       dims=(coord_name, "y", "x"),
                       coords=coords,
                       attrs=attrs,
                       name=da_name)
    result.encoding = encoding

    # update attributes from NetCDF attributess
    _load_netcdf_attrs(riods.tags(), result)
    result = _decode_datetime_cf(result)

    # make sure the _FillValue is correct dtype
    if "_FillValue" in attrs:
        attrs["_FillValue"] = result.dtype.type(attrs["_FillValue"])

    # handle encoding
    if mask_and_scale:
        if "scale_factor" in result.attrs:
            variables.pop_to(result.attrs,
                             result.encoding,
                             "scale_factor",
                             name=da_name)
        if "add_offset" in result.attrs:
            variables.pop_to(result.attrs,
                             result.encoding,
                             "add_offset",
                             name=da_name)
    if masked:
        if "_FillValue" in result.attrs:
            variables.pop_to(result.attrs,
                             result.encoding,
                             "_FillValue",
                             name=da_name)
        if "missing_value" in result.attrs:
            variables.pop_to(result.attrs,
                             result.encoding,
                             "missing_value",
                             name=da_name)

    # Affine transformation matrix (always available)
    # This describes coefficients mapping pixel coordinates to CRS
    # For serialization store as tuple of 6 floats, the last row being
    # always (0, 0, 1) per definition (see
    # https://github.com/sgillies/affine)
    result.rio.write_transform(riods.transform, inplace=True)
    if hasattr(riods, "crs") and riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)

    if chunks is not None:
        result = _prepare_dask(result, riods, filename, chunks)

    # Make the file closeable
    result._file_obj = manager

    return result
Exemple #2
0
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    masked=False,
    mask_and_scale=False,
    variable=None,
    group=None,
    default_name=None,
    decode_times=True,
    decode_timedelta=None,
    **open_kwargs,
):
    # pylint: disable=too-many-statements,too-many-locals,too-many-branches
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    Parameters
    ----------
    filename: str, rasterio.io.DatasetReader, or rasterio.vrt.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates: bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks: int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it used to load the new
        DataArray into a dask array. Chunks can also be set to
        ``True`` or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size")``.
    cache: bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock: bool or dask.utils.SerializableLock, optional

        If chunks is provided, this argument is used to ensure that only one
        thread per process is reading from a rasterio file object at a time.

        By default and when a lock instance is provided,
        a :class:`xarray.backends.CachingFileManager` is used to cache File objects.
        Since rasterio also caches some data, this will make repeated reads from the
        same object fast.

        When ``lock=False``, no lock is used, allowing for completely parallel reads
        from multiple threads or processes. However, a new file handle is opened on
        each request.

    masked: bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    mask_and_scale: bool, optional
        Lazily scale (using the `scales` and `offsets` from rasterio) and mask.
        If the _Unsigned attribute is present treat integer arrays as unsigned.
    variable: str or list or tuple, optional
        Variable name or names to use to filter loading.
    group: str or list or tuple, optional
        Group name or names to use to filter loading.
    default_name: str, optional
        The name of the data array if none exists. Default is None.
    decode_times: bool, optional
        If True, decode times encoded in the standard NetCDF datetime format
        into datetime objects. Otherwise, leave them encoded as numbers.
    decode_timedelta: bool, optional
        If True, decode variables and coordinates with time units in
        {“days”, “hours”, “minutes”, “seconds”, “milliseconds”, “microseconds”}
        into timedelta objects. If False, leave them encoded as numbers.
        If None (default), assume the same value of decode_time.
    **open_kwargs: kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    :obj:`xarray.Dataset` | :obj:`xarray.DataArray` | List[:obj:`xarray.Dataset`]:
        The newly created dataset(s).
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates
    masked = masked or mask_and_scale
    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            src_crs=vrt.src_crs.to_string() if vrt.src_crs else None,
            crs=vrt.crs.to_string() if vrt.crs else None,
            resampling=vrt.resampling,
            tolerance=vrt.tolerance,
            src_nodata=vrt.src_nodata,
            nodata=vrt.nodata,
            width=vrt.width,
            height=vrt.height,
            src_transform=vrt.src_transform,
            transform=vrt.transform,
            dtype=vrt.working_dtype,
            warp_extras=vrt.warp_extras,
        )

    if lock in (True, None):
        lock = RASTERIO_LOCK
    elif lock is False:
        lock = NO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)

    with warnings.catch_warnings(record=True) as rio_warnings:
        if lock is not NO_LOCK:
            manager = CachingFileManager(
                rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs
            )
        else:
            manager = URIManager(rasterio.open, filename, mode="r", kwargs=open_kwargs)
        riods = manager.acquire()
        captured_warnings = rio_warnings.copy()

    # raise the NotGeoreferencedWarning if applicable
    for rio_warning in captured_warnings:
        if not riods.subdatasets or not isinstance(
            rio_warning.message, NotGeoreferencedWarning
        ):
            warnings.warn(str(rio_warning.message), type(rio_warning.message))

    # open the subdatasets if they exist
    if riods.subdatasets:
        return _load_subdatasets(
            riods=riods,
            group=group,
            variable=variable,
            parse_coordinates=parse_coordinates,
            chunks=chunks,
            cache=cache,
            lock=lock,
            masked=masked,
            mask_and_scale=mask_and_scale,
            decode_times=decode_times,
            decode_timedelta=decode_timedelta,
            **open_kwargs,
        )

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")

    # parse tags & load alternate coords
    attrs = _get_rasterio_attrs(riods=riods)
    coords = _load_netcdf_1d_coords(riods.tags())
    _parse_driver_tags(riods=riods, attrs=attrs, coords=coords)
    for coord in coords:
        if f"NETCDF_DIM_{coord}" in attrs:
            coord_name = coord
            attrs.pop(f"NETCDF_DIM_{coord}")
            break
    else:
        coord_name = "band"
        coords[coord_name] = np.asarray(riods.indexes)

    has_gcps = riods.gcps[0]
    if has_gcps:
        parse_coordinates = False

    # Get geospatial coordinates
    if parse_coordinates:
        coords.update(
            _generate_spatial_coords(_rio_transform(riods), riods.width, riods.height)
        )

    unsigned = False
    encoding = {}
    if mask_and_scale and "_Unsigned" in attrs:
        unsigned = variables.pop_to(attrs, encoding, "_Unsigned") == "true"

    if masked:
        encoding["dtype"] = str(_rasterio_to_numpy_dtype(riods.dtypes))

    da_name = attrs.pop("NETCDF_VARNAME", default_name)
    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(
            manager,
            lock,
            name=da_name,
            vrt_params=vrt_params,
            masked=masked,
            mask_and_scale=mask_and_scale,
            unsigned=unsigned,
        )
    )

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    result = DataArray(
        data=data, dims=(coord_name, "y", "x"), coords=coords, attrs=attrs, name=da_name
    )
    result.encoding = encoding

    # update attributes from NetCDF attributess
    _load_netcdf_attrs(riods.tags(), result)
    result = _decode_datetime_cf(
        result, decode_times=decode_times, decode_timedelta=decode_timedelta
    )

    # make sure the _FillValue is correct dtype
    if "_FillValue" in attrs:
        attrs["_FillValue"] = result.dtype.type(attrs["_FillValue"])

    # handle encoding
    _handle_encoding(result, mask_and_scale, masked, da_name)
    # Affine transformation matrix (always available)
    # This describes coefficients mapping pixel coordinates to CRS
    # For serialization store as tuple of 6 floats, the last row being
    # always (0, 0, 1) per definition (see
    # https://github.com/sgillies/affine)
    result.rio.write_transform(_rio_transform(riods), inplace=True)
    if riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)
    if has_gcps:
        result.rio.write_gcps(*riods.gcps, inplace=True)

    if chunks is not None:
        result = _prepare_dask(result, riods, filename, chunks)

    # Make the file closeable
    result.set_close(manager.close)
    result.rio._manager = manager
    # add file path to encoding
    result.encoding["source"] = riods.name
    result.encoding["rasterio_dtype"] = str(riods.dtypes[0])
    return result