def open_rasterio( filename, parse_coordinates=None, chunks=None, cache=None, lock=None, masked=False, mask_and_scale=False, variable=None, group=None, default_name=None, **open_kwargs, ): """Open a file with rasterio (experimental). This should work with any file that rasterio can open (most often: geoTIFF). The x and y coordinates are generated automatically from the file's geoinformation, shifted to the center of each pixel (see `"PixelIsArea" Raster Space <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_ for more information). You can generate 2D coordinates from the file's attributes with:: from affine import Affine da = xr.open_rasterio('path_to_file.tif') transform = Affine.from_gdal(*da.attrs['transform']) nx, ny = da.sizes['x'], da.sizes['y'] x, y = np.meshgrid(np.arange(nx)+0.5, np.arange(ny)+0.5) * transform Parameters ---------- filename: str, rasterio.DatasetReader, or rasterio.WarpedVRT Path to the file to open. Or already open rasterio dataset. parse_coordinates: bool, optional Whether to parse the x and y coordinates out of the file's ``transform`` attribute or not. The default is to automatically parse the coordinates only if they are rectilinear (1D). It can be useful to set ``parse_coordinates=False`` if your files are very large or if you don't need the coordinates. chunks: int, tuple or dict, optional Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or ``{'x': 5, 'y': 5}``. If chunks is provided, it used to load the new DataArray into a dask array. Chunks can also be set to ``True`` or ``"auto"`` to choose sensible chunk sizes according to ``dask.config.get("array.chunk-size")``. cache: bool, optional If True, cache data loaded from the underlying datastore in memory as NumPy arrays when accessed to avoid reading from the underlying data- store multiple times. Defaults to True unless you specify the `chunks` argument to use dask, in which case it defaults to False. lock: False, True or threading.Lock, optional If chunks is provided, this argument is passed on to :py:func:`dask.array.from_array`. By default, a global lock is used to avoid issues with concurrent access to the same file when using dask's multithreaded backend. masked: bool, optional If True, read the mask and set values to NaN. Defaults to False. mask_and_scale: bool, optional Lazily scale (using the `scales` and `offsets` from rasterio) and mask. If the _Unsigned attribute is present treat integer arrays as unsigned. variable: str or list or tuple, optional Variable name or names to use to filter loading. group: str or list or tuple, optional Group name or names to use to filter loading. default_name: str, optional The name of the data array if none exists. Default is None. **open_kwargs: kwargs, optional Optional keyword arguments to pass into rasterio.open(). Returns ------- :obj:`xarray.Dataset` | :obj:`xarray.DataArray` | List[:obj:`xarray.Dataset`]: The newly created dataset(s). """ parse_coordinates = True if parse_coordinates is None else parse_coordinates masked = masked or mask_and_scale vrt_params = None if isinstance(filename, rasterio.io.DatasetReader): filename = filename.name elif isinstance(filename, rasterio.vrt.WarpedVRT): vrt = filename filename = vrt.src_dataset.name vrt_params = dict( src_crs=vrt.src_crs.to_string(), crs=vrt.crs.to_string(), resampling=vrt.resampling, tolerance=vrt.tolerance, src_nodata=vrt.src_nodata, nodata=vrt.nodata, width=vrt.width, height=vrt.height, src_transform=vrt.src_transform, transform=vrt.transform, dtype=vrt.working_dtype, warp_extras=vrt.warp_extras, ) if lock is None: lock = RASTERIO_LOCK # ensure default for sharing is False # ref https://github.com/mapbox/rasterio/issues/1504 open_kwargs["sharing"] = open_kwargs.get("sharing", False) with warnings.catch_warnings(record=True) as rio_warnings: manager = CachingFileManager(rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs) riods = manager.acquire() captured_warnings = rio_warnings.copy() # raise the NotGeoreferencedWarning if applicable for rio_warning in captured_warnings: if not riods.subdatasets or not isinstance(rio_warning.message, NotGeoreferencedWarning): warnings.warn(str(rio_warning.message), type(rio_warning.message)) # open the subdatasets if they exist if riods.subdatasets: return _load_subdatasets( riods=riods, group=group, variable=variable, parse_coordinates=parse_coordinates, chunks=chunks, cache=cache, lock=lock, masked=masked, mask_and_scale=mask_and_scale, ) if vrt_params is not None: riods = WarpedVRT(riods, **vrt_params) if cache is None: cache = chunks is None # Get bands if riods.count < 1: raise ValueError("Unknown dims") # parse tags & load alternate coords attrs = _get_rasterio_attrs(riods=riods) coords = _load_netcdf_1d_coords(riods.tags()) _parse_driver_tags(riods=riods, attrs=attrs, coords=coords) for coord in coords: if f"NETCDF_DIM_{coord}" in attrs: coord_name = coord attrs.pop(f"NETCDF_DIM_{coord}") break else: coord_name = "band" coords[coord_name] = np.asarray(riods.indexes) # Get geospatial coordinates transform = _rio_transform(riods) if parse_coordinates and transform.is_rectilinear: # 1d coordinates coords.update( affine_to_coords(riods.transform, riods.width, riods.height)) elif parse_coordinates: # 2d coordinates warnings.warn( "The file coordinates' transformation isn't " "rectilinear: xarray won't parse the coordinates " "in this case. Set `parse_coordinates=False` to " "suppress this warning.", RuntimeWarning, stacklevel=3, ) unsigned = False encoding = {} if mask_and_scale and "_Unsigned" in attrs: unsigned = variables.pop_to(attrs, encoding, "_Unsigned") == "true" da_name = attrs.pop("NETCDF_VARNAME", default_name) data = indexing.LazilyOuterIndexedArray( RasterioArrayWrapper( manager, lock, name=da_name, vrt_params=vrt_params, masked=masked, mask_and_scale=mask_and_scale, unsigned=unsigned, )) # this lets you write arrays loaded with rasterio data = indexing.CopyOnWriteArray(data) if cache and chunks is None: data = indexing.MemoryCachedArray(data) result = DataArray(data=data, dims=(coord_name, "y", "x"), coords=coords, attrs=attrs, name=da_name) result.encoding = encoding # update attributes from NetCDF attributess _load_netcdf_attrs(riods.tags(), result) result = _decode_datetime_cf(result) # make sure the _FillValue is correct dtype if "_FillValue" in attrs: attrs["_FillValue"] = result.dtype.type(attrs["_FillValue"]) # handle encoding if mask_and_scale: if "scale_factor" in result.attrs: variables.pop_to(result.attrs, result.encoding, "scale_factor", name=da_name) if "add_offset" in result.attrs: variables.pop_to(result.attrs, result.encoding, "add_offset", name=da_name) if masked: if "_FillValue" in result.attrs: variables.pop_to(result.attrs, result.encoding, "_FillValue", name=da_name) if "missing_value" in result.attrs: variables.pop_to(result.attrs, result.encoding, "missing_value", name=da_name) # Affine transformation matrix (always available) # This describes coefficients mapping pixel coordinates to CRS # For serialization store as tuple of 6 floats, the last row being # always (0, 0, 1) per definition (see # https://github.com/sgillies/affine) result.rio.write_transform(riods.transform, inplace=True) if hasattr(riods, "crs") and riods.crs: result.rio.write_crs(riods.crs, inplace=True) if chunks is not None: result = _prepare_dask(result, riods, filename, chunks) # Make the file closeable result._file_obj = manager return result
def open_rasterio( filename, parse_coordinates=None, chunks=None, cache=None, lock=None, masked=False, mask_and_scale=False, variable=None, group=None, default_name=None, decode_times=True, decode_timedelta=None, **open_kwargs, ): # pylint: disable=too-many-statements,too-many-locals,too-many-branches """Open a file with rasterio (experimental). This should work with any file that rasterio can open (most often: geoTIFF). The x and y coordinates are generated automatically from the file's geoinformation, shifted to the center of each pixel (see `"PixelIsArea" Raster Space <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_ for more information). Parameters ---------- filename: str, rasterio.io.DatasetReader, or rasterio.vrt.WarpedVRT Path to the file to open. Or already open rasterio dataset. parse_coordinates: bool, optional Whether to parse the x and y coordinates out of the file's ``transform`` attribute or not. The default is to automatically parse the coordinates only if they are rectilinear (1D). It can be useful to set ``parse_coordinates=False`` if your files are very large or if you don't need the coordinates. chunks: int, tuple or dict, optional Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or ``{'x': 5, 'y': 5}``. If chunks is provided, it used to load the new DataArray into a dask array. Chunks can also be set to ``True`` or ``"auto"`` to choose sensible chunk sizes according to ``dask.config.get("array.chunk-size")``. cache: bool, optional If True, cache data loaded from the underlying datastore in memory as NumPy arrays when accessed to avoid reading from the underlying data- store multiple times. Defaults to True unless you specify the `chunks` argument to use dask, in which case it defaults to False. lock: bool or dask.utils.SerializableLock, optional If chunks is provided, this argument is used to ensure that only one thread per process is reading from a rasterio file object at a time. By default and when a lock instance is provided, a :class:`xarray.backends.CachingFileManager` is used to cache File objects. Since rasterio also caches some data, this will make repeated reads from the same object fast. When ``lock=False``, no lock is used, allowing for completely parallel reads from multiple threads or processes. However, a new file handle is opened on each request. masked: bool, optional If True, read the mask and set values to NaN. Defaults to False. mask_and_scale: bool, optional Lazily scale (using the `scales` and `offsets` from rasterio) and mask. If the _Unsigned attribute is present treat integer arrays as unsigned. variable: str or list or tuple, optional Variable name or names to use to filter loading. group: str or list or tuple, optional Group name or names to use to filter loading. default_name: str, optional The name of the data array if none exists. Default is None. decode_times: bool, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers. decode_timedelta: bool, optional If True, decode variables and coordinates with time units in {“days”, “hours”, “minutes”, “seconds”, “milliseconds”, “microseconds”} into timedelta objects. If False, leave them encoded as numbers. If None (default), assume the same value of decode_time. **open_kwargs: kwargs, optional Optional keyword arguments to pass into rasterio.open(). Returns ------- :obj:`xarray.Dataset` | :obj:`xarray.DataArray` | List[:obj:`xarray.Dataset`]: The newly created dataset(s). """ parse_coordinates = True if parse_coordinates is None else parse_coordinates masked = masked or mask_and_scale vrt_params = None if isinstance(filename, rasterio.io.DatasetReader): filename = filename.name elif isinstance(filename, rasterio.vrt.WarpedVRT): vrt = filename filename = vrt.src_dataset.name vrt_params = dict( src_crs=vrt.src_crs.to_string() if vrt.src_crs else None, crs=vrt.crs.to_string() if vrt.crs else None, resampling=vrt.resampling, tolerance=vrt.tolerance, src_nodata=vrt.src_nodata, nodata=vrt.nodata, width=vrt.width, height=vrt.height, src_transform=vrt.src_transform, transform=vrt.transform, dtype=vrt.working_dtype, warp_extras=vrt.warp_extras, ) if lock in (True, None): lock = RASTERIO_LOCK elif lock is False: lock = NO_LOCK # ensure default for sharing is False # ref https://github.com/mapbox/rasterio/issues/1504 open_kwargs["sharing"] = open_kwargs.get("sharing", False) with warnings.catch_warnings(record=True) as rio_warnings: if lock is not NO_LOCK: manager = CachingFileManager( rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs ) else: manager = URIManager(rasterio.open, filename, mode="r", kwargs=open_kwargs) riods = manager.acquire() captured_warnings = rio_warnings.copy() # raise the NotGeoreferencedWarning if applicable for rio_warning in captured_warnings: if not riods.subdatasets or not isinstance( rio_warning.message, NotGeoreferencedWarning ): warnings.warn(str(rio_warning.message), type(rio_warning.message)) # open the subdatasets if they exist if riods.subdatasets: return _load_subdatasets( riods=riods, group=group, variable=variable, parse_coordinates=parse_coordinates, chunks=chunks, cache=cache, lock=lock, masked=masked, mask_and_scale=mask_and_scale, decode_times=decode_times, decode_timedelta=decode_timedelta, **open_kwargs, ) if vrt_params is not None: riods = WarpedVRT(riods, **vrt_params) if cache is None: cache = chunks is None # Get bands if riods.count < 1: raise ValueError("Unknown dims") # parse tags & load alternate coords attrs = _get_rasterio_attrs(riods=riods) coords = _load_netcdf_1d_coords(riods.tags()) _parse_driver_tags(riods=riods, attrs=attrs, coords=coords) for coord in coords: if f"NETCDF_DIM_{coord}" in attrs: coord_name = coord attrs.pop(f"NETCDF_DIM_{coord}") break else: coord_name = "band" coords[coord_name] = np.asarray(riods.indexes) has_gcps = riods.gcps[0] if has_gcps: parse_coordinates = False # Get geospatial coordinates if parse_coordinates: coords.update( _generate_spatial_coords(_rio_transform(riods), riods.width, riods.height) ) unsigned = False encoding = {} if mask_and_scale and "_Unsigned" in attrs: unsigned = variables.pop_to(attrs, encoding, "_Unsigned") == "true" if masked: encoding["dtype"] = str(_rasterio_to_numpy_dtype(riods.dtypes)) da_name = attrs.pop("NETCDF_VARNAME", default_name) data = indexing.LazilyOuterIndexedArray( RasterioArrayWrapper( manager, lock, name=da_name, vrt_params=vrt_params, masked=masked, mask_and_scale=mask_and_scale, unsigned=unsigned, ) ) # this lets you write arrays loaded with rasterio data = indexing.CopyOnWriteArray(data) if cache and chunks is None: data = indexing.MemoryCachedArray(data) result = DataArray( data=data, dims=(coord_name, "y", "x"), coords=coords, attrs=attrs, name=da_name ) result.encoding = encoding # update attributes from NetCDF attributess _load_netcdf_attrs(riods.tags(), result) result = _decode_datetime_cf( result, decode_times=decode_times, decode_timedelta=decode_timedelta ) # make sure the _FillValue is correct dtype if "_FillValue" in attrs: attrs["_FillValue"] = result.dtype.type(attrs["_FillValue"]) # handle encoding _handle_encoding(result, mask_and_scale, masked, da_name) # Affine transformation matrix (always available) # This describes coefficients mapping pixel coordinates to CRS # For serialization store as tuple of 6 floats, the last row being # always (0, 0, 1) per definition (see # https://github.com/sgillies/affine) result.rio.write_transform(_rio_transform(riods), inplace=True) if riods.crs: result.rio.write_crs(riods.crs, inplace=True) if has_gcps: result.rio.write_gcps(*riods.gcps, inplace=True) if chunks is not None: result = _prepare_dask(result, riods, filename, chunks) # Make the file closeable result.set_close(manager.close) result.rio._manager = manager # add file path to encoding result.encoding["source"] = riods.name result.encoding["rasterio_dtype"] = str(riods.dtypes[0]) return result