def test_lazily_indexed_array(self):
    original = np.random.rand(10, 20, 30)
    x = indexing.NumpyIndexingAdapter(original)
    v = Variable(['i', 'j', 'k'], original)
    lazy = indexing.LazilyOuterIndexedArray(x)
    v_lazy = Variable(['i', 'j', 'k'], lazy)
    I = ReturnItem()  # noqa: E741  # allow ambiguous name

    # test orthogonally applied indexers
    indexers = [I[:], 0, -2, I[:3], [0, 1, 2, 3], [0], np.arange(10) < 5]
    for i in indexers:
        for j in indexers:
            for k in indexers:
                if isinstance(j, np.ndarray) and j.dtype.kind == 'b':
                    j = np.arange(20) < 5
                if isinstance(k, np.ndarray) and k.dtype.kind == 'b':
                    k = np.arange(30) < 5
                expected = np.asarray(v[i, j, k])
                for actual in [
                        v_lazy[i, j, k],
                        v_lazy[:, j, k][i],
                        v_lazy[:, :, k][:, j][i]]:
                    assert expected.shape == actual.shape
                    assert_array_equal(expected, actual)
                    assert isinstance(actual._data,
                                      indexing.LazilyOuterIndexedArray)

                    # make sure actual.key is appropriate type
                    if all(isinstance(k, (int, slice))
                           for k in v_lazy._data.key.tuple):
                        assert isinstance(v_lazy._data.key,
                                          indexing.BasicIndexer)
                    else:
                        assert isinstance(v_lazy._data.key,
                                          indexing.OuterIndexer)

    # test sequentially applied indexers
    indexers = [(3, 2), (I[:], 0), (I[:2], -1), (I[:4], [0]), ([4, 5], 0),
                ([0, 1, 2], [0, 1]), ([0, 3, 5], I[:2])]
    for i, j in indexers:
        expected = v[i][j]
        actual = v_lazy[i][j]
        assert expected.shape == actual.shape
        assert_array_equal(expected, actual)

        # test transpose
        if actual.ndim > 1:
            order = np.array(actual.dims)  # transpose by named dims
            transposed = actual.transpose(*order)
            assert_array_equal(expected.transpose(*order), transposed)
            assert isinstance(actual._data,
                              (indexing.LazilyVectorizedIndexedArray,
                               indexing.LazilyOuterIndexedArray))
        assert isinstance(actual._data, indexing.LazilyOuterIndexedArray)
        assert isinstance(actual._data.array, indexing.NumpyIndexingAdapter)
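# A minimal sketch of the behaviour the test above exercises, written
# against xarray's private indexing internals (version-dependent; the
# indexer classes come straight from the test). Each selection only
# composes keys -- the wrapped array is read when coerced to an ndarray.
import numpy as np
from xarray.core import indexing

original = np.random.rand(10, 20, 30)
lazy = indexing.LazilyOuterIndexedArray(
    indexing.NumpyIndexingAdapter(original))
sub = lazy[indexing.OuterIndexer((slice(3), np.array([0, 1]), slice(None)))]
print(sub.shape)              # (3, 2, 30), computed from the key alone
print(np.asarray(sub).shape)  # data is materialised only here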
def test_sub_array(self):
    original = indexing.LazilyOuterIndexedArray(np.arange(10))
    wrapped = indexing.MemoryCachedArray(original)
    child = wrapped[B[:5]]
    assert isinstance(child, indexing.MemoryCachedArray)
    assert_array_equal(child, np.arange(5))
    assert isinstance(child.array, indexing.NumpyIndexingAdapter)
    assert isinstance(wrapped.array, indexing.LazilyOuterIndexedArray)
def open_store_variable(self, name, var):
    if isinstance(var.data, np.ndarray):
        data = var.data
    else:
        wrapped_array = CfGribArrayWrapper(self, var.data)
        data = indexing.LazilyOuterIndexedArray(wrapped_array)

    encoding = self.ds.encoding.copy()
    encoding['original_shape'] = var.data.shape

    return Variable(var.dimensions, data, var.attributes, encoding)
def test_vectorized_lazily_indexed_array(self):
    original = np.random.rand(10, 20, 30)
    x = indexing.NumpyIndexingAdapter(original)
    v_eager = Variable(["i", "j", "k"], x)
    lazy = indexing.LazilyOuterIndexedArray(x)
    v_lazy = Variable(["i", "j", "k"], lazy)
    I = ReturnItem()  # noqa: E741  # allow ambiguous name

    def check_indexing(v_eager, v_lazy, indexers):
        for indexer in indexers:
            actual = v_lazy[indexer]
            expected = v_eager[indexer]
            assert expected.shape == actual.shape
            assert isinstance(
                actual._data,
                (
                    indexing.LazilyVectorizedIndexedArray,
                    indexing.LazilyOuterIndexedArray,
                ),
            )
            assert_array_equal(expected, actual)
            v_eager = expected
            v_lazy = actual

    # test orthogonal indexing
    indexers = [(I[:], 0, 1), (Variable("i", [0, 1]),)]
    check_indexing(v_eager, v_lazy, indexers)

    # vectorized indexing
    indexers = [
        (Variable("i", [0, 1]), Variable("i", [0, 1]), slice(None)),
        (slice(1, 3, 2), 0),
    ]
    check_indexing(v_eager, v_lazy, indexers)

    indexers = [
        (slice(None, None, 2), 0, slice(None, 10)),
        (Variable("i", [3, 2, 4, 3]), Variable("i", [3, 2, 1, 0])),
        (Variable(["i", "j"], [[0, 1], [1, 2]]),),
    ]
    check_indexing(v_eager, v_lazy, indexers)

    indexers = [
        (Variable("i", [3, 2, 4, 3]), Variable("i", [3, 2, 1, 0])),
        (Variable(["i", "j"], [[0, 1], [1, 2]]),),
    ]
    check_indexing(v_eager, v_lazy, indexers)
def open_store_variable(self, name, tiledb_array):
    # TODO: What / why was LazilyOuterIndexedArray being used?
    dimensions, attributes = _get_tiledb_dims_and_attrs(
        tiledb_array, _DIMENSION_KEY)
    attributes = dict(attributes)
    # encoding = {
    #     "chunks": zarr_array.chunks,
    #     "compressor": zarr_array.compressor,
    #     "filters": zarr_array.filters,
    # }
    # _FillValue needs to be in attributes, not encoding, so it will get
    # picked up by decode_cf
    # TODO: fill value???
    # if getattr(attributes, "fill_value") is not None:
    #     attributes["_FillValue"] = zarr_array.fill_value
    data = indexing.LazilyOuterIndexedArray(LazyTileDB(tiledb_array))
    return Variable(dimensions, data, attributes)
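# On the TODO above: the usual reason for the wrapper is deferral -- the
# backend array only reports shape/dtype and translates indexers, while
# LazilyOuterIndexedArray composes keys, so opening the store reads no
# data. A hedged sketch of what LazyTileDB is assumed to look like (this
# class body is an illustration, not the project's actual code):
from xarray.backends.common import BackendArray
from xarray.core import indexing


class LazyTileDB(BackendArray):
    def __init__(self, tiledb_array):
        self.tiledb_array = tiledb_array
        self.shape = tiledb_array.shape
        self.dtype = tiledb_array.dtype

    def __getitem__(self, key):
        # let xarray decompose arbitrary indexers into plain outer indexing
        return indexing.explicit_indexing_adapter(
            key, self.shape, indexing.IndexingSupport.OUTER, self._getitem)

    def _getitem(self, key):
        # only here does TileDB actually read from storage
        return self.tiledb_array[key]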
def open_store_variable(self, name, var):
    if isinstance(var.data, cfgrib.dataset.OnDiskArray):
        data = indexing.LazilyOuterIndexedArray(WrapGrib(var.data))
    else:
        data = var.data

    dimensions = tuple(self.variable_map.get(dim, dim)
                       for dim in var.dimensions)
    attrs = var.attributes

    # the coordinates attributes need a special treatment
    if 'coordinates' in attrs:
        coordinates = [self.variable_map.get(d, d)
                       for d in attrs['coordinates'].split()]
        attrs['coordinates'] = ' '.join(coordinates)

    encoding = {}
    # save source so __repr__ can detect if it's local or not
    encoding['source'] = self.ds.stream.path
    encoding['original_shape'] = var.data.shape

    return Variable(dimensions, data, attrs, encoding)
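# For context, a store variable like this is what backs lazy GRIB loading
# through the public API. A hedged usage sketch ("era5.grib" is a
# placeholder path):
import xarray as xr

ds = xr.open_dataset("era5.grib", engine="cfgrib")  # variables stay lazy
name = next(iter(ds.data_vars))
values = ds[name].values  # the first real GRIB read happens here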
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    masked=False,
    mask_and_scale=False,
    variable=None,
    group=None,
    default_name=None,
    **open_kwargs,
):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    You can generate 2D coordinates from the file's attributes with::

        from affine import Affine
        da = xr.open_rasterio('path_to_file.tif')
        transform = Affine.from_gdal(*da.attrs['transform'])
        nx, ny = da.sizes['x'], da.sizes['y']
        x, y = np.meshgrid(np.arange(nx)+0.5, np.arange(ny)+0.5) * transform

    Parameters
    ----------
    filename: str, rasterio.DatasetReader, or rasterio.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates: bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks: int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the
        new DataArray into a dask array. Chunks can also be set to
        ``True`` or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size")``.
    cache: bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock: False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is
        used to avoid issues with concurrent access to the same file when
        using dask's multithreaded backend.
    masked: bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    mask_and_scale: bool, optional
        Lazily scale (using the `scales` and `offsets` from rasterio) and mask.
        If the _Unsigned attribute is present treat integer arrays as unsigned.
    variable: str or list or tuple, optional
        Variable name or names to use to filter loading.
    group: str or list or tuple, optional
        Group name or names to use to filter loading.
    default_name: str, optional
        The name of the data array if none exists. Default is None.
    **open_kwargs: kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    :obj:`xarray.Dataset` | :obj:`xarray.DataArray` | List[:obj:`xarray.Dataset`]:
        The newly created dataset(s).
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates
    masked = masked or mask_and_scale
    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            src_crs=vrt.src_crs.to_string(),
            crs=vrt.crs.to_string(),
            resampling=vrt.resampling,
            tolerance=vrt.tolerance,
            src_nodata=vrt.src_nodata,
            nodata=vrt.nodata,
            width=vrt.width,
            height=vrt.height,
            src_transform=vrt.src_transform,
            transform=vrt.transform,
            dtype=vrt.working_dtype,
            warp_extras=vrt.warp_extras,
        )

    if lock is None:
        lock = RASTERIO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)
    with warnings.catch_warnings(record=True) as rio_warnings:
        manager = CachingFileManager(
            rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs
        )
        riods = manager.acquire()
        captured_warnings = rio_warnings.copy()

    # raise the NotGeoreferencedWarning if applicable
    for rio_warning in captured_warnings:
        if not riods.subdatasets or not isinstance(
            rio_warning.message, NotGeoreferencedWarning
        ):
            warnings.warn(str(rio_warning.message), type(rio_warning.message))

    # open the subdatasets if they exist
    if riods.subdatasets:
        return _load_subdatasets(
            riods=riods,
            group=group,
            variable=variable,
            parse_coordinates=parse_coordinates,
            chunks=chunks,
            cache=cache,
            lock=lock,
            masked=masked,
            mask_and_scale=mask_and_scale,
        )

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")

    # parse tags & load alternate coords
    attrs = _get_rasterio_attrs(riods=riods)
    coords = _load_netcdf_1d_coords(riods.tags())
    _parse_driver_tags(riods=riods, attrs=attrs, coords=coords)
    for coord in coords:
        if f"NETCDF_DIM_{coord}" in attrs:
            coord_name = coord
            attrs.pop(f"NETCDF_DIM_{coord}")
            break
    else:
        coord_name = "band"
        coords[coord_name] = np.asarray(riods.indexes)

    # Get geospatial coordinates
    transform = _rio_transform(riods)
    if parse_coordinates and transform.is_rectilinear:
        # 1d coordinates
        coords.update(
            affine_to_coords(riods.transform, riods.width, riods.height))
    elif parse_coordinates:
        # 2d coordinates
        warnings.warn(
            "The file coordinates' transformation isn't "
            "rectilinear: xarray won't parse the coordinates "
            "in this case. Set `parse_coordinates=False` to "
            "suppress this warning.",
            RuntimeWarning,
            stacklevel=3,
        )

    unsigned = False
    encoding = {}
    if mask_and_scale and "_Unsigned" in attrs:
        unsigned = variables.pop_to(attrs, encoding, "_Unsigned") == "true"

    da_name = attrs.pop("NETCDF_VARNAME", default_name)
    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(
            manager,
            lock,
            name=da_name,
            vrt_params=vrt_params,
            masked=masked,
            mask_and_scale=mask_and_scale,
            unsigned=unsigned,
        ))

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    result = DataArray(data=data, dims=(coord_name, "y", "x"),
                       coords=coords, attrs=attrs, name=da_name)
    result.encoding = encoding

    # update attributes from NetCDF attributes
    _load_netcdf_attrs(riods.tags(), result)
    result = _decode_datetime_cf(result)

    # make sure the _FillValue is correct dtype
    if "_FillValue" in attrs:
        attrs["_FillValue"] = result.dtype.type(attrs["_FillValue"])

    # handle encoding
    if mask_and_scale:
        if "scale_factor" in result.attrs:
            variables.pop_to(result.attrs, result.encoding,
                             "scale_factor", name=da_name)
        if "add_offset" in result.attrs:
            variables.pop_to(result.attrs, result.encoding,
                             "add_offset", name=da_name)
    if masked:
        if "_FillValue" in result.attrs:
            variables.pop_to(result.attrs, result.encoding,
                             "_FillValue", name=da_name)
        if "missing_value" in result.attrs:
            variables.pop_to(result.attrs, result.encoding,
                             "missing_value", name=da_name)

    # Affine transformation matrix (always available)
    # This describes coefficients mapping pixel coordinates to CRS
    # For serialization store as tuple of 6 floats, the last row being
    # always (0, 0, 1) per definition (see
    # https://github.com/sgillies/affine)
    result.rio.write_transform(riods.transform, inplace=True)
    if hasattr(riods, "crs") and riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)

    if chunks is not None:
        result = _prepare_dask(result, riods, filename, chunks)

    # Make the file closeable
    result._file_obj = manager
    return result
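# A hedged usage sketch of this loader through rioxarray's public entry
# point (the path and chunk sizes are placeholders):
import rioxarray

da = rioxarray.open_rasterio(
    "example.tif", masked=True, chunks={"x": 512, "y": 512})
print(da.dims)          # typically ("band", "y", "x")
band1 = da.sel(band=1)  # still lazy; .values triggers the actual read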
def open_rasterio(
    filename, parse_coordinates=None, chunks=None, cache=None, lock=None,
    masked=False, **open_kwargs
):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    You can generate 2D coordinates from the file's attributes with::

        from affine import Affine
        da = xr.open_rasterio('path_to_file.tif')
        transform = Affine.from_gdal(*da.attrs['transform'])
        nx, ny = da.sizes['x'], da.sizes['y']
        x, y = np.meshgrid(np.arange(nx)+0.5, np.arange(ny)+0.5) * transform

    Parameters
    ----------
    filename : str, rasterio.DatasetReader, or rasterio.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates : bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks : int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the
        new DataArray into a dask array. Chunks can also be set to
        ``True`` or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size")``.
    cache : bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock : False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is
        used to avoid issues with concurrent access to the same file when
        using dask's multithreaded backend.
    masked : bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    **open_kwargs: kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    data : DataArray
        The newly created DataArray.
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates
    import rasterio
    from rasterio.vrt import WarpedVRT

    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            crs=vrt.crs.to_string(),
            resampling=vrt.resampling,
            src_nodata=vrt.src_nodata,
            dst_nodata=vrt.dst_nodata,
            tolerance=vrt.tolerance,
            transform=vrt.transform,
            width=vrt.width,
            height=vrt.height,
            warp_extras=vrt.warp_extras,
        )

    if lock is None:
        lock = RASTERIO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)
    manager = CachingFileManager(
        rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs
    )
    riods = manager.acquire()

    # open the subdatasets if they exist
    if riods.subdatasets:
        data_arrays = {}
        for iii, subdataset in enumerate(riods.subdatasets):
            rioda = open_rasterio(
                subdataset,
                parse_coordinates=iii == 0 and parse_coordinates,
                chunks=chunks,
                cache=cache,
                lock=lock,
                masked=masked,
            )
            data_arrays[rioda.name] = rioda
        return Dataset(data_arrays)

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    coords = OrderedDict()

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")
    coords["band"] = np.asarray(riods.indexes)

    # Get coordinates
    if LooseVersion(rasterio.__version__) < LooseVersion("1.0"):
        transform = riods.affine
    else:
        transform = riods.transform
    if transform.is_rectilinear and parse_coordinates:
        # 1d coordinates
        coords.update(
            affine_to_coords(riods.transform, riods.width, riods.height))
    elif parse_coordinates:
        # 2d coordinates
        warnings.warn(
            "The file coordinates' transformation isn't "
            "rectilinear: xarray won't parse the coordinates "
            "in this case. Set `parse_coordinates=False` to "
            "suppress this warning.",
            RuntimeWarning,
            stacklevel=3,
        )

    # Attributes
    attrs = _parse_tags(riods.tags(1))
    encoding = dict()

    # Affine transformation matrix (always available)
    # This describes coefficients mapping pixel coordinates to CRS
    # For serialization store as tuple of 6 floats, the last row being
    # always (0, 0, 1) per definition (see
    # https://github.com/sgillies/affine)
    attrs["transform"] = tuple(transform)[:6]

    if hasattr(riods, "nodata") and riods.nodata is not None:
        # The nodata values for the raster bands
        if masked:
            encoding["_FillValue"] = riods.nodata
        else:
            attrs["_FillValue"] = riods.nodata
    if hasattr(riods, "scales"):
        # The scale values for the raster bands
        attrs["scales"] = riods.scales
    if hasattr(riods, "offsets"):
        # The offset values for the raster bands
        attrs["offsets"] = riods.offsets
    if hasattr(riods, "descriptions") and any(riods.descriptions):
        # Descriptions for each dataset band
        attrs["descriptions"] = riods.descriptions
    if hasattr(riods, "units") and any(riods.units):
        # A list of units string for each dataset band
        attrs["units"] = riods.units

    # Parse extra metadata from tags, if supported
    parsers = {"ENVI": _parse_envi}
    driver = riods.driver
    if driver in parsers:
        meta = parsers[driver](riods.tags(ns=driver))
        for k, v in meta.items():
            # Add values as coordinates if they match the band count,
            # as attributes otherwise
            if isinstance(v, (list, np.ndarray)) and len(v) == riods.count:
                coords[k] = ("band", np.asarray(v))
            else:
                attrs[k] = v

    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(manager, lock, vrt_params, masked=masked)
    )

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    da_name = attrs.pop("NETCDF_VARNAME", None)
    result = DataArray(
        data=data, dims=("band", "y", "x"), coords=coords, attrs=attrs,
        name=da_name
    )
    result.encoding = encoding

    if hasattr(riods, "crs") and riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)

    if chunks is not None:
        from dask.base import tokenize

        # augment the token with the file modification time
        try:
            mtime = os.path.getmtime(filename)
        except OSError:
            # the filename is probably an s3 bucket rather than a regular file
            mtime = None

        if chunks in (True, "auto"):
            from dask.array.core import normalize_chunks
            import dask

            if LooseVersion(dask.__version__) < LooseVersion("0.18.0"):
                msg = (
                    "Automatic chunking requires dask.__version__ >= 0.18.0 . "
                    "You currently have version %s" % dask.__version__
                )
                raise NotImplementedError(msg)
            block_shape = (1,) + riods.block_shapes[0]
            chunks = normalize_chunks(
                chunks=(1, "auto", "auto"),
                shape=(riods.count, riods.height, riods.width),
                dtype=riods.dtypes[0],
                previous_chunks=tuple((c,) for c in block_shape),
            )
        token = tokenize(filename, mtime, chunks)
        name_prefix = "open_rasterio-%s" % token
        result = result.chunk(chunks, name_prefix=name_prefix, token=token)

    # Make the file closeable
    result._file_obj = manager
    return result
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    masked=False,
    mask_and_scale=False,
    variable=None,
    group=None,
    default_name=None,
    decode_times=True,
    decode_timedelta=None,
    **open_kwargs,
):
    # pylint: disable=too-many-statements,too-many-locals,too-many-branches
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    Parameters
    ----------
    filename: str, rasterio.io.DatasetReader, or rasterio.vrt.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates: bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks: int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the
        new DataArray into a dask array. Chunks can also be set to
        ``True`` or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size")``.
    cache: bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock: bool or dask.utils.SerializableLock, optional
        If chunks is provided, this argument is used to ensure that only one
        thread per process is reading from a rasterio file object at a time.

        By default and when a lock instance is provided,
        a :class:`xarray.backends.CachingFileManager` is used to cache File
        objects. Since rasterio also caches some data, this will make repeated
        reads from the same object fast.

        When ``lock=False``, no lock is used, allowing for completely parallel
        reads from multiple threads or processes. However, a new file handle
        is opened on each request.
    masked: bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    mask_and_scale: bool, optional
        Lazily scale (using the `scales` and `offsets` from rasterio) and mask.
        If the _Unsigned attribute is present treat integer arrays as unsigned.
    variable: str or list or tuple, optional
        Variable name or names to use to filter loading.
    group: str or list or tuple, optional
        Group name or names to use to filter loading.
    default_name: str, optional
        The name of the data array if none exists. Default is None.
    decode_times: bool, optional
        If True, decode times encoded in the standard NetCDF datetime format
        into datetime objects. Otherwise, leave them encoded as numbers.
    decode_timedelta: bool, optional
        If True, decode variables and coordinates with time units in
        {“days”, “hours”, “minutes”, “seconds”, “milliseconds”, “microseconds”}
        into timedelta objects. If False, leave them encoded as numbers.
        If None (default), assume the same value of decode_times.
    **open_kwargs: kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    :obj:`xarray.Dataset` | :obj:`xarray.DataArray` | List[:obj:`xarray.Dataset`]:
        The newly created dataset(s).
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates
    masked = masked or mask_and_scale
    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            src_crs=vrt.src_crs.to_string() if vrt.src_crs else None,
            crs=vrt.crs.to_string() if vrt.crs else None,
            resampling=vrt.resampling,
            tolerance=vrt.tolerance,
            src_nodata=vrt.src_nodata,
            nodata=vrt.nodata,
            width=vrt.width,
            height=vrt.height,
            src_transform=vrt.src_transform,
            transform=vrt.transform,
            dtype=vrt.working_dtype,
            warp_extras=vrt.warp_extras,
        )

    if lock in (True, None):
        lock = RASTERIO_LOCK
    elif lock is False:
        lock = NO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)
    with warnings.catch_warnings(record=True) as rio_warnings:
        if lock is not NO_LOCK:
            manager = CachingFileManager(
                rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs
            )
        else:
            manager = URIManager(
                rasterio.open, filename, mode="r", kwargs=open_kwargs)
        riods = manager.acquire()
        captured_warnings = rio_warnings.copy()

    # raise the NotGeoreferencedWarning if applicable
    for rio_warning in captured_warnings:
        if not riods.subdatasets or not isinstance(
            rio_warning.message, NotGeoreferencedWarning
        ):
            warnings.warn(str(rio_warning.message), type(rio_warning.message))

    # open the subdatasets if they exist
    if riods.subdatasets:
        return _load_subdatasets(
            riods=riods,
            group=group,
            variable=variable,
            parse_coordinates=parse_coordinates,
            chunks=chunks,
            cache=cache,
            lock=lock,
            masked=masked,
            mask_and_scale=mask_and_scale,
            decode_times=decode_times,
            decode_timedelta=decode_timedelta,
            **open_kwargs,
        )

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")

    # parse tags & load alternate coords
    attrs = _get_rasterio_attrs(riods=riods)
    coords = _load_netcdf_1d_coords(riods.tags())
    _parse_driver_tags(riods=riods, attrs=attrs, coords=coords)
    for coord in coords:
        if f"NETCDF_DIM_{coord}" in attrs:
            coord_name = coord
            attrs.pop(f"NETCDF_DIM_{coord}")
            break
    else:
        coord_name = "band"
        coords[coord_name] = np.asarray(riods.indexes)

    has_gcps = riods.gcps[0]
    if has_gcps:
        parse_coordinates = False

    # Get geospatial coordinates
    if parse_coordinates:
        coords.update(
            _generate_spatial_coords(
                _rio_transform(riods), riods.width, riods.height)
        )

    unsigned = False
    encoding = {}
    if mask_and_scale and "_Unsigned" in attrs:
        unsigned = variables.pop_to(attrs, encoding, "_Unsigned") == "true"

    if masked:
        encoding["dtype"] = str(_rasterio_to_numpy_dtype(riods.dtypes))

    da_name = attrs.pop("NETCDF_VARNAME", default_name)
    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(
            manager,
            lock,
            name=da_name,
            vrt_params=vrt_params,
            masked=masked,
            mask_and_scale=mask_and_scale,
            unsigned=unsigned,
        )
    )

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    result = DataArray(
        data=data, dims=(coord_name, "y", "x"), coords=coords, attrs=attrs,
        name=da_name
    )
    result.encoding = encoding

    # update attributes from NetCDF attributes
    _load_netcdf_attrs(riods.tags(), result)
    result = _decode_datetime_cf(
        result, decode_times=decode_times, decode_timedelta=decode_timedelta
    )

    # make sure the _FillValue is correct dtype
    if "_FillValue" in attrs:
        attrs["_FillValue"] = result.dtype.type(attrs["_FillValue"])

    # handle encoding
    _handle_encoding(result, mask_and_scale, masked, da_name)

    # Affine transformation matrix (always available)
    # This describes coefficients mapping pixel coordinates to CRS
    # For serialization store as tuple of 6 floats, the last row being
    # always (0, 0, 1) per definition (see
    # https://github.com/sgillies/affine)
    result.rio.write_transform(_rio_transform(riods), inplace=True)
    if riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)
    if has_gcps:
        result.rio.write_gcps(*riods.gcps, inplace=True)

    if chunks is not None:
        result = _prepare_dask(result, riods, filename, chunks)

    # Make the file closeable
    result.set_close(manager.close)
    result.rio._manager = manager

    # add file path to encoding
    result.encoding["source"] = riods.name
    result.encoding["rasterio_dtype"] = str(riods.dtypes[0])
    return result
def test_wrapper(self):
    original = indexing.LazilyOuterIndexedArray(np.arange(10))
    wrapped = indexing.MemoryCachedArray(original)
    assert_array_equal(wrapped, np.arange(10))
    assert isinstance(wrapped.array, indexing.NumpyIndexingAdapter)
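# A short sketch of the caching behaviour asserted above (again using
# private, version-dependent xarray internals): the first coercion to an
# ndarray swaps the lazy inner array for an in-memory adapter, so later
# reads never touch the original source.
import numpy as np
from xarray.core import indexing

cached = indexing.MemoryCachedArray(
    indexing.LazilyOuterIndexedArray(np.arange(10)))
print(type(cached.array).__name__)  # LazilyOuterIndexedArray: not read yet
np.asarray(cached)                  # first access materialises the data
print(type(cached.array).__name__)  # NumpyIndexingAdapter: cached in memory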
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    masked=False,
    variable=None,
    group=None,
    default_name=None,
    **open_kwargs,
):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    You can generate 2D coordinates from the file's attributes with::

        from affine import Affine
        da = xr.open_rasterio('path_to_file.tif')
        transform = Affine.from_gdal(*da.attrs['transform'])
        nx, ny = da.sizes['x'], da.sizes['y']
        x, y = np.meshgrid(np.arange(nx)+0.5, np.arange(ny)+0.5) * transform

    Parameters
    ----------
    filename: str, rasterio.DatasetReader, or rasterio.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates: bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks: int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the
        new DataArray into a dask array. Chunks can also be set to
        ``True`` or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size")``.
    cache: bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock: False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is
        used to avoid issues with concurrent access to the same file when
        using dask's multithreaded backend.
    masked: bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    variable: str or list or tuple, optional
        Variable name or names to use to filter loading.
    group: str or list or tuple, optional
        Group name or names to use to filter loading.
    default_name: str, optional
        The name of the data array if none exists. Default is None.
    **open_kwargs: kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    data : DataArray
        The newly created DataArray.
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates
    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            crs=vrt.crs.to_string(),
            resampling=vrt.resampling,
            src_nodata=vrt.src_nodata,
            dst_nodata=vrt.dst_nodata,
            tolerance=vrt.tolerance,
            transform=vrt.transform,
            width=vrt.width,
            height=vrt.height,
            warp_extras=vrt.warp_extras,
        )

    if lock is None:
        lock = RASTERIO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)
    manager = CachingFileManager(
        rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs
    )
    riods = manager.acquire()

    # open the subdatasets if they exist
    if riods.subdatasets:
        return _load_subdatasets(
            riods=riods,
            group=group,
            variable=variable,
            parse_coordinates=parse_coordinates,
            chunks=chunks,
            cache=cache,
            lock=lock,
            masked=masked,
        )

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")

    coords = OrderedDict()
    coords["band"] = np.asarray(riods.indexes)

    # parse tags
    attrs, encoding = _get_rasterio_attrs(riods=riods, masked=masked)
    _parse_driver_tags(riods=riods, attrs=attrs, coords=coords)

    # Get geospatial coordinates
    transform = _rio_transform(riods)
    if parse_coordinates and transform.is_rectilinear:
        # 1d coordinates
        coords.update(
            affine_to_coords(riods.transform, riods.width, riods.height))
    elif parse_coordinates:
        # 2d coordinates
        warnings.warn(
            "The file coordinates' transformation isn't "
            "rectilinear: xarray won't parse the coordinates "
            "in this case. Set `parse_coordinates=False` to "
            "suppress this warning.",
            RuntimeWarning,
            stacklevel=3,
        )

    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(manager, lock, vrt_params, masked=masked)
    )

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    # create the output data array
    da_name = attrs.pop("NETCDF_VARNAME", default_name)
    result = DataArray(
        data=data, dims=("band", "y", "x"), coords=coords, attrs=attrs,
        name=da_name
    )
    result.encoding = encoding

    if hasattr(riods, "crs") and riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)

    if chunks is not None:
        result = _prepare_dask(result, riods, filename, chunks)

    # Make the file closeable
    result._file_obj = manager
    return result
def open_store_variable(self, name, var):
    """Turn CDMRemote variable into something like a numpy.ndarray."""
    data = indexing.LazilyOuterIndexedArray(CDMArrayWrapper(name, self))
    return Variable(var.dimensions, data,
                    {a: getattr(var, a) for a in var.ncattrs()})