def test_file_manager_refcounts() -> None:
    mock_file = mock.Mock()
    opener = mock.Mock(spec=open, return_value=mock_file)
    cache: dict = {}
    ref_counts: dict = {}

    manager = CachingFileManager(opener, "filename", cache=cache, ref_counts=ref_counts)
    assert ref_counts[manager._key] == 1

    manager.acquire()
    assert cache

    manager2 = CachingFileManager(
        opener, "filename", cache=cache, ref_counts=ref_counts
    )
    assert cache
    assert manager._key == manager2._key
    assert ref_counts[manager._key] == 2

    with set_options(warn_for_unclosed_files=False):
        del manager
        gc.collect()

    assert cache
    assert ref_counts[manager2._key] == 1
    mock_file.close.assert_not_called()

    with set_options(warn_for_unclosed_files=False):
        del manager2
        gc.collect()

    assert not ref_counts
    assert not cache
def test_file_manager_refcounts():
    mock_file = mock.Mock()
    opener = mock.Mock(spec=open, return_value=mock_file)
    cache = {}
    ref_counts = {}

    manager = CachingFileManager(
        opener, 'filename', cache=cache, ref_counts=ref_counts)
    assert ref_counts[manager._key] == 1

    manager.acquire()
    assert cache

    manager2 = CachingFileManager(
        opener, 'filename', cache=cache, ref_counts=ref_counts)
    assert cache
    assert manager._key == manager2._key
    assert ref_counts[manager._key] == 2

    with set_options(warn_for_unclosed_files=False):
        del manager
        gc.collect()

    assert cache
    assert ref_counts[manager2._key] == 1
    mock_file.close.assert_not_called()

    with set_options(warn_for_unclosed_files=False):
        del manager2
        gc.collect()

    assert not ref_counts
    assert not cache
def test_file_manager_replace_object() -> None:
    opener = mock.Mock()
    cache: dict = {}
    ref_counts: dict = {}

    manager = CachingFileManager(opener, "filename", cache=cache, ref_counts=ref_counts)
    manager.acquire()
    assert ref_counts[manager._key] == 1
    assert cache

    manager = CachingFileManager(opener, "filename", cache=cache, ref_counts=ref_counts)
    assert ref_counts[manager._key] == 1
    assert cache

    manager.close()
def test_file_manager_autoclose(expected_warning):
    mock_file = mock.Mock()
    opener = mock.Mock(return_value=mock_file)
    cache = {}

    manager = CachingFileManager(opener, 'filename', cache=cache)
    manager.acquire()
    assert cache

    with set_options(warn_for_unclosed_files=expected_warning is not None):
        with pytest.warns(expected_warning):
            del manager
            gc.collect()

    assert not cache
    mock_file.close.assert_called_once_with()
def test_file_manager_autoclose(expected_warning) -> None:
    mock_file = mock.Mock()
    opener = mock.Mock(return_value=mock_file)
    cache: dict = {}

    manager = CachingFileManager(opener, "filename", cache=cache)
    manager.acquire()
    assert cache

    with set_options(warn_for_unclosed_files=expected_warning is not None):
        with pytest.warns(expected_warning):
            del manager
            gc.collect()

    assert not cache
    mock_file.close.assert_called_once_with()
def test_file_manager_autoclose_while_locked():
    opener = mock.Mock()
    lock = threading.Lock()
    cache = {}

    manager = CachingFileManager(opener, 'filename', lock=lock, cache=cache)
    manager.acquire()
    assert cache

    lock.acquire()

    with set_options(warn_for_unclosed_files=False):
        del manager
        gc.collect()

    # can't clear the cache while locked, but also don't block in __del__
    assert cache
def test_file_manager_autoclose_while_locked() -> None:
    opener = mock.Mock()
    lock = threading.Lock()
    cache: dict = {}

    manager = CachingFileManager(opener, "filename", lock=lock, cache=cache)
    manager.acquire()
    assert cache

    lock.acquire()

    with set_options(warn_for_unclosed_files=False):
        del manager
        gc.collect()

    # can't clear the cache while locked, but also don't block in __del__
    assert cache
def test_file_manager_replace_object():
    opener = mock.Mock()
    cache = {}
    ref_counts = {}

    manager = CachingFileManager(
        opener, 'filename', cache=cache, ref_counts=ref_counts)
    manager.acquire()
    assert ref_counts[manager._key] == 1
    assert cache

    manager = CachingFileManager(
        opener, 'filename', cache=cache, ref_counts=ref_counts)
    assert ref_counts[manager._key] == 1
    assert cache

    manager.close()
def test_file_manager_write_concurrent(tmpdir, file_cache) -> None:
    path = str(tmpdir.join("testing.txt"))
    manager = CachingFileManager(open, path, mode="w", cache=file_cache)
    f1 = manager.acquire()
    f2 = manager.acquire()
    f3 = manager.acquire()
    assert f1 is f2
    assert f2 is f3

    f1.write("foo")
    f1.flush()
    f2.write("bar")
    f2.flush()
    f3.write("baz")
    f3.flush()
    manager.close()

    with open(path) as f:
        assert f.read() == "foobarbaz"
def test_file_manager_write_concurrent(tmpdir, file_cache):
    path = str(tmpdir.join('testing.txt'))
    manager = CachingFileManager(open, path, mode='w', cache=file_cache)
    f1 = manager.acquire()
    f2 = manager.acquire()
    f3 = manager.acquire()
    assert f1 is f2
    assert f2 is f3

    f1.write('foo')
    f1.flush()
    f2.write('bar')
    f2.flush()
    f3.write('baz')
    f3.flush()
    manager.close()

    with open(path, 'r') as f:
        assert f.read() == 'foobarbaz'
def test_file_manager_read(tmpdir, file_cache):
    path = str(tmpdir.join('testing.txt'))

    with open(path, 'w') as f:
        f.write('foobar')

    manager = CachingFileManager(open, path, cache=file_cache)
    f = manager.acquire()
    assert f.read() == 'foobar'
    manager.close()
def test_file_manager_read(tmpdir, file_cache) -> None:
    path = str(tmpdir.join("testing.txt"))

    with open(path, "w") as f:
        f.write("foobar")

    manager = CachingFileManager(open, path, cache=file_cache)
    f = manager.acquire()
    assert f.read() == "foobar"
    manager.close()
def test_file_manager_write_consecutive(tmpdir, file_cache) -> None:
    path1 = str(tmpdir.join("testing1.txt"))
    path2 = str(tmpdir.join("testing2.txt"))
    manager1 = CachingFileManager(open, path1, mode="w", cache=file_cache)
    manager2 = CachingFileManager(open, path2, mode="w", cache=file_cache)

    f1a = manager1.acquire()
    f1a.write("foo")
    f1a.flush()
    f2 = manager2.acquire()
    f2.write("bar")
    f2.flush()
    f1b = manager1.acquire()
    f1b.write("baz")
    assert (getattr(file_cache, "maxsize", float("inf")) > 1) == (f1a is f1b)
    manager1.close()
    manager2.close()

    with open(path1) as f:
        assert f.read() == "foobaz"
    with open(path2) as f:
        assert f.read() == "bar"
def test_file_manager_write_consecutive(tmpdir, file_cache):
    path1 = str(tmpdir.join('testing1.txt'))
    path2 = str(tmpdir.join('testing2.txt'))
    manager1 = CachingFileManager(open, path1, mode='w', cache=file_cache)
    manager2 = CachingFileManager(open, path2, mode='w', cache=file_cache)

    f1a = manager1.acquire()
    f1a.write('foo')
    f1a.flush()
    f2 = manager2.acquire()
    f2.write('bar')
    f2.flush()
    f1b = manager1.acquire()
    f1b.write('baz')
    assert (getattr(file_cache, 'maxsize', float('inf')) > 1) == (f1a is f1b)
    manager1.close()
    manager2.close()

    with open(path1, 'r') as f:
        assert f.read() == 'foobaz'
    with open(path2, 'r') as f:
        assert f.read() == 'bar'
def test_file_manager_write_pickle(tmpdir, file_cache) -> None:
    path = str(tmpdir.join("testing.txt"))
    manager = CachingFileManager(open, path, mode="w", cache=file_cache)
    f = manager.acquire()
    f.write("foo")
    f.flush()

    manager2 = pickle.loads(pickle.dumps(manager))
    f2 = manager2.acquire()
    f2.write("bar")
    manager2.close()
    manager.close()

    with open(path) as f:
        assert f.read() == "foobar"
def test_file_manager_write_pickle(tmpdir, file_cache):
    path = str(tmpdir.join('testing.txt'))
    manager = CachingFileManager(open, path, mode='w', cache=file_cache)
    f = manager.acquire()
    f.write('foo')
    f.flush()

    manager2 = pickle.loads(pickle.dumps(manager))
    f2 = manager2.acquire()
    f2.write('bar')
    manager2.close()
    manager.close()

    with open(path, 'r') as f:
        assert f.read() == 'foobar'
def test_file_manager_mock_write(file_cache) -> None:
    mock_file = mock.Mock()
    opener = mock.Mock(spec=open, return_value=mock_file)
    lock = mock.MagicMock(spec=threading.Lock())

    manager = CachingFileManager(opener, "filename", lock=lock, cache=file_cache)
    f = manager.acquire()
    f.write("contents")
    manager.close()

    assert not file_cache
    opener.assert_called_once_with("filename")
    mock_file.write.assert_called_once_with("contents")
    mock_file.close.assert_called_once_with()
    lock.__enter__.assert_has_calls([mock.call(), mock.call()])
def test_file_manager_mock_write(file_cache):
    mock_file = mock.Mock()
    opener = mock.Mock(spec=open, return_value=mock_file)
    lock = mock.MagicMock(spec=threading.Lock())

    manager = CachingFileManager(
        opener, 'filename', lock=lock, cache=file_cache)
    f = manager.acquire()
    f.write('contents')
    manager.close()

    assert not file_cache
    opener.assert_called_once_with('filename')
    mock_file.write.assert_called_once_with('contents')
    mock_file.close.assert_called_once_with()
    lock.__enter__.assert_has_calls([mock.call(), mock.call()])
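# --- Hedged usage sketch (not part of the original test module) ---------------
# A minimal illustration of the CachingFileManager API that the tests above
# exercise: construct with an opener, acquire() the (cached) file object, and
# close() to evict it from the cache. The import path is an assumption about
# the surrounding package layout, not something shown in the tests themselves.
def _example_caching_file_manager_usage(path="example.txt"):
    from xarray.backends import CachingFileManager  # assumed import path

    manager = CachingFileManager(open, path, mode="w")
    f = manager.acquire()   # opens the file, or returns the cached handle
    f.write("hello")
    f.flush()
    manager.close()         # closes the handle and drops it from the cache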
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    masked=False,
    mask_and_scale=False,
    variable=None,
    group=None,
    default_name=None,
    **open_kwargs,
):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    You can generate 2D coordinates from the file's attributes with::

        from affine import Affine
        da = xr.open_rasterio('path_to_file.tif')
        transform = Affine.from_gdal(*da.attrs['transform'])
        nx, ny = da.sizes['x'], da.sizes['y']
        x, y = np.meshgrid(np.arange(nx)+0.5, np.arange(ny)+0.5) * transform

    Parameters
    ----------
    filename: str, rasterio.DatasetReader, or rasterio.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates: bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks: int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the new
        DataArray into a dask array. Chunks can also be set to
        ``True`` or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size")``.
    cache: bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock: False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is
        used to avoid issues with concurrent access to the same file when using
        dask's multithreaded backend.
    masked: bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    mask_and_scale: bool, optional
        Lazily scale (using the `scales` and `offsets` from rasterio) and mask.
        If the _Unsigned attribute is present treat integer arrays as unsigned.
    variable: str or list or tuple, optional
        Variable name or names to use to filter loading.
    group: str or list or tuple, optional
        Group name or names to use to filter loading.
    default_name: str, optional
        The name of the data array if none exists. Default is None.
    **open_kwargs: kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    :obj:`xarray.Dataset` | :obj:`xarray.DataArray` | List[:obj:`xarray.Dataset`]:
        The newly created dataset(s).
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates
    masked = masked or mask_and_scale
    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            src_crs=vrt.src_crs.to_string(),
            crs=vrt.crs.to_string(),
            resampling=vrt.resampling,
            tolerance=vrt.tolerance,
            src_nodata=vrt.src_nodata,
            nodata=vrt.nodata,
            width=vrt.width,
            height=vrt.height,
            src_transform=vrt.src_transform,
            transform=vrt.transform,
            dtype=vrt.working_dtype,
            warp_extras=vrt.warp_extras,
        )

    if lock is None:
        lock = RASTERIO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)
    with warnings.catch_warnings(record=True) as rio_warnings:
        manager = CachingFileManager(
            rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs
        )
        riods = manager.acquire()
        captured_warnings = rio_warnings.copy()

    # raise the NotGeoreferencedWarning if applicable
    for rio_warning in captured_warnings:
        if not riods.subdatasets or not isinstance(
            rio_warning.message, NotGeoreferencedWarning
        ):
            warnings.warn(str(rio_warning.message), type(rio_warning.message))

    # open the subdatasets if they exist
    if riods.subdatasets:
        return _load_subdatasets(
            riods=riods,
            group=group,
            variable=variable,
            parse_coordinates=parse_coordinates,
            chunks=chunks,
            cache=cache,
            lock=lock,
            masked=masked,
            mask_and_scale=mask_and_scale,
        )

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")

    # parse tags & load alternate coords
    attrs = _get_rasterio_attrs(riods=riods)
    coords = _load_netcdf_1d_coords(riods.tags())
    _parse_driver_tags(riods=riods, attrs=attrs, coords=coords)
    for coord in coords:
        if f"NETCDF_DIM_{coord}" in attrs:
            coord_name = coord
            attrs.pop(f"NETCDF_DIM_{coord}")
            break
    else:
        coord_name = "band"
        coords[coord_name] = np.asarray(riods.indexes)

    # Get geospatial coordinates
    transform = _rio_transform(riods)
    if parse_coordinates and transform.is_rectilinear:
        # 1d coordinates
        coords.update(
            affine_to_coords(riods.transform, riods.width, riods.height))
    elif parse_coordinates:
        # 2d coordinates
        warnings.warn(
            "The file coordinates' transformation isn't "
            "rectilinear: xarray won't parse the coordinates "
            "in this case. Set `parse_coordinates=False` to "
            "suppress this warning.",
            RuntimeWarning,
            stacklevel=3,
        )

    unsigned = False
    encoding = {}
    if mask_and_scale and "_Unsigned" in attrs:
        unsigned = variables.pop_to(attrs, encoding, "_Unsigned") == "true"

    da_name = attrs.pop("NETCDF_VARNAME", default_name)
    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(
            manager,
            lock,
            name=da_name,
            vrt_params=vrt_params,
            masked=masked,
            mask_and_scale=mask_and_scale,
            unsigned=unsigned,
        ))

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    result = DataArray(
        data=data, dims=(coord_name, "y", "x"), coords=coords, attrs=attrs, name=da_name
    )
    result.encoding = encoding

    # update attributes from NetCDF attributes
    _load_netcdf_attrs(riods.tags(), result)
    result = _decode_datetime_cf(result)

    # make sure the _FillValue is correct dtype
    if "_FillValue" in attrs:
        attrs["_FillValue"] = result.dtype.type(attrs["_FillValue"])

    # handle encoding
    if mask_and_scale:
        if "scale_factor" in result.attrs:
            variables.pop_to(result.attrs, result.encoding, "scale_factor", name=da_name)
        if "add_offset" in result.attrs:
            variables.pop_to(result.attrs, result.encoding, "add_offset", name=da_name)
    if masked:
        if "_FillValue" in result.attrs:
            variables.pop_to(result.attrs, result.encoding, "_FillValue", name=da_name)
        if "missing_value" in result.attrs:
            variables.pop_to(result.attrs, result.encoding, "missing_value", name=da_name)

    # Affine transformation matrix (always available)
    # This describes coefficients mapping pixel coordinates to CRS
    # For serialization store as tuple of 6 floats, the last row being
    # always (0, 0, 1) per definition (see
    # https://github.com/sgillies/affine)
    result.rio.write_transform(riods.transform, inplace=True)
    if hasattr(riods, "crs") and riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)

    if chunks is not None:
        result = _prepare_dask(result, riods, filename, chunks)

    # Make the file closeable
    result._file_obj = manager

    return result
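# --- Hedged usage sketch (not part of the original module) --------------------
# Illustrates a call into the open_rasterio() defined above, exercising the
# chunks/masked parameters from its docstring. "example.tif" and the chunk
# sizes are placeholders, not values taken from the source.
def _example_open_rasterio_chunked(path="example.tif"):
    da = open_rasterio(
        path,
        chunks={"x": 512, "y": 512},  # lazy, dask-backed reads per 512x512 tile
        masked=True,                  # nodata pixels are returned as NaN
    )
    print(da.dims, da.sizes)
    da.close()                        # releases the file handle held by the manager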
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    masked=False,
    **open_kwargs
):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    You can generate 2D coordinates from the file's attributes with::

        from affine import Affine
        da = xr.open_rasterio('path_to_file.tif')
        transform = Affine.from_gdal(*da.attrs['transform'])
        nx, ny = da.sizes['x'], da.sizes['y']
        x, y = np.meshgrid(np.arange(nx)+0.5, np.arange(ny)+0.5) * transform

    Parameters
    ----------
    filename : str, rasterio.DatasetReader, or rasterio.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates : bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks : int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the new
        DataArray into a dask array. Chunks can also be set to
        ``True`` or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size")``.
    cache : bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock : False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is
        used to avoid issues with concurrent access to the same file when using
        dask's multithreaded backend.
    masked : bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    **open_kwargs: kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    data : DataArray
        The newly created DataArray.
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates
    import rasterio
    from rasterio.vrt import WarpedVRT

    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            crs=vrt.crs.to_string(),
            resampling=vrt.resampling,
            src_nodata=vrt.src_nodata,
            dst_nodata=vrt.dst_nodata,
            tolerance=vrt.tolerance,
            transform=vrt.transform,
            width=vrt.width,
            height=vrt.height,
            warp_extras=vrt.warp_extras,
        )

    if lock is None:
        lock = RASTERIO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)
    manager = CachingFileManager(
        rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs
    )
    riods = manager.acquire()

    # open the subdatasets if they exist
    if riods.subdatasets:
        data_arrays = {}
        for iii, subdataset in enumerate(riods.subdatasets):
            rioda = open_rasterio(
                subdataset,
                parse_coordinates=iii == 0 and parse_coordinates,
                chunks=chunks,
                cache=cache,
                lock=lock,
                masked=masked,
            )
            data_arrays[rioda.name] = rioda
        return Dataset(data_arrays)

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    coords = OrderedDict()

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")
    coords["band"] = np.asarray(riods.indexes)

    # Get coordinates
    if LooseVersion(rasterio.__version__) < LooseVersion("1.0"):
        transform = riods.affine
    else:
        transform = riods.transform
    if transform.is_rectilinear and parse_coordinates:
        # 1d coordinates
        coords.update(affine_to_coords(riods.transform, riods.width, riods.height))
    elif parse_coordinates:
        # 2d coordinates
        warnings.warn(
            "The file coordinates' transformation isn't "
            "rectilinear: xarray won't parse the coordinates "
            "in this case. Set `parse_coordinates=False` to "
            "suppress this warning.",
            RuntimeWarning,
            stacklevel=3,
        )

    # Attributes
    attrs = _parse_tags(riods.tags(1))
    encoding = dict()
    # Affine transformation matrix (always available)
    # This describes coefficients mapping pixel coordinates to CRS
    # For serialization store as tuple of 6 floats, the last row being
    # always (0, 0, 1) per definition (see
    # https://github.com/sgillies/affine)
    attrs["transform"] = tuple(transform)[:6]
    if hasattr(riods, "nodata") and riods.nodata is not None:
        # The nodata values for the raster bands
        if masked:
            encoding["_FillValue"] = riods.nodata
        else:
            attrs["_FillValue"] = riods.nodata
    if hasattr(riods, "scales"):
        # The scale values for the raster bands
        attrs["scales"] = riods.scales
    if hasattr(riods, "offsets"):
        # The offset values for the raster bands
        attrs["offsets"] = riods.offsets
    if hasattr(riods, "descriptions") and any(riods.descriptions):
        # Descriptions for each dataset band
        attrs["descriptions"] = riods.descriptions
    if hasattr(riods, "units") and any(riods.units):
        # A list of units string for each dataset band
        attrs["units"] = riods.units

    # Parse extra metadata from tags, if supported
    parsers = {"ENVI": _parse_envi}

    driver = riods.driver
    if driver in parsers:
        meta = parsers[driver](riods.tags(ns=driver))

        for k, v in meta.items():
            # Add values as coordinates if they match the band count,
            # as attributes otherwise
            if isinstance(v, (list, np.ndarray)) and len(v) == riods.count:
                coords[k] = ("band", np.asarray(v))
            else:
                attrs[k] = v

    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(manager, lock, vrt_params, masked=masked)
    )

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    da_name = attrs.pop("NETCDF_VARNAME", None)
    result = DataArray(
        data=data, dims=("band", "y", "x"), coords=coords, attrs=attrs, name=da_name
    )
    result.encoding = encoding

    if hasattr(riods, "crs") and riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)

    if chunks is not None:
        from dask.base import tokenize

        # augment the token with the file modification time
        try:
            mtime = os.path.getmtime(filename)
        except OSError:
            # the filename is probably an s3 bucket rather than a regular file
            mtime = None

        if chunks in (True, "auto"):
            from dask.array.core import normalize_chunks
            import dask

            if LooseVersion(dask.__version__) < LooseVersion("0.18.0"):
                msg = (
                    "Automatic chunking requires dask.__version__ >= 0.18.0 . "
                    "You currently have version %s" % dask.__version__
                )
                raise NotImplementedError(msg)
            block_shape = (1,) + riods.block_shapes[0]
            chunks = normalize_chunks(
                chunks=(1, "auto", "auto"),
                shape=(riods.count, riods.height, riods.width),
                dtype=riods.dtypes[0],
                previous_chunks=tuple((c,) for c in block_shape),
            )
        token = tokenize(filename, mtime, chunks)
        name_prefix = "open_rasterio-%s" % token
        result = result.chunk(chunks, name_prefix=name_prefix, token=token)

    # Make the file closeable
    result._file_obj = manager

    return result
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    masked=False,
    variable=None,
    group=None,
    default_name=None,
    **open_kwargs,
):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    You can generate 2D coordinates from the file's attributes with::

        from affine import Affine
        da = xr.open_rasterio('path_to_file.tif')
        transform = Affine.from_gdal(*da.attrs['transform'])
        nx, ny = da.sizes['x'], da.sizes['y']
        x, y = np.meshgrid(np.arange(nx)+0.5, np.arange(ny)+0.5) * transform

    Parameters
    ----------
    filename: str, rasterio.DatasetReader, or rasterio.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates: bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks: int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the new
        DataArray into a dask array. Chunks can also be set to
        ``True`` or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size")``.
    cache: bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock: False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is
        used to avoid issues with concurrent access to the same file when using
        dask's multithreaded backend.
    masked: bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    variable: str or list or tuple, optional
        Variable name or names to use to filter loading.
    group: str or list or tuple, optional
        Group name or names to use to filter loading.
    default_name: str, optional
        The name of the data array if none exists. Default is None.
    **open_kwargs: kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    data : DataArray
        The newly created DataArray.
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates
    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            crs=vrt.crs.to_string(),
            resampling=vrt.resampling,
            src_nodata=vrt.src_nodata,
            dst_nodata=vrt.dst_nodata,
            tolerance=vrt.tolerance,
            transform=vrt.transform,
            width=vrt.width,
            height=vrt.height,
            warp_extras=vrt.warp_extras,
        )

    if lock is None:
        lock = RASTERIO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)
    manager = CachingFileManager(
        rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs
    )
    riods = manager.acquire()

    # open the subdatasets if they exist
    if riods.subdatasets:
        return _load_subdatasets(
            riods=riods,
            group=group,
            variable=variable,
            parse_coordinates=parse_coordinates,
            chunks=chunks,
            cache=cache,
            lock=lock,
            masked=masked,
        )

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")

    coords = OrderedDict()
    coords["band"] = np.asarray(riods.indexes)

    # parse tags
    attrs, encoding = _get_rasterio_attrs(riods=riods, masked=masked)
    _parse_driver_tags(riods=riods, attrs=attrs, coords=coords)

    # Get geospatial coordinates
    transform = _rio_transform(riods)
    if parse_coordinates and transform.is_rectilinear:
        # 1d coordinates
        coords.update(affine_to_coords(riods.transform, riods.width, riods.height))
    elif parse_coordinates:
        # 2d coordinates
        warnings.warn(
            "The file coordinates' transformation isn't "
            "rectilinear: xarray won't parse the coordinates "
            "in this case. Set `parse_coordinates=False` to "
            "suppress this warning.",
            RuntimeWarning,
            stacklevel=3,
        )

    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(manager, lock, vrt_params, masked=masked)
    )

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    # create the output data array
    da_name = attrs.pop("NETCDF_VARNAME", default_name)
    result = DataArray(
        data=data, dims=("band", "y", "x"), coords=coords, attrs=attrs, name=da_name
    )
    result.encoding = encoding

    if hasattr(riods, "crs") and riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)

    if chunks is not None:
        result = _prepare_dask(result, riods, filename, chunks)

    # Make the file closeable
    result._file_obj = manager

    return result
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    masked=False,
    mask_and_scale=False,
    variable=None,
    group=None,
    default_name=None,
    decode_times=True,
    decode_timedelta=None,
    **open_kwargs,
):  # pylint: disable=too-many-statements,too-many-locals,too-many-branches
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    Parameters
    ----------
    filename: str, rasterio.io.DatasetReader, or rasterio.vrt.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates: bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks: int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the new
        DataArray into a dask array. Chunks can also be set to
        ``True`` or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size")``.
    cache: bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock: bool or dask.utils.SerializableLock, optional
        If chunks is provided, this argument is used to ensure that only one
        thread per process is reading from a rasterio file object at a time.

        By default and when a lock instance is provided,
        a :class:`xarray.backends.CachingFileManager` is used to cache File objects.
        Since rasterio also caches some data, this will make repeated reads from the
        same object fast.

        When ``lock=False``, no lock is used, allowing for completely parallel reads
        from multiple threads or processes. However, a new file handle is opened on
        each request.
    masked: bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    mask_and_scale: bool, optional
        Lazily scale (using the `scales` and `offsets` from rasterio) and mask.
        If the _Unsigned attribute is present treat integer arrays as unsigned.
    variable: str or list or tuple, optional
        Variable name or names to use to filter loading.
    group: str or list or tuple, optional
        Group name or names to use to filter loading.
    default_name: str, optional
        The name of the data array if none exists. Default is None.
    decode_times: bool, optional
        If True, decode times encoded in the standard NetCDF datetime format
        into datetime objects. Otherwise, leave them encoded as numbers.
    decode_timedelta: bool, optional
        If True, decode variables and coordinates with time units in
        {“days”, “hours”, “minutes”, “seconds”, “milliseconds”, “microseconds”}
        into timedelta objects. If False, leave them encoded as numbers.
        If None (default), assume the same value of decode_times.
    **open_kwargs: kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    :obj:`xarray.Dataset` | :obj:`xarray.DataArray` | List[:obj:`xarray.Dataset`]:
        The newly created dataset(s).
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates
    masked = masked or mask_and_scale
    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            src_crs=vrt.src_crs.to_string() if vrt.src_crs else None,
            crs=vrt.crs.to_string() if vrt.crs else None,
            resampling=vrt.resampling,
            tolerance=vrt.tolerance,
            src_nodata=vrt.src_nodata,
            nodata=vrt.nodata,
            width=vrt.width,
            height=vrt.height,
            src_transform=vrt.src_transform,
            transform=vrt.transform,
            dtype=vrt.working_dtype,
            warp_extras=vrt.warp_extras,
        )

    if lock in (True, None):
        lock = RASTERIO_LOCK
    elif lock is False:
        lock = NO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)
    with warnings.catch_warnings(record=True) as rio_warnings:
        if lock is not NO_LOCK:
            manager = CachingFileManager(
                rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs
            )
        else:
            manager = URIManager(rasterio.open, filename, mode="r", kwargs=open_kwargs)
        riods = manager.acquire()
        captured_warnings = rio_warnings.copy()

    # raise the NotGeoreferencedWarning if applicable
    for rio_warning in captured_warnings:
        if not riods.subdatasets or not isinstance(
            rio_warning.message, NotGeoreferencedWarning
        ):
            warnings.warn(str(rio_warning.message), type(rio_warning.message))

    # open the subdatasets if they exist
    if riods.subdatasets:
        return _load_subdatasets(
            riods=riods,
            group=group,
            variable=variable,
            parse_coordinates=parse_coordinates,
            chunks=chunks,
            cache=cache,
            lock=lock,
            masked=masked,
            mask_and_scale=mask_and_scale,
            decode_times=decode_times,
            decode_timedelta=decode_timedelta,
            **open_kwargs,
        )

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")

    # parse tags & load alternate coords
    attrs = _get_rasterio_attrs(riods=riods)
    coords = _load_netcdf_1d_coords(riods.tags())
    _parse_driver_tags(riods=riods, attrs=attrs, coords=coords)
    for coord in coords:
        if f"NETCDF_DIM_{coord}" in attrs:
            coord_name = coord
            attrs.pop(f"NETCDF_DIM_{coord}")
            break
    else:
        coord_name = "band"
        coords[coord_name] = np.asarray(riods.indexes)

    has_gcps = riods.gcps[0]
    if has_gcps:
        parse_coordinates = False

    # Get geospatial coordinates
    if parse_coordinates:
        coords.update(
            _generate_spatial_coords(_rio_transform(riods), riods.width, riods.height)
        )

    unsigned = False
    encoding = {}
    if mask_and_scale and "_Unsigned" in attrs:
        unsigned = variables.pop_to(attrs, encoding, "_Unsigned") == "true"

    if masked:
        encoding["dtype"] = str(_rasterio_to_numpy_dtype(riods.dtypes))

    da_name = attrs.pop("NETCDF_VARNAME", default_name)
    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(
            manager,
            lock,
            name=da_name,
            vrt_params=vrt_params,
            masked=masked,
            mask_and_scale=mask_and_scale,
            unsigned=unsigned,
        )
    )

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)
    result = DataArray(
        data=data, dims=(coord_name, "y", "x"), coords=coords, attrs=attrs, name=da_name
    )
    result.encoding = encoding

    # update attributes from NetCDF attributes
    _load_netcdf_attrs(riods.tags(), result)
    result = _decode_datetime_cf(
        result, decode_times=decode_times, decode_timedelta=decode_timedelta
    )

    # make sure the _FillValue is correct dtype
    if "_FillValue" in attrs:
        attrs["_FillValue"] = result.dtype.type(attrs["_FillValue"])

    # handle encoding
    _handle_encoding(result, mask_and_scale, masked, da_name)

    # Affine transformation matrix (always available)
    # This describes coefficients mapping pixel coordinates to CRS
    # For serialization store as tuple of 6 floats, the last row being
    # always (0, 0, 1) per definition (see
    # https://github.com/sgillies/affine)
    result.rio.write_transform(_rio_transform(riods), inplace=True)
    if riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)
    if has_gcps:
        result.rio.write_gcps(*riods.gcps, inplace=True)

    if chunks is not None:
        result = _prepare_dask(result, riods, filename, chunks)

    # Make the file closeable
    result.set_close(manager.close)
    result.rio._manager = manager
    # add file path to encoding
    result.encoding["source"] = riods.name
    result.encoding["rasterio_dtype"] = str(riods.dtypes[0])
    return result
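# --- Hedged usage sketch (not part of the original module) --------------------
# Contrasts the two locking modes documented in the version above: the default
# path caches a single handle behind RASTERIO_LOCK via CachingFileManager,
# while lock=False takes the URIManager branch and opens a fresh handle per
# request for fully parallel reads. "scene.tif" is a placeholder path.
def _example_open_rasterio_locking(path="scene.tif"):
    # cached handle, serialized reads; scales/offsets applied lazily
    da_cached = open_rasterio(path, chunks=True, mask_and_scale=True)

    # no lock: parallel reads, but the file is re-opened on each request
    da_parallel = open_rasterio(path, chunks=True, lock=False)
    return da_cached, da_parallel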