def _prepare_dask(result, riods, filename, chunks):
    """
    Prepare the data for dask computations
    """
    from dask.base import tokenize

    # augment the token with the file modification time
    try:
        mtime = os.path.getmtime(filename)
    except OSError:
        # the filename is probably an s3 bucket rather than a regular file
        mtime = None

    if chunks in (True, "auto"):
        import dask
        from dask.array.core import normalize_chunks

        if LooseVersion(dask.__version__) < LooseVersion("0.18.0"):
            msg = ("Automatic chunking requires dask.__version__ >= 0.18.0 . "
                   "You currently have version %s" % dask.__version__)
            raise NotImplementedError(msg)
        block_shape = (1,) + riods.block_shapes[0]
        chunks = normalize_chunks(
            chunks=(1, "auto", "auto"),
            shape=(riods.count, riods.height, riods.width),
            dtype=riods.dtypes[0],
            previous_chunks=tuple((c,) for c in block_shape),
        )
    token = tokenize(filename, mtime, chunks)
    name_prefix = "open_rasterio-%s" % token
    return result.chunk(chunks, name_prefix=name_prefix, token=token)
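# --- Illustrative sketch, not from the original source ---
# Shows how the ``normalize_chunks`` call above expands ``(1, "auto", "auto")``
# while nudging the automatic axes toward the file's native block layout via
# ``previous_chunks``. The shape and block sizes below are made up.
from dask.array.core import normalize_chunks

chunks = normalize_chunks(
    chunks=(1, "auto", "auto"),
    shape=(3, 7000, 8000),                   # (band, height, width)
    dtype="uint8",
    previous_chunks=((1,), (512,), (512,)),  # one tuple per axis
)
# ``chunks`` is a tuple of per-axis block-size tuples; the "auto" axes are
# sized from dask's ``array.chunk-size`` config, typically snapping to
# multiples of the 512-pixel blocks where possible.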
def precomputed_to_dask(store_path: str, key: str,
                        chunks: Union[Sequence[int], str], channel: int = 0):
    tsa = access_precomputed(store_path, key, mode='r')[ts.d["channel"][channel]]
    shape = tuple(tsa.shape)
    dtype = tsa.dtype.numpy_dtype
    if chunks == "auto":
        chunks = tsa.spec().to_json()["scale_metadata"]["chunk_size"]
    _chunks = normalize_chunks(chunks, shape)

    def chunk_loader(store_path, key, block_info=None):
        idx = tuple(slice(*idcs) for idcs in block_info[None]["array-location"])
        tsa = access_precomputed(store_path, key, mode='r')[ts.d["channel"][channel]]
        result = tsa[idx].read().result()
        return result

    arr = map_blocks(chunk_loader, store_path, key, chunks=_chunks, dtype=dtype)
    return arr
def create_array(ds_group, column, schema, coordinate=False):
    # object dtypes need an explicit codec; compare against the builtin
    # ``object`` (``np.object`` is a deprecated alias removed in NumPy 1.24)
    codec = numcodecs.Pickle() if schema.dtype == object else None
    zchunks = zarr_chunks(column, schema.dims, schema.chunks)

    array = ds_group.require_dataset(column, schema.shape,
                                     chunks=zchunks,
                                     dtype=schema.dtype,
                                     object_codec=codec,
                                     exact=True)

    if zchunks is not None:
        # Expand zarr chunks to full dask resolution
        # for comparison purposes
        zchunks = normalize_chunks(array.chunks, schema.shape)

        if zchunks != schema.chunks:
            raise ValueError(
                f"zarr chunks {zchunks} "
                f"don't match dask chunks {schema.chunks}. "
                f"This can cause data corruption as described in "
                f"https://zarr.readthedocs.io/en/stable/tutorial.html"
                f"#parallel-computing-and-synchronization")

    array.attrs[DASKMS_ATTR_KEY] = {
        "dims": schema.dims,
        "coordinate": coordinate,
        "array_type": encode_type(schema.type),
    }
def tri(N, M=None, k=0, dtype=float, chunks="auto", *, like=None):
    if not _numpy_120 and like is not None:
        raise RuntimeError("The use of ``like`` requires NumPy >= 1.20")

    _min_int = np.lib.twodim_base._min_int

    if M is None:
        M = N

    chunks = normalize_chunks(chunks, shape=(N, M), dtype=dtype)

    m = greater_equal(
        arange(N, chunks=chunks[0][0], dtype=_min_int(0, N), like=like)
        .reshape(1, N)
        .T,
        arange(-k, M - k, chunks=chunks[1][0],
               dtype=_min_int(-k, M - k), like=like),
    )

    # Avoid making a copy if the requested type is already bool
    m = m.astype(dtype, copy=False)

    return m
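# Usage sketch (added for illustration): ``tri`` above mirrors ``np.tri``,
# so the lazily built mask should agree with NumPy's eager one.
import numpy as np
import dask.array as da

t = da.tri(4, k=-1, dtype=int, chunks=2)
np.testing.assert_array_equal(t.compute(), np.tri(4, k=-1, dtype=int))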
def _new_chunks(self, in_arr, rows_per_scan):
    """Determine a good scan-based chunk size."""
    if len(in_arr.shape) != 2:
        raise ValueError("Can only rechunk 2D arrays for EWA resampling.")
    if xr is not None and isinstance(in_arr, xr.DataArray):
        # get the dask or numpy array underneath
        in_arr = in_arr.data

    # assume (y, x)
    num_cols = in_arr.shape[1]
    prev_chunks = getattr(in_arr, 'chunks',
                          tuple((x,) for x in in_arr.shape))
    num_row_chunks = prev_chunks[0][0]
    if num_row_chunks % rows_per_scan == 0:
        row_chunks = num_row_chunks
    else:
        row_chunks = 'auto'
    # what do dask's settings give us for full width chunks
    auto_chunks = normalize_chunks({0: row_chunks, 1: num_cols},
                                   shape=in_arr.shape,
                                   dtype=in_arr.dtype,
                                   previous_chunks=prev_chunks)
    # let's make them scan-aligned
    chunk_rows = max(math.floor(auto_chunks[0][0] / rows_per_scan), 1) * rows_per_scan
    return {0: chunk_rows, 1: num_cols}
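# Illustrative sketch (not from the source): ``normalize_chunks`` also accepts
# a dict keyed by axis, as used in ``_new_chunks`` above. The shapes below are
# made up; axis 1 is pinned to the full width while axis 0 is left to dask.
import numpy as np
from dask.array.core import normalize_chunks

auto_chunks = normalize_chunks({0: "auto", 1: 3200},
                               shape=(2400, 3200),
                               dtype=np.float32,
                               previous_chunks=((240,) * 10, (3200,)))
# ``auto_chunks[0][0]`` would then be rounded down to a whole number of
# scans (a multiple of ``rows_per_scan``) by the code above.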
def read(filename, shape, chunks):
    from dask.highlevelgraph import HighLevelGraph
    from dask.array.core import normalize_chunks, Array
    from itertools import product
    from ...tunable import delayed
    from numpy import prod, dtype
    import xmltodict

    records = scan_file(filename)
    records = {r["lime_type"]: r for r in records}

    data_record = records["ildg-binary-data"]
    data_offset = data_record["pos"]

    info = xmltodict.parse(records["ildg-format"]["data"])["ildgFormat"]
    # note: this rebinds the imported numpy ``dtype`` name to a dtype instance
    dtype = dtype("complex%d" % (int(info["precision"]) * 2))

    assert data_record["data_length"] == prod(shape) * dtype.itemsize

    normal_chunks = normalize_chunks(chunks, shape=shape)
    chunks_id = list(product(*[range(len(bd)) for bd in normal_chunks]))

    reads = [
        delayed(read_chunk)(filename, shape, dtype, data_offset, chunks, chunk_id)
        for chunk_id in chunks_id
    ]

    keys = [(filename, *chunk_id) for chunk_id in chunks_id]
    vals = [read.key for read in reads]
    dsk = dict(zip(keys, vals))

    graph = HighLevelGraph.from_collections(filename, dsk, dependencies=reads)

    return Array(graph, filename, normal_chunks, dtype=dtype)
def _parse_wrap_args(func, args, kwargs, shape):
    if isinstance(shape, np.ndarray):
        shape = shape.tolist()

    if not isinstance(shape, (tuple, list)):
        shape = (shape,)

    name = kwargs.pop("name", None)
    chunks = kwargs.pop("chunks", "auto")

    dtype = kwargs.pop("dtype", None)
    if dtype is None:
        dtype = func(shape, *args, **kwargs).dtype
    dtype = np.dtype(dtype)

    chunks = normalize_chunks(chunks, shape, dtype=dtype)

    name = name or funcname(func) + "-" + tokenize(
        func, shape, chunks, dtype, args, kwargs
    )

    return {
        "shape": shape,
        "dtype": dtype,
        "kwargs": kwargs,
        "chunks": chunks,
        "name": name,
    }
def partition_chunking(partition, fragment_rows, chunks):
    partition_rows = sum(fragment_rows)

    if chunks is None:
        # Default to natural chunking determined from individual
        # parquet files in the dataset
        row_chunks = tuple(fragment_rows)
    else:
        try:
            partition_chunks = chunks[partition]
        except IndexError:
            partition_chunks = chunks[-1]

        # We only handle row chunking at present, warn the user
        unhandled_dims = set(partition_chunks.keys()) - {"row"}

        if len(unhandled_dims) != 0:
            warnings.warn(f"{unhandled_dims} chunking dimensions are "
                          f"currently ignored for arrow", UserWarning)

        # Get any user-specified row chunking, defaulting to the
        # natural per-fragment chunking
        row_chunks = partition_chunks.get("row", fragment_rows)

        if isinstance(row_chunks, list):
            row_chunks = tuple(row_chunks)

        row_chunks = normalize_chunks(row_chunks, (partition_rows,))[0]

    intervals = np.cumsum([0] + fragment_rows)
    chunk_intervals = np.cumsum((0,) + row_chunks)

    ranges = defaultdict(list)
    it = zip(chunk_intervals, chunk_intervals[1:])

    for c, (lower, upper) in enumerate(it):
        si = np.searchsorted(intervals, lower, side='right') - 1
        ei = np.searchsorted(intervals, upper, side='left')

        if si == ei:
            raise ValueError("si == ei, arrays may have zero chunks")

        for s in range(si, ei):
            e = s + 1

            if lower <= intervals[s]:
                start = 0
            else:
                start = lower - intervals[s]

            if upper >= intervals[e]:
                end = intervals[e] - intervals[s]
            else:
                end = upper - intervals[s]

            ranges[c].append((s, (start, end)))

    return ranges
def mrc_to_dask(urlpath: Pathlike, chunks: Union[str, Sequence[int]], **kwargs):
    """
    Generate a dask array backed by a memory-mapped .mrc file.
    """
    with access_mrc(urlpath, mode="r") as mem:
        shape, dtype = mrc_shape_dtype_inference(mem)

    if chunks == "auto":
        _chunks = normalize_chunks((1, *(-1,) * (len(shape) - 1)),
                                   shape, dtype=dtype)
    else:
        _chunks = normalize_chunks(chunks, shape, dtype=dtype)

    arr = da.map_blocks(mrc_chunk_loader, urlpath, chunks=_chunks, dtype=dtype)
    return arr
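# Hypothetical usage of ``mrc_to_dask`` above (the file name is made up):
#
#     arr = mrc_to_dask("tomogram.mrc", chunks="auto")
#     arr.mean().compute()
#
# With chunks="auto" each block is a single slab along the first axis:
# ``(1, -1, -1)`` requests size-1 chunks on axis 0 and whole-axis chunks
# elsewhere. A runnable check of just the chunking logic:
import numpy as np
from dask.array.core import normalize_chunks

shape, dtype = (50, 256, 256), np.dtype("float32")
assert normalize_chunks((1, *(-1,) * (len(shape) - 1)), shape, dtype=dtype) \
    == ((1,) * 50, (256,), (256,))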
def compute(self, data, cache_id=None, rows_per_scan=None, chunks=None,
            fill_value=None, weight_count=10000, weight_min=0.01,
            weight_distance_max=1.0, weight_delta_max=1.0,
            weight_sum_min=-1.0, maximum_weight_mode=None, **kwargs):
    """Resample the data according to the precomputed X/Y coordinates."""
    # not used in this step
    kwargs.pop("persist", None)
    data_in, xr_obj = self._get_input_tuples(data)
    rows_per_scan = self._get_rows_per_scan(rows_per_scan)
    data_in = tuple(self._convert_to_dask(data_in, rows_per_scan))
    out_chunks = normalize_chunks(chunks or 'auto',
                                  shape=self.target_geo_def.shape,
                                  dtype=data.dtype)
    fornav_kwargs = kwargs.copy()
    maximum_weight_mode = self._handle_mwm(data, maximum_weight_mode)
    fornav_kwargs.update(dict(
        weight_count=weight_count,
        weight_min=weight_min,
        weight_distance_max=weight_distance_max,
        weight_delta_max=weight_delta_max,
        weight_sum_min=weight_sum_min,
        maximum_weight_mode=maximum_weight_mode,
        rows_per_scan=rows_per_scan,
    ))

    # determine a fill value if they didn't tell us what they have as a
    # fill value in the numpy arrays
    if fill_value is None:
        fill_value = self._get_default_fill(data_in[0])
    data_out = []
    for data_subarr in data_in:
        res = self._run_fornav_single(data_subarr, out_chunks,
                                      self.target_geo_def, fill_value,
                                      **fornav_kwargs)
        data_out.append(res)
    if data.ndim == 2:
        out = data_out[0]
    else:
        out = da.concatenate([arr[None, ...] for arr in data_out], axis=0)

    if xr_obj is not None:
        dims = [d for d in xr_obj.dims if d not in ('y', 'x')] + ['y', 'x']
        out = xr.DataArray(out, attrs=xr_obj.attrs.copy(), dims=dims)
        out = update_resampled_coords(xr_obj, out, self.target_geo_def)
    if isinstance(data, np.ndarray):
        return out.compute()
    return out
def test_rfftfreq(n, d, c):
    c = [ci for ci in c(n) if ci != 0]

    r1 = np.fft.rfftfreq(n, d)
    r2 = da.fft.rfftfreq(n, d, chunks=c)

    assert normalize_chunks(c, r2.shape) == r2.chunks

    assert_eq(r1, r2)
def test_fftfreq(n, d, c):
    c = c(n)

    r1 = np.fft.fftfreq(n, d)
    r2 = da.fft.fftfreq(n, d, chunks=c)

    assert normalize_chunks(c, r2.shape) == r2.chunks

    assert_eq(r1, r2)
def test_dataset(ms, select_cols, group_cols, index_cols, shapes, chunks):
    """ Test dataset creation """
    datasets = read_datasets(ms, select_cols, group_cols,
                             index_cols, chunks=chunks)
    # (1) Read-only TableProxy
    # (2) Read-only TAQL TableProxy
    assert_liveness(2, 1)

    chans = shapes['chan']
    corrs = shapes['corr']

    # Expected output chunks
    echunks = {'chan': normalize_chunks(chunks.get('chan', chans),
                                        shape=(chans,))[0],
               'corr': normalize_chunks(chunks.get('corr', corrs),
                                        shape=(corrs,))[0]}

    for ds in datasets:
        compute_dict = {}

        for k, v in ds.data_vars.items():
            compute_dict[k] = v.data
            assert v.dtype == v.data.dtype

        res = dask.compute(compute_dict)[0]

        assert res['DATA'].shape[1:] == (chans, corrs)
        assert 'STATE_ID' in res
        assert 'TIME' in res

        chunks = ds.chunks
        assert chunks["chan"] == echunks['chan']
        assert chunks["corr"] == echunks['corr']

        dims = ds.dims
        dims.pop('row')  # row changes
        assert dims == {"chan": shapes['chan'], "corr": shapes['corr']}

    del ds, datasets, compute_dict, v
    assert_liveness(0, 0)
def get_group_chunks(group):
    group_chunks = {}

    for array in group.values():
        array_chunks = normalize_chunks(array.chunks, array.shape)
        array_dims = decode_attr(array.attrs[DASKMS_ATTR_KEY])["dims"]
        group_chunks.update(dict(zip(array_dims, array_chunks)))

    return group_chunks
def indices(dimensions, dtype=int, chunks="auto"):
    """
    Implements NumPy's ``indices`` for Dask Arrays.

    Generates a grid of indices covering the dimensions provided.

    The final array has the shape ``(len(dimensions), *dimensions)``. The
    chunks are used to specify the chunking for axis 1 up to
    ``len(dimensions)``. The 0th axis always has chunks of length 1.

    Parameters
    ----------
    dimensions : sequence of ints
        The shape of the index grid.
    dtype : dtype, optional
        Type to use for the array. Default is ``int``.
    chunks : sequence of ints, str
        The size of each block. Must be one of the following forms:

        - A blocksize like (500, 1000)
        - A size in bytes, like "100 MiB" which will choose a uniform
          block-like shape
        - The word "auto" which acts like the above, but uses a configuration
          value ``array.chunk-size`` for the chunk size

        Note that the last block will have fewer samples if
        ``len(array) % chunks != 0``.

    Returns
    -------
    grid : dask array
    """
    dimensions = tuple(dimensions)
    dtype = np.dtype(dtype)
    chunks = normalize_chunks(chunks, shape=dimensions, dtype=dtype)

    if len(dimensions) != len(chunks):
        raise ValueError("Need same number of chunks as dimensions.")

    xi = []
    for i in range(len(dimensions)):
        xi.append(arange(dimensions[i], dtype=dtype, chunks=(chunks[i],)))

    grid = []
    if all(dimensions):
        grid = meshgrid(*xi, indexing="ij")

    if grid:
        grid = stack(grid)
    else:
        grid = empty((len(dimensions),) + dimensions,
                     dtype=dtype, chunks=(1,) + chunks)

    return grid
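# Usage sketch (added for illustration): the 0th axis of ``indices`` is
# always chunked by 1, as the docstring says; the remaining axes follow
# the requested chunking.
import dask.array as da

grid = da.indices((4, 6), chunks=(2, 3))
assert grid.shape == (2, 4, 6)
assert grid.chunks == ((1, 1), (2, 2), (3, 3))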
def test_indices_dimensions_chunks():
    chunks = ((1, 4, 2, 3), (5, 5))
    darr = da.indices((10, 10), chunks=chunks)
    assert darr.chunks == ((1, 1),) + chunks

    with dask.config.set({"array.chunk-size": "50 MiB"}):
        shape = (10000, 10000)
        expected = normalize_chunks("auto", shape=shape, dtype=int)
        result = da.indices(shape, chunks="auto")
        # indices prepends a dimension
        actual = result.chunks[1:]
        assert expected == actual
def fromfunction(func, chunks="auto", shape=None, dtype=None, **kwargs):
    dtype = dtype or float
    chunks = normalize_chunks(chunks, shape, dtype=dtype)

    inds = tuple(range(len(shape)))

    arrs = [arange(s, dtype=dtype, chunks=c) for s, c in zip(shape, chunks)]
    arrs = meshgrid(*arrs, indexing="ij")

    args = sum(zip(arrs, itertools.repeat(inds)), ())

    res = blockwise(func, inds, *args, token="fromfunction", **kwargs)

    return res
def linspace(start, stop, num=50, chunks=None, dtype=None, endpoint=True):
    """
    Return `num` evenly spaced values over the interval [`start`, `stop`],
    including the endpoint when ``endpoint=True``.

    TODO: implement the `retstep` keyword arg and proper `dtype` handling

    Parameters
    ----------
    start : scalar
        The starting value of the sequence.
    stop : scalar
        The last value of the sequence.
    num : int, optional
        Number of samples to include in the returned dask array, including
        the endpoints.
    chunks : int
        The number of samples on each block. Note that the last block will
        have fewer samples if `num % blocksize != 0`.

    Returns
    -------
    samples : dask array
    """
    num = int(num)

    if chunks is None:
        raise ValueError("Must supply a chunks= keyword argument")
    chunks = normalize_chunks(chunks, (num,))

    range_ = stop - start

    # With endpoint=False the step is range_/num, as if one extra sample were
    # appended and then dropped; the number of samples returned is unchanged.
    # (The original bumped ``num`` itself, which produced num + 1 samples.)
    div = (num - 1) if endpoint else num
    space = float(range_) / div

    name = 'linspace-' + tokenize((start, stop, num, chunks, dtype, endpoint))

    dsk = {}
    blockstart = start

    for i, bs in enumerate(chunks[0]):
        blockstop = blockstart + ((bs - 1) * space)
        task = (partial(np.linspace, dtype=dtype), blockstart, blockstop, bs)
        blockstart = blockstart + (space * bs)
        dsk[(name, i)] = task

    return Array(dsk, name, chunks, dtype=dtype)
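# Usage sketch (added for illustration) via the public dask API, which has
# since grown the keywords stubbed out above:
import numpy as np
import dask.array as da

x = da.linspace(0, 1, num=11, chunks=4)
assert x.chunks == ((4, 4, 3),)
np.testing.assert_allclose(np.linspace(0, 1, 11), x.compute())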
def mrc_to_dask(fname: Pathlike, chunks: tuple):
    """
    Generate a dask array backed by a memory-mapped .mrc file
    """
    with access_mrc(fname, mode="r") as mem:
        shape, dtype = mrc_shape_dtype_inference(mem)

    chunks_ = normalize_chunks(chunks, shape)

    def chunk_loader(fname, block_info=None):
        idx = tuple(slice(*idcs) for idcs in block_info[None]["array-location"])
        result = np.array(access_mrc(fname, mode="r").data[idx]).astype(dtype)
        return result

    arr = da.map_blocks(chunk_loader, fname, chunks=chunks_, dtype=dtype)
    return arr
def row_ordering(taql_proxy, index_cols, chunks):
    nrows = taql_proxy.nrows().result()
    chunks = normalize_chunks(chunks['row'], shape=(nrows,))
    token = dask.base.tokenize(taql_proxy, index_cols, chunks, nrows)
    name = 'rows-' + token
    layers = {}
    start = 0

    for i, c in enumerate(chunks[0]):
        layers[(name, i)] = (_sorted_rows, taql_proxy, start, c)
        start += c

    graph = HighLevelGraph.from_collections(name, layers, [])
    # use the builtin ``object``; ``np.object`` is a deprecated alias
    rows = da.Array(graph, name, chunks=chunks, dtype=object)
    row_runs = rows.map_blocks(row_run_factory, sort_dir="read", dtype=object)

    return rows, row_runs
def _group_ordering_arrays(taql_proxy, index_cols, group, group_nrows,
                           group_row_chunks):
    """
    Returns
    -------
    sorted_rows : :class:`dask.array.Array`
        Sorted table rows chunked on ``group_row_chunks``.
    row_runs : :class:`dask.array.Array`
        Array containing (row_run, resort) tuples.
        Should not be directly computed.
        Chunked on ``group_row_chunks``.
    """
    token = dask.base.tokenize(taql_proxy, group, group_nrows)
    name = 'group-rows-' + token
    chunks = ((group_nrows,),)
    layers = {(name, 0): (_sorted_group_rows, taql_proxy, group, index_cols)}

    graph = HighLevelGraph.from_collections(name, layers, [])
    group_rows = da.Array(graph, name, chunks, dtype=np.int32)
    group_rows = cached_array(group_rows)

    try:
        shape = (group_nrows,)
        group_row_chunks = normalize_chunks(group_row_chunks, shape=shape)
    except ValueError as e:
        raise GroupChunkingError("%s\n"
                                 "Unable to match chunks '%s' "
                                 "with shape '%s' for group '%d'. "
                                 "This can occur if too few chunk "
                                 "dictionaries have been supplied for "
                                 "the number of groups "
                                 "and an earlier group's chunking strategy "
                                 "is applied to a later one."
                                 % (str(e), group_row_chunks,
                                    shape, group)) from e

    group_rows = group_rows.rechunk(group_row_chunks)
    row_runs = group_rows.map_blocks(row_run_factory,
                                     sort_dir="read",
                                     dtype=object)
    row_runs = cached_array(row_runs)

    return group_rows, row_runs
def sobol(size: Union[int, Tuple[int, int]],
          d0: int = 0,
          chunks: Chunks2D = None) -> Union[np.ndarray, da.Array]:
    """Sobol points generator based on Gray code order

    :param size:
        number of samples (cannot be greater than :math:`2^{32}`) to extract
        from a single dimension, or tuple (samples, dimensions).
        To guarantee uniform distribution, the number of samples should
        always be :math:`2^{n} - 1`.
    :param int d0:
        first dimension. This can be used as a functional equivalent of a
        random seed. dimensions + d0 can't be greater than
        :func:`max_dimensions()` - 1.
    :param chunks:
        If None, return a numpy array.

        If set, return a dask array with the given chunk size. It can be
        anything accepted by dask (a positive integer, a tuple of two ints,
        or a tuple of two tuples of ints) for the output shape (see result
        below). e.g. either ``(16384, 50)`` or
        ``((16384, 16383), (50, 50, 50))`` could be used together with
        ``size=(32767, 150)``.

        .. note::
           The algorithm is not efficient if there are multiple chunks on
           axis 0. However, if you do need them, it is typically better to
           require them here than re-chunking afterwards, particularly if
           (most of) the subsequent algorithm is embarrassingly parallel.

    :returns:
        If size is an int, a 1-dimensional array of samples.
        If size is a tuple, a 2-dimensional array POINTS, where
        ``POINTS[i, j]`` is the ith sample of the jth dimension.
        Each dimension is a uniform (0, 1) distribution.
    :rtype:
        If chunks is not None, :class:`dask.array.Array`; else
        :class:`numpy.ndarray`
    """
    if isinstance(size, int):
        samples = size
        dimensions = 1
    else:
        samples, dimensions = size

    if not 0 < samples < 2**32:
        raise ValueError("samples must be between 1 and 2^32")
    if not 0 < dimensions + d0 <= max_dimensions():
        raise ValueError("(dimensions + d0) must be between 1 and %d"
                         % max_dimensions())

    if chunks is None:
        res = _sobol_kernel(samples, dimensions, 0, d0)
        if isinstance(size, int):
            res = res[:, 0]
        return res

    # dask-specific code
    chunks = cast(NormalizedChunks2D,
                  normalize_chunks(chunks, shape=(samples, dimensions)))

    name = "sobol-%d-%d-%d" % (samples, dimensions, d0)
    dsk = {}
    offset_i = 0
    for i, size_i in enumerate(chunks[0]):
        offset_j = 0
        for j, size_j in enumerate(chunks[1]):
            dsk[name, i, j] = (_sobol_kernel, size_i, size_j,
                               offset_i, d0 + offset_j)
            offset_j += size_j
        offset_i += size_i

    res = da.Array(dsk, name=name, dtype=float, chunks=chunks)
    if isinstance(size, int):
        res = res[:, 0]
    return res
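# Runnable check (added for illustration) of the chunk normalization the
# docstring above describes: the int pair and the fully explicit form
# normalize to the same per-axis block sizes.
from dask.array.core import normalize_chunks

assert normalize_chunks((16384, 50), shape=(32767, 150)) \
    == normalize_chunks(((16384, 16383), (50, 50, 50)), shape=(32767, 150)) \
    == ((16384, 16383), (50, 50, 50))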
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    masked=False,
    **open_kwargs,
):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    You can generate 2D coordinates from the file's attributes with::

        from affine import Affine
        da = xr.open_rasterio('path_to_file.tif')
        transform = Affine.from_gdal(*da.attrs['transform'])
        nx, ny = da.sizes['x'], da.sizes['y']
        x, y = np.meshgrid(np.arange(nx)+0.5, np.arange(ny)+0.5) * transform

    Parameters
    ----------
    filename : str, rasterio.DatasetReader, or rasterio.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates : bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D). It can be
        useful to set ``parse_coordinates=False`` if your files are very
        large or if you don't need the coordinates.
    chunks : int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the
        new DataArray into a dask array. Chunks can also be set to ``True``
        or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size")``.
    cache : bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock : False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is used
        to avoid issues with concurrent access to the same file when using
        dask's multithreaded backend.
    masked : bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    **open_kwargs : kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    data : DataArray
        The newly created DataArray.
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates
    import rasterio
    from rasterio.vrt import WarpedVRT

    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            crs=vrt.crs.to_string(),
            resampling=vrt.resampling,
            src_nodata=vrt.src_nodata,
            dst_nodata=vrt.dst_nodata,
            tolerance=vrt.tolerance,
            transform=vrt.transform,
            width=vrt.width,
            height=vrt.height,
            warp_extras=vrt.warp_extras,
        )

    if lock is None:
        lock = RASTERIO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)
    manager = CachingFileManager(
        rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs
    )
    riods = manager.acquire()

    # open the subdatasets if they exist
    if riods.subdatasets:
        data_arrays = {}
        for iii, subdataset in enumerate(riods.subdatasets):
            rioda = open_rasterio(
                subdataset,
                parse_coordinates=iii == 0 and parse_coordinates,
                chunks=chunks,
                cache=cache,
                lock=lock,
                masked=masked,
            )
            data_arrays[rioda.name] = rioda
        return Dataset(data_arrays)

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    coords = OrderedDict()

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")
    coords["band"] = np.asarray(riods.indexes)

    # Get coordinates
    if LooseVersion(rasterio.__version__) < LooseVersion("1.0"):
        transform = riods.affine
    else:
        transform = riods.transform
    if transform.is_rectilinear and parse_coordinates:
        # 1d coordinates
        coords.update(affine_to_coords(riods.transform, riods.width, riods.height))
    elif parse_coordinates:
        # 2d coordinates
        warnings.warn(
            "The file coordinates' transformation isn't "
            "rectilinear: xarray won't parse the coordinates "
            "in this case. Set `parse_coordinates=False` to "
            "suppress this warning.",
            RuntimeWarning,
            stacklevel=3,
        )

    # Attributes
    attrs = _parse_tags(riods.tags(1))
    encoding = dict()
    # Affine transformation matrix (always available)
    # This describes coefficients mapping pixel coordinates to CRS
    # For serialization store as tuple of 6 floats, the last row being
    # always (0, 0, 1) per definition (see
    # https://github.com/sgillies/affine)
    attrs["transform"] = tuple(transform)[:6]

    if hasattr(riods, "nodata") and riods.nodata is not None:
        # The nodata values for the raster bands
        if masked:
            encoding["_FillValue"] = riods.nodata
        else:
            attrs["_FillValue"] = riods.nodata
    if hasattr(riods, "scales"):
        # The scale values for the raster bands
        attrs["scales"] = riods.scales
    if hasattr(riods, "offsets"):
        # The offset values for the raster bands
        attrs["offsets"] = riods.offsets
    if hasattr(riods, "descriptions") and any(riods.descriptions):
        # Descriptions for each dataset band
        attrs["descriptions"] = riods.descriptions
    if hasattr(riods, "units") and any(riods.units):
        # A list of units string for each dataset band
        attrs["units"] = riods.units

    # Parse extra metadata from tags, if supported
    parsers = {"ENVI": _parse_envi}

    driver = riods.driver
    if driver in parsers:
        meta = parsers[driver](riods.tags(ns=driver))

        for k, v in meta.items():
            # Add values as coordinates if they match the band count,
            # as attributes otherwise
            if isinstance(v, (list, np.ndarray)) and len(v) == riods.count:
                coords[k] = ("band", np.asarray(v))
            else:
                attrs[k] = v

    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(manager, lock, vrt_params, masked=masked)
    )

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    da_name = attrs.pop("NETCDF_VARNAME", None)
    result = DataArray(
        data=data, dims=("band", "y", "x"), coords=coords,
        attrs=attrs, name=da_name
    )
    result.encoding = encoding

    if hasattr(riods, "crs") and riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)

    if chunks is not None:
        from dask.base import tokenize

        # augment the token with the file modification time
        try:
            mtime = os.path.getmtime(filename)
        except OSError:
            # the filename is probably an s3 bucket rather than a regular file
            mtime = None

        if chunks in (True, "auto"):
            import dask
            from dask.array.core import normalize_chunks

            if LooseVersion(dask.__version__) < LooseVersion("0.18.0"):
                msg = (
                    "Automatic chunking requires dask.__version__ >= 0.18.0 . "
                    "You currently have version %s" % dask.__version__
                )
                raise NotImplementedError(msg)
            block_shape = (1,) + riods.block_shapes[0]
            chunks = normalize_chunks(
                chunks=(1, "auto", "auto"),
                shape=(riods.count, riods.height, riods.width),
                dtype=riods.dtypes[0],
                previous_chunks=tuple((c,) for c in block_shape),
            )
        token = tokenize(filename, mtime, chunks)
        name_prefix = "open_rasterio-%s" % token
        result = result.chunk(chunks, name_prefix=name_prefix, token=token)

    # Make the file closeable
    result._file_obj = manager

    return result
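# Hypothetical usage of ``open_rasterio`` above (the path is made up):
#
#     rioda = open_rasterio("example.tif", chunks="auto", masked=True)
#     rioda.isel(band=0).mean().compute()
#
# With ``chunks="auto"`` the normalization at the end of the function keeps
# one band per chunk and sizes y/x from dask's ``array.chunk-size`` config,
# preferring multiples of the file's internal block shape.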
def choice(self, a, size=None, replace=True, p=None, chunks="auto"):
    dependencies = []
    # Normalize and validate `a`
    if isinstance(a, Integral):
        # On windows the output dtype differs if p is provided or
        # absent, see https://github.com/numpy/numpy/issues/9867
        dummy_p = np.array([1]) if p is not None else p
        dtype = np.random.choice(1, size=(), p=dummy_p).dtype
        len_a = a
        if a < 0:
            raise ValueError("a must be greater than 0")
    else:
        a = asarray(a)
        a = a.rechunk(a.shape)
        dtype = a.dtype
        if a.ndim != 1:
            raise ValueError("a must be one dimensional")
        len_a = len(a)
        dependencies.append(a)
        a = a.__dask_keys__()[0]

    # Normalize and validate `p`
    if p is not None:
        if not isinstance(p, Array):
            # If p is not a dask array, first check the sum is close
            # to 1 before converting.
            p = np.asarray(p)
            if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                raise ValueError("probabilities do not sum to 1")
            p = asarray(p)
        else:
            p = p.rechunk(p.shape)

        if p.ndim != 1:
            raise ValueError("p must be one dimensional")
        if len(p) != len_a:
            raise ValueError("a and p must have the same size")

        dependencies.append(p)
        p = p.__dask_keys__()[0]

    if size is None:
        size = ()
    elif not isinstance(size, (tuple, list)):
        size = (size,)

    chunks = normalize_chunks(chunks, size, dtype=np.float64)
    if not replace and len(chunks[0]) > 1:
        err_msg = ("replace=False is not currently supported for "
                   "dask.array.choice with multi-chunk output "
                   "arrays")
        raise NotImplementedError(err_msg)
    sizes = list(product(*chunks))
    state_data = random_state_data(len(sizes), self._numpy_state)

    name = "da.random.choice-%s" % tokenize(state_data, size, chunks, a,
                                            replace, p)
    keys = product([name], *(range(len(bd)) for bd in chunks))
    dsk = {k: (_choice, state, a, size, replace, p)
           for k, state, size in zip(keys, state_data, sizes)}

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=dependencies)
    return Array(graph, name, chunks, dtype=dtype)
def groupby_agg(
    array: dask.array.Array,
    by: dask.array.Array,
    agg: Aggregation,
    expected_groups: Optional[Union[Sequence, np.ndarray]],
    axis: Sequence = None,
    split_out: int = 1,
    fill_value: Any = None,
) -> Tuple[dask.array.Array, Union[np.ndarray, dask.array.Array]]:

    # I think _tree_reduce expects this
    assert isinstance(axis, Sequence)
    assert all(ax >= 0 for ax in axis)

    inds = tuple(range(array.ndim))
    name = f"groupby_{agg.name}"
    token = dask.base.tokenize(array, by, agg, expected_groups, axis, split_out)

    # This is necessary for argreductions.
    # We need to rechunk before zipping up with the index
    # let's always do it anyway
    _, (array, by) = dask.array.unify_chunks(array, inds, by, inds[-by.ndim:])

    # preprocess the array
    if agg.preprocess:
        array = agg.preprocess(array, axis=axis)

    # apply reduction on chunk
    applied = dask.array.blockwise(
        partial(
            _get_chunk_reduction(agg.reduction_type),
            func=agg.chunk,  # type: ignore
            axis=axis,
            # with the current implementation we want reindexing at the blockwise step
            # only reindex to groups present at combine stage
            expected_groups=expected_groups if split_out > 1 else None,
            fill_value=agg.fill_value,
        ),
        inds,
        array,
        inds,
        by,
        inds[-by.ndim:],
        concatenate=False,
        dtype=array.dtype,
        meta=array._meta,
        align_arrays=False,
        token=f"{name}-chunk-{token}",
    )

    if split_out > 1:
        if expected_groups is None:
            # This could be implemented using the "hash_split" strategy
            # from dask.dataframe
            raise NotImplementedError

        chunk_tuples = tuple(
            itertools.product(*tuple(range(n) for n in applied.numblocks)))
        ngroups = len(expected_groups)
        group_chunks = normalize_chunks(np.ceil(ngroups / split_out),
                                        (ngroups,))[0]
        idx = tuple(np.cumsum((0,) + group_chunks))

        # split each block into `split_out` chunks
        dsk = {}
        split_name = f"{name}-split-{token}"
        for i in chunk_tuples:
            for j in range(split_out):
                dsk[(split_name, *i, j)] = (
                    _split_groups,
                    (applied.name, *i),
                    j,
                    slice(idx[j], idx[j + 1]),
                )

        # now construct an array that can be passed to _tree_reduce
        intergraph = HighLevelGraph.from_collections(split_name, dsk,
                                                     dependencies=(applied,))
        intermediate = dask.array.Array(
            intergraph,
            name=split_name,
            chunks=applied.chunks + ((1,) * split_out,),
            meta=array._meta,
        )
        expected_agg = None
    else:
        intermediate = applied
        group_chunks = ((len(expected_groups),)
                        if expected_groups is not None else (np.nan,))
        expected_agg = expected_groups

    # reduced is really a dict mapping reduction name to array
    # and "groups" to an array of group labels
    # Note: it does not make sense to interpret axis relative to
    # shape of intermediate results after the blockwise call
    reduced = dask.array.reductions._tree_reduce(
        intermediate,
        aggregate=partial(
            _npg_aggregate,
            agg=agg,
            expected_groups=expected_agg,
            group_ndim=by.ndim,
            fill_value=fill_value,
        ),
        combine=partial(_npg_combine, agg=agg, group_ndim=by.ndim),
        name=f"{name}-reduce",
        dtype=array.dtype,
        axis=axis,
        keepdims=True,
        concatenate=False,
    )

    output_chunks = reduced.chunks[:-(len(axis) + int(split_out > 1))] + (group_chunks,)

    def _getitem(d, key1, key2):
        return d[key1][key2]

    # extract results from the dict
    result: Dict = {}
    layer: Dict[Tuple, Tuple] = {}
    ochunks = tuple(range(len(chunks_v)) for chunks_v in output_chunks)
    if expected_groups is None:
        groups_name = f"groups-{name}-{token}"
        # we've used keepdims=True, so _tree_reduce preserves some dummy dimensions
        first_block = len(ochunks) * (0,)
        layer[(groups_name, *first_block)] = (
            operator.getitem,
            (reduced.name, *first_block),
            "groups",
        )
        groups = (dask.array.Array(
            HighLevelGraph.from_collections(groups_name, layer,
                                            dependencies=[reduced]),
            groups_name,
            chunks=(group_chunks,),
            dtype=by.dtype,
        ),)
    else:
        groups = (expected_groups,)

    layer: Dict[Tuple, Tuple] = {}  # type: ignore
    agg_name = f"{name}-{token}"
    for ochunk in itertools.product(*ochunks):
        inchunk = ochunk[:-1] + (0,) * (len(axis)) + (ochunk[-1],) * int(split_out > 1)
        layer[(agg_name, *ochunk)] = (
            operator.getitem,
            (reduced.name, *inchunk),
            agg.name,
        )
    result = dask.array.Array(
        HighLevelGraph.from_collections(agg_name, layer, dependencies=[reduced]),
        agg_name,
        chunks=output_chunks,
        dtype=agg.dtype if agg.dtype else array.dtype,
    )

    return (result, *groups)
def _wrap(self, func, *args, **kwargs):
    """ Wrap numpy random function to produce dask.array random function

    extra_chunks should be a chunks tuple to append to the end of chunks
    """
    size = kwargs.pop('size', None)
    chunks = kwargs.pop('chunks')
    extra_chunks = kwargs.pop('extra_chunks', ())

    if size is not None and not isinstance(size, (tuple, list)):
        size = (size,)

    args_shapes = {ar.shape for ar in args
                   if isinstance(ar, (Array, np.ndarray))}
    # NOTE: ``set.union`` returns a new set; the original discarded its
    # result, silently ignoring array-valued kwargs, so update in place
    args_shapes |= {ar.shape for ar in kwargs.values()
                    if isinstance(ar, (Array, np.ndarray))}

    shapes = list(args_shapes)
    if size is not None:
        shapes += [size]
    # broadcast to the final size(shape)
    size = broadcast_shapes(*shapes)
    chunks = normalize_chunks(chunks, size)
    slices = slices_from_chunks(chunks)

    def _broadcast_any(ar, shape, chunks):
        if isinstance(ar, Array):
            return broadcast_to(ar, shape).rechunk(chunks)
        if isinstance(ar, np.ndarray):
            return np.ascontiguousarray(np.broadcast_to(ar, shape))

    # Broadcast all arguments, get tiny versions as well
    # Start adding the relevant bits to the graph
    dsk = {}
    dsks = []
    lookup = {}
    small_args = []
    for i, ar in enumerate(args):
        if isinstance(ar, (np.ndarray, Array)):
            res = _broadcast_any(ar, size, chunks)
            if isinstance(res, Array):
                dsks.append(res.dask)
                lookup[i] = res.name
            elif isinstance(res, np.ndarray):
                name = 'array-{}'.format(tokenize(res))
                lookup[i] = name
                dsk[name] = res
            small_args.append(ar[tuple(0 for _ in ar.shape)])
        else:
            small_args.append(ar)

    small_kwargs = {}
    for key, ar in kwargs.items():
        if isinstance(ar, (np.ndarray, Array)):
            res = _broadcast_any(ar, size, chunks)
            if isinstance(res, Array):
                dsks.append(res.dask)
                lookup[key] = res.name
            elif isinstance(res, np.ndarray):
                name = 'array-{}'.format(tokenize(res))
                lookup[key] = name
                dsk[name] = res
            small_kwargs[key] = ar[tuple(0 for _ in ar.shape)]
        else:
            small_kwargs[key] = ar

    # Get dtype
    small_kwargs['size'] = (0,)
    dtype = func(xoroshiro128plus.RandomState(), *small_args,
                 **small_kwargs).dtype

    sizes = list(product(*chunks))
    state_data = random_state_data(len(sizes), self._numpy_state)
    token = tokenize(state_data, size, chunks, args, kwargs)
    name = 'da.random.{0}-{1}'.format(func.__name__, token)

    keys = product([name], *([range(len(bd)) for bd in chunks]
                             + [[0]] * len(extra_chunks)))
    blocks = product(*[range(len(bd)) for bd in chunks])
    vals = []
    for state, size, slc, block in zip(state_data, sizes, slices, blocks):
        arg = []
        for i, ar in enumerate(args):
            if i not in lookup:
                arg.append(ar)
            else:
                if isinstance(ar, Array):
                    arg.append((lookup[i],) + block)
                else:  # np.ndarray
                    arg.append((getitem, lookup[i], slc))
        kwrg = {}
        for k, ar in kwargs.items():
            if k not in lookup:
                kwrg[k] = ar
            else:
                if isinstance(ar, Array):
                    kwrg[k] = (lookup[k],) + block
                else:  # np.ndarray
                    kwrg[k] = (getitem, lookup[k], slc)
        vals.append((_apply_random, func.__name__, state, size, arg, kwrg))

    dsk.update(dict(zip(keys, vals)))

    dsk = sharedict.merge((name, dsk), *dsks)
    return Array(dsk, name, chunks + extra_chunks, dtype=dtype)
def choice(self, a, size=None, replace=True, p=None, chunks=None):
    dsks = []
    # Normalize and validate `a`
    if isinstance(a, Integral):
        # On windows the output dtype differs if p is provided or
        # absent, see https://github.com/numpy/numpy/issues/9867
        dummy_p = np.array([1]) if p is not None else p
        dtype = np.random.choice(1, size=(), p=dummy_p).dtype
        len_a = a
        if a < 0:
            raise ValueError("a must be greater than 0")
    else:
        # Convert before taking ``.shape``: the original one-liner
        # ``asarray(a).rechunk(a.shape)`` fails when ``a`` is a plain
        # sequence with no ``shape`` attribute
        a = asarray(a)
        a = a.rechunk(a.shape)
        dtype = a.dtype
        if a.ndim != 1:
            raise ValueError("a must be one dimensional")
        len_a = len(a)
        dsks.append(a.dask)
        a = a.__dask_keys__()[0]

    # Normalize and validate `p`
    if p is not None:
        if not isinstance(p, Array):
            # If p is not a dask array, first check the sum is close
            # to 1 before converting.
            p = np.asarray(p)
            if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                raise ValueError("probabilities do not sum to 1")
            p = asarray(p)
        else:
            p = p.rechunk(p.shape)

        if p.ndim != 1:
            raise ValueError("p must be one dimensional")
        if len(p) != len_a:
            raise ValueError("a and p must have the same size")

        dsks.append(p.dask)
        p = p.__dask_keys__()[0]

    if size is None:
        size = ()
    elif not isinstance(size, (tuple, list)):
        size = (size,)

    chunks = normalize_chunks(chunks, size)
    sizes = list(product(*chunks))
    state_data = random_state_data(len(sizes), self._numpy_state)

    name = 'da.random.choice-%s' % tokenize(state_data, size, chunks,
                                            a, replace, p)
    keys = product([name], *(range(len(bd)) for bd in chunks))
    dsk = {k: (_choice, state, a, size, replace, p)
           for k, state, size in zip(keys, state_data, sizes)}

    return Array(sharedict.merge((name, dsk), *dsks),
                 name, chunks, dtype=dtype)
def _wrap(self, funcname, *args, size=None, chunks="auto",
          extra_chunks=(), **kwargs):
    """Wrap numpy random function to produce dask.array random function

    extra_chunks should be a chunks tuple to append to the end of chunks
    """
    if size is not None and not isinstance(size, (tuple, list)):
        size = (size,)

    shapes = list({
        ar.shape
        for ar in chain(args, kwargs.values())
        if isinstance(ar, (Array, np.ndarray))
    })
    if size is not None:
        shapes.append(size)
    # broadcast to the final size(shape)
    size = broadcast_shapes(*shapes)
    chunks = normalize_chunks(
        chunks,
        size,  # ideally would use dtype here
        dtype=kwargs.get("dtype", np.float64),
    )
    slices = slices_from_chunks(chunks)

    def _broadcast_any(ar, shape, chunks):
        if isinstance(ar, Array):
            return broadcast_to(ar, shape).rechunk(chunks)
        if isinstance(ar, np.ndarray):
            return np.ascontiguousarray(np.broadcast_to(ar, shape))

    # Broadcast all arguments, get tiny versions as well
    # Start adding the relevant bits to the graph
    dsk = {}
    lookup = {}
    small_args = []
    dependencies = []
    for i, ar in enumerate(args):
        if isinstance(ar, (np.ndarray, Array)):
            res = _broadcast_any(ar, size, chunks)
            if isinstance(res, Array):
                dependencies.append(res)
                lookup[i] = res.name
            elif isinstance(res, np.ndarray):
                name = f"array-{tokenize(res)}"
                lookup[i] = name
                dsk[name] = res
            small_args.append(ar[tuple(0 for _ in ar.shape)])
        else:
            small_args.append(ar)

    small_kwargs = {}
    for key, ar in kwargs.items():
        if isinstance(ar, (np.ndarray, Array)):
            res = _broadcast_any(ar, size, chunks)
            if isinstance(res, Array):
                dependencies.append(res)
                lookup[key] = res.name
            elif isinstance(res, np.ndarray):
                name = f"array-{tokenize(res)}"
                lookup[key] = name
                dsk[name] = res
            small_kwargs[key] = ar[tuple(0 for _ in ar.shape)]
        else:
            small_kwargs[key] = ar

    sizes = list(product(*chunks))
    seeds = random_state_data(len(sizes), self._numpy_state)
    token = tokenize(seeds, size, chunks, args, kwargs)
    name = f"{funcname}-{token}"

    keys = product([name], *([range(len(bd)) for bd in chunks]
                             + [[0]] * len(extra_chunks)))
    blocks = product(*[range(len(bd)) for bd in chunks])

    vals = []
    for seed, size, slc, block in zip(seeds, sizes, slices, blocks):
        arg = []
        for i, ar in enumerate(args):
            if i not in lookup:
                arg.append(ar)
            else:
                if isinstance(ar, Array):
                    arg.append((lookup[i],) + block)
                else:  # np.ndarray
                    arg.append((getitem, lookup[i], slc))
        kwrg = {}
        for k, ar in kwargs.items():
            if k not in lookup:
                kwrg[k] = ar
            else:
                if isinstance(ar, Array):
                    kwrg[k] = (lookup[k],) + block
                else:  # np.ndarray
                    kwrg[k] = (getitem, lookup[k], slc)
        vals.append((_apply_random, self._RandomState, funcname,
                     seed, size, arg, kwrg))

    meta = _apply_random(
        self._RandomState,
        funcname,
        seed,
        (0,) * len(size),
        small_args,
        small_kwargs,
    )

    dsk.update(dict(zip(keys, vals)))

    graph = HighLevelGraph.from_collections(name, dsk,
                                            dependencies=dependencies)
    return Array(graph, name, chunks + extra_chunks, meta=meta)
def chunks(self):
    return normalize_chunks(CHUNKS, shape=self.shape)
def _copula_impl(
    cov: Union[List[List[float]], np.ndarray],
    df: Union[None, int, List[int], np.ndarray],
    samples: int,
    seed: int,
    chunks: Chunks2D,
    rng: str,
) -> Union[np.ndarray, da.Array]:
    """Implementation of gaussian_copula and t_copula
    """
    samples = int(samples)
    if samples <= 0:
        raise ValueError("Number of samples must be positive")
    cov = np.asarray(cov)
    if cov.ndim != 2 or cov.shape[0] != cov.shape[1]:
        raise ValueError("cov must be a square matrix")
    dimensions = cov.shape[0]

    L = np.linalg.cholesky(cov)
    if chunks is not None:
        chunks = cast(NormalizedChunks2D,
                      normalize_chunks(chunks, shape=(samples, dimensions)))
        L = da.from_array(L, chunks=(chunks[1], chunks[1]))

    rng = rng.lower()
    if rng == "mersenne twister":
        # When pulling samples from the Mersenne Twister generator, we have
        # the samples on the rows. This guarantees that if we draw more
        # samples, the original samples won't change.
        rnd_state_y = duck.RandomState(seed)
        y = rnd_state_y.standard_normal(size=(samples, dimensions),
                                        chunks=chunks)
    elif rng == "sobol":
        y = sobol(size=(samples, dimensions), d0=seed, chunks=chunks)
        y = duck.norm_ppf(y)
    else:
        raise ValueError(f"Unknown rng: {rng}")

    p = (L @ y.T).T

    # Gaussian Copula
    if df is None:
        return p

    # Pre-process df into a 1D numpy/dask array
    df = np.asarray(df)
    if (df <= 0).any():
        raise ValueError("df must always be greater than zero")
    if df.shape not in ((), (dimensions,)):
        raise ValueError("df must be either a scalar or a 1D vector with as "
                         "many points as the width of the correlation matrix")
    if df.ndim == 1 and chunks is not None:
        df = da.from_array(df, chunks=(chunks[1],))

    # Define chunks for the S chi-square matrix
    chunks_r = (chunks[0], (1,)) if chunks else None

    if rng == "mersenne twister":
        # Use two separate random states for the normal and the chi2
        # distributions. This is NOT the same as just extracting two series
        # from the same RandomState, as we must guarantee that, if you extract
        # a different number of samples from the generator, the initial
        # samples must remain the same.
        # Don't just do seed + 1 as that would have unwanted repercussions
        # when one tries to extract different series from different seeds.
        seed_r = (seed + 190823761298456) % 2**32
        rnd_state_r = duck.RandomState(seed_r)
        r = rnd_state_r.uniform(size=(samples, 1), chunks=chunks_r)
    elif rng == "sobol":
        seed_r = seed + dimensions
        r = sobol(size=(samples, 1), d0=seed_r, chunks=chunks_r)
    else:
        assert False  # pragma: nocover

    s = duck.chi2_ppf(r, df)
    z = duck.sqrt(df / s) * p
    # Convert t distribution to normal (0, 1)
    u = duck.t_cdf(z, df)
    t = duck.norm_ppf(u)
    return t
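# Hypothetical usage of ``_copula_impl`` above through the public wrappers
# named in its docstring (wrapper signatures assumed for illustration):
#
#     cov = [[1.0, 0.5], [0.5, 1.0]]
#     p = gaussian_copula(cov, samples=2**15 - 1, seed=0,
#                         chunks=(8192, 1), rng="Sobol")   # df=None path
#     t = t_copula(cov, df=5, samples=2**15 - 1, seed=0,
#                  chunks=(8192, 1), rng="Sobol")          # Student-t path
#
# ``chunks=(8192, 1)`` normalizes to ((8192, 8192, 8192, 8191), (1, 1));
# per the sobol docstring above, multiple chunks on axis 0 are correct but
# not efficient for the Sobol generator.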