def test_from_dask_array_unknown_chunks():
    # Series
    dx = da.Array(
        {("x", 0): np.arange(5), ("x", 1): np.arange(5, 11)},
        "x",
        ((np.nan, np.nan),),
        np.arange(1).dtype,
    )
    df = dd.from_dask_array(dx)
    assert isinstance(df, dd.Series)
    assert not df.known_divisions
    assert_eq(df, pd.Series(np.arange(11)), check_index=False)

    # DataFrame
    dsk = {
        ("x", 0, 0): np.random.random((2, 3)),
        ("x", 1, 0): np.random.random((5, 3)),
    }
    dx = da.Array(dsk, "x", ((np.nan, np.nan), (3,)), np.float64)
    df = dd.from_dask_array(dx)
    assert isinstance(df, dd.DataFrame)
    assert not df.known_divisions
    assert_eq(df, pd.DataFrame(dx.compute()), check_index=False)

    # Unknown width
    dx = da.Array(dsk, "x", ((np.nan, np.nan), (np.nan,)), np.float64)
    with pytest.raises(ValueError):
        df = dd.from_dask_array(dx)
def test__dask_array_collections(s, a, b):
    import dask.array as da

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    x_dsk = {('x', i, j): np.random.random((3, 3)) for i in range(3)
             for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3)) for i in range(2)
             for j in range(3)}
    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [lambda x, y: x.T + y,
             lambda x, y: x.mean() + y.mean(),
             lambda x, y: x.dot(y).std(axis=0),
             lambda x, y: x - x.mean(axis=1)[:, None]]

    for expr in exprs:
        local = expr(x_local, y_local).compute(get=dask.get)
        remote, = e.compute(expr(x_remote, y_remote))
        remote = yield remote._result()
        assert np.all(local == remote)

    yield e._shutdown()
async def test_dask_array_collections(c, s, a, b):
    import dask.array as da

    s.validate = False
    x_dsk = {("x", i, j): np.random.random((3, 3)) for i in range(3) for j in range(2)}
    y_dsk = {("y", i, j): np.random.random((3, 3)) for i in range(2) for j in range(3)}
    x_futures = await c.scatter(x_dsk)
    y_futures = await c.scatter(y_dsk)
    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, "x", ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, "y", ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, "x", ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, "y", ((3, 3), (3, 3, 3)), dt)

    exprs = [
        lambda x, y: x.T + y,
        lambda x, y: x.mean() + y.mean(),
        lambda x, y: x.dot(y).std(axis=0),
        lambda x, y: x - x.mean(axis=1)[:, None],
    ]

    for expr in exprs:
        local = expr(x_local, y_local).compute(scheduler="sync")
        remote = c.compute(expr(x_remote, y_remote))
        remote = await remote
        assert np.all(local == remote)
def test__dask_array_collections(c, s, a, b):
    import dask.array as da

    x_dsk = {('x', i, j): np.random.random((3, 3)) for i in range(3)
             for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3)) for i in range(2)
             for j in range(3)}
    x_futures = yield c._scatter(x_dsk)
    y_futures = yield c._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [lambda x, y: x.T + y,
             lambda x, y: x.mean() + y.mean(),
             lambda x, y: x.dot(y).std(axis=0),
             lambda x, y: x - x.mean(axis=1)[:, None]]

    for expr in exprs:
        local = expr(x_local, y_local).compute(scheduler='sync')
        remote = c.compute(expr(x_remote, y_remote))
        remote = yield remote
        assert np.all(local == remote)
def __read_template_as_dask(dd, tcPerf):
    """
    Read template binary data and return as a dask array
    """
    t, y, x = dd.tdef.length(), dd.ydef.length(), dd.xdef.length()

    totalNum = sum([reduce(lambda x, y: x * y, (tcPerf[0], v.zcount, y, x))
                    for v in dd.vdef])
    # print(totalNum * 4.0 / 1024.0 / 1024.0)

    binData = []

    dtype = '<f4' if dd.byteOrder == 'little' else '>f4'

    for m, v in enumerate(dd.vdef):
        if totalNum > (200 * 100 * 100 * 100):  # about 800 MB, chunk 2D slice
            # print('large')
            chunk = (1, 1, y, x)
            shape = (t, v.zcount, y, x)

            dsk = {(v.name + '_@miniufo', l + sum(tcPerf[:m]), k, 0, 0):
                   (__read_var, f, v, dd.tRecLength, l, k, dtype)
                   for m, f in enumerate(dd.dsetPath[:len(tcPerf)])
                   for l in range(tcPerf[m])
                   for k in range(v.zcount)}

            binData.append(dsa.Array(dsk, v.name + '_@miniufo', chunk,
                                     dtype=dtype, shape=shape))
        else:  # in between, chunk 3D slice
            # print('between')
            chunk = (1, v.zcount, y, x)
            shape = (t, v.zcount, y, x)

            dsk = {(v.name + '_@miniufo', l + sum(tcPerf[:m]), 0, 0, 0):
                   (__read_var, f, v, dd.tRecLength, l, None, dtype)
                   for m, f in enumerate(dd.dsetPath[:len(tcPerf)])
                   for l in range(tcPerf[m])}

            binData.append(dsa.Array(dsk, v.name + '_@miniufo', chunk,
                                     dtype=dtype, shape=shape))

    return binData
def stream_reduction(time_index, antenna1, antenna2,
                     dde1_jones, source_coh, dde2_jones,
                     predict_check_tup, out_dtype, streams):
    """
    Reduces source coherencies + ddes over the source dimension
    in ``N`` parallel streams.

    This is accomplished by calling predict_vis on ddes and source
    coherencies to produce visibilities, which are passed into the
    ``base_vis`` argument of ``predict_vis`` for the next chunk.
    """
    # Unique name and token for this operation
    token = tokenize(time_index, antenna1, antenna2,
                     dde1_jones, source_coh, dde2_jones,
                     streams)
    name = 'stream-coherency-reduction-' + token

    # Number of dim blocks
    blocks = _extract_blocks(time_index, dde1_jones, source_coh, dde2_jones)
    (src_blocks, row_blocks, _, chan_blocks), corr_blocks = blocks[:4], blocks[4:]

    # Total number of other dimension blocks
    nblocks = reduce(mul, (row_blocks, chan_blocks) + corr_blocks, 1)

    # Create the compressed mapping
    layers = CoherencyStreamReduction(time_index, antenna1, antenna2,
                                      dde1_jones, source_coh, dde2_jones,
                                      name, streams)

    # Create the graph
    extra_deps = [a for a in (dde1_jones, source_coh, dde2_jones)
                  if a is not None]
    deps = [time_index, antenna1, antenna2] + extra_deps

    graph = HighLevelGraph.from_collections(name, layers, deps)
    chunks = ((1,) * src_blocks, (1,) * nblocks)

    # This should never be directly computed, reported chunks
    # and dtype don't match the actual data. We create it
    # because it makes chaining HighLevelGraphs easier
    stream_reduction = da.Array(graph, name, chunks, dtype=np.int8)

    name = "coherency-reduction-" + tokenize(stream_reduction)
    layers = CoherencyFinalReduction(name, layers)
    graph = HighLevelGraph.from_collections(name, layers, [stream_reduction])

    chunks = _extract_chunks(time_index, dde1_jones, source_coh, dde2_jones)
    return da.Array(graph, name, chunks[1:], dtype=out_dtype)
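# --- Added illustration (not codex-africanus code) ---
# A minimal, generic sketch of the pattern used above: build a plain task dict
# keyed like a dask array, wrap it in a HighLevelGraph, and hand it to
# da.Array with explicit chunks and dtype. All names below are illustrative.
import numpy as np
import dask.array as da
from dask.highlevelgraph import HighLevelGraph

demo_name = "demo-blocks"
demo_layer = {(demo_name, i): (np.full, (2,), float(i)) for i in range(3)}
demo_graph = HighLevelGraph.from_collections(demo_name, demo_layer, dependencies=())
demo_arr = da.Array(demo_graph, demo_name, chunks=((2, 2, 2),), dtype=np.float64)
assert (demo_arr.compute() == np.array([0.0, 0.0, 1.0, 1.0, 2.0, 2.0])).all()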
def inlined_array(a, inline_arrays=None):
    """ Flatten underlying graph """
    agraph = a.__dask_graph__()
    akeys = set(flatten(a.__dask_keys__()))

    # Inline everything except the output keys
    if inline_arrays is None:
        inline_keys = set(agraph.keys()) - akeys
        dsk2 = inline(agraph, keys=inline_keys, inline_constants=True)
        dsk3, _ = cull(dsk2, akeys)

        graph = HighLevelGraph.from_collections(a.name, dsk3, [])
        return da.Array(graph, a.name, a.chunks, dtype=a.dtype)

    # We're given specific arrays to inline, promote to list
    if isinstance(inline_arrays, da.Array):
        inline_arrays = [inline_arrays]
    elif isinstance(inline_arrays, tuple):
        inline_arrays = list(inline_arrays)

    if not isinstance(inline_arrays, list):
        raise TypeError("Invalid inline_arrays, must be "
                        "(None, list, tuple, dask.array.Array)")

    inline_names = set(a.name for a in inline_arrays)
    layers = agraph.layers.copy()
    deps = {k: v.copy() for k, v in agraph.dependencies.items()}
    # We want to inline layers that depend on the inlined arrays
    inline_layers = set(k for k, v in deps.items()
                        if len(inline_names.intersection(v)) > 0)

    for layer_name in inline_layers:
        dsk = dict(layers[layer_name])
        layer_keys = set(dsk.keys())
        inline_keys = set()

        for array in inline_arrays:
            dsk.update(layers[array.name])
            deps.pop(array.name, None)
            deps[layer_name].discard(array.name)
            inline_keys.update(layers[array.name].keys())

        dsk2 = inline(dsk, keys=inline_keys, inline_constants=True)
        layers[layer_name], _ = cull(dsk2, layer_keys)

    # Remove layers containing the inlined arrays
    for inline_name in inline_names:
        layers.pop(inline_name)

    return da.Array(HighLevelGraph(layers, deps), a.name, a.chunks, a.dtype)
def inlined_array(a, inline_arrays=None):
    """ Flatten underlying graph """
    agraph = a.__dask_graph__()
    akeys = set(flatten(a.__dask_keys__()))

    # Inline everything except the output keys
    if inline_arrays is None:
        inline_keys = set(agraph.keys()) - akeys
        dsk2 = inline(agraph, keys=inline_keys, inline_constants=True)
        dsk3, _ = cull(dsk2, akeys)

        graph = HighLevelGraph.from_collections(a.name, dsk3, [])
        return da.Array(graph, a.name, a.chunks, dtype=a.dtype)

    # We're given specific arrays to inline, promote to list
    if isinstance(inline_arrays, da.Array):
        inline_arrays = [inline_arrays]
    elif isinstance(inline_arrays, tuple):
        inline_arrays = list(inline_arrays)

    if not isinstance(inline_arrays, list):
        raise TypeError("Invalid inline_arrays, must be "
                        "(None, list, tuple, dask.array.Array)")

    layers = agraph.layers.copy()
    deps = agraph.dependencies.copy()
    inline_keys = set()
    dsk = dict(layers[a.name])

    # Inline specified arrays
    for array in inline_arrays:
        # Remove array from layers and dependencies
        try:
            dsk.update(layers.pop(array.name))
            del deps[array.name]
        except KeyError:
            raise ValueError("%s is not a valid dependency of a"
                             % array.name)

        # Record keys to inline
        inline_keys.update(flatten(array.__dask_keys__()))

    dsk2 = inline(dsk, keys=inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, akeys)

    layers[a.name] = dsk3
    graph = HighLevelGraph(layers, deps)

    return da.Array(graph, a.name, a.chunks, a.dtype)
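# --- Added illustration ---
# Self-contained sketch of the inline/cull combination the two `inlined_array`
# variants above are built on. `inline` and `cull` live in dask.optimization;
# the task dict here is made up for the example.
from operator import add, mul
from dask.optimization import cull, inline

demo_dsk = {"x": 1, "y": (add, "x", 1), "z": (mul, "y", 2)}

# Fold the definition of "y" (and the constant "x") into the tasks that use it,
demo_inlined = inline(demo_dsk, keys=["y"], inline_constants=True)
# then drop anything the output key "z" no longer needs.
demo_culled, _ = cull(demo_inlined, ["z"])
assert demo_culled == {"z": (mul, (add, 1, 1), 2)}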
def test_write_dict_data(tmp_path, chunks, dtype):
    rs = np.random.RandomState(42)
    row_sum = 0

    def _vis_factory(chan, corr):
        # Variably sized-channels per row, as in BDA data
        nchan = rs.randint(chan)
        return (rs.normal(size=(1, nchan, corr)) +
                rs.normal(size=(1, nchan, corr))*1j)

    shapes = {k: sum(c) for k, c in chunks.items()}
    row_sum += shapes['row']

    # assert len(chunks['chan']) == 1
    assert len(chunks['corr']) == 1

    # Make some visibilities
    dims = ("row", "chan", "corr")
    row, chan, corr = (shapes[d] for d in dims)
    name = "vis-data-" + uuid.uuid4().hex
    nchunks = (len(chunks[d]) for d in dims)
    keys = product((name,), *(range(c) for c in nchunks))
    chunk_sizes = product(*(chunks[d] for d in dims))

    layer = {k: {'r%d' % (i + 1): _vis_factory(chan, corr)
                 for i in range(r)}
             for k, (r, _, _) in zip(keys, chunk_sizes)}

    hlg = HighLevelGraph.from_collections(name, layer, [])
    chunks = tuple(chunks[d] for d in dims)
    meta = np.empty((0,)*len(chunks), dtype=np.complex128)
    vis = da.Array(hlg, name, chunks, meta=meta)

    ds = Dataset({"DATA": (dims, vis)})
    table_name = os.path.join(str(tmp_path), 'test.table')

    writes, table_proxy = write_datasets(table_name, ds, ["DATA"],
                                         table_proxy=True,
                                         # No fixed shape columns
                                         descriptor="ms(False)")

    dask.compute(writes)

    data = table_proxy.getvarcol("DATA").result()

    # First row chunk
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r1'], data['r1'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r2'], data['r2'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r3'], data['r3'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r4'], data['r4'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r5'], data['r5'])

    # Second row chunk
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r1'], data['r6'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r2'], data['r7'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r3'], data['r8'])

    # Third row chunk
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r1'], data['r9'])
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r2'], data['r10'])
def _da_from_mem(
    token: Delayed,
    shape: ShapeLike,
    dtype: DtypeLike,
    chunks: Tuple[int, ...],
    name: str = "from_mem",
) -> da.Array:
    """
    Construct a dask view of an array that is yet to be computed into an
    in-RAM store.

    :param token: Should evaluate to either a Token or a string key into the
       Cache, which is expected to contain a ``numpy`` array of the supplied
       ``shape`` and ``dtype``

    :param shape: Expected shape of the future array

    :param dtype: Expected dtype of the future array

    :param chunks: Tuple of integers describing chunk partitioning for the
       output array

    :param name: Dask name

    Gotchas
    =======

    - The output array cannot be moved from one worker to another.
      - Works with an in-process Client
      - Works with a single-worker cluster
      - Can work if the scheduler is told to schedule this on a single worker

    - Cache life-cycle management can be tough. If token evaluates to a
      ``Token`` object then automatic cache cleanup should happen when the
      output array is destroyed. If it is just a string, then it's up to the
      caller to ensure that there is cleanup and no use after free.

    Returns
    =======
    Dask Array
    """
    if not isinstance(shape, tuple):
        shape = (shape,)

    assert dask.is_dask_collection(token)
    assert len(shape) == len(chunks)

    _chunks = unpack_chunks(chunks, shape)
    _rois = [tuple(_roi_from_chunks(ch)) for ch in _chunks]
    _roi = lambda idx: tuple(_rois[i][k] for i, k in enumerate(idx))

    shape_in_chunks = tuple(len(ch) for ch in _chunks)

    dsk = {}
    name = randomize(name)

    for idx in np.ndindex(shape_in_chunks):
        dsk[(name, *idx)] = (_chunk_extractor, token.key, _roi(idx))

    dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[token])
    return da.Array(dsk, name, shape=shape, dtype=dtype, chunks=_chunks)
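# --- Added illustration (independent of the odc helpers above) ---
# How a per-dimension chunk specification maps each block index to the slices
# (ROI) it covers -- the job done by `unpack_chunks`/`_roi_from_chunks` and the
# `np.ndindex(shape_in_chunks)` loop in `_da_from_mem`.
import numpy as np

demo_chunks = ((3, 3, 2), (4, 4))  # an 8x8 array split into 3x2 blocks
demo_offsets = [np.cumsum((0,) + ch) for ch in demo_chunks]
demo_shape_in_chunks = tuple(len(ch) for ch in demo_chunks)

demo_rois = {}
for idx in np.ndindex(demo_shape_in_chunks):
    demo_rois[idx] = tuple(slice(int(demo_offsets[d][i]), int(demo_offsets[d][i + 1]))
                           for d, i in enumerate(idx))

assert demo_rois[(0, 1)] == (slice(0, 3), slice(4, 8))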
def test_gh_4176():
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        from dask.sharedict import ShareDict

    def foo(A):
        return A[None, ...]

    A = da.ones(shape=(10, 20, 4), chunks=(2, 5, 4))

    name = 'D'

    dsk = blockwise(foo, name, ("nsrc", "ntime", "nbl", "npol"),
                    A.name, ("ntime", "nbl", "npol"),
                    new_axes={"nsrc": 1},
                    numblocks={a.name: a.numblocks for a in (A,)})

    array_dsk = ShareDict()
    array_dsk.update(dsk)
    array_dsk.update(A.__dask_graph__())

    chunks = ((1,),) + A.chunks

    D = da.Array(array_dsk, name, chunks, dtype=A.dtype)
    D.sum(axis=0).compute()
def get_dask_array(self, array_name, chunks, dtype, offset=()):
    """Get dask array from the store.

    Any missing chunks are replaced with zeros, suppressing any
    :exc:`ChunkNotFound` errors.

    Parameters
    ----------
    array_name : string
        Identifier of array in chunk store
    chunks : tuple of tuples of ints
        Chunk specification
    dtype : :class:`numpy.dtype` object or equivalent
        Data type of array
    offset : tuple of int, optional
        Offset to add to each dimension when addressing chunks in store

    Returns
    -------
    array : :class:`dask.array.Array` object
        Dask array of given dtype
    """
    getter = functools.partial(self.get_chunk_or_zeros, dtype=dtype)
    if offset:
        getter = _add_offset_to_slices(getter, offset)
    # Use dask utility function that forms the core of da.from_array
    dask_graph = da.core.getem(array_name, chunks, getter)
    return da.Array(dask_graph, array_name, chunks, dtype)
def build_array(bag, n_features, meta):
    name = "from-bag-" + bag.name
    layer = {(name, i, 0): (k, i) for k, i in bag.__dask_keys__()}
    dsk = dask.highlevelgraph.HighLevelGraph.from_collections(
        name, layer, dependencies=[bag])
    chunks = ((np.nan,) * bag.npartitions, (n_features,))
    return da.Array(dsk, name, chunks, meta=meta)
def _create_window_dask(name, ntime, nchan, nbl, ncorr, token,
                        dtype, default=0, backend="numpy", path=None):
    if backend == "zarr-disk" and path is None:
        path = mkdtemp(prefix='-'.join(('tricolour', name, 'windows', '')))

    # Include name and token in new token
    token = dask.base.tokenize(name, ntime, nchan, nbl, ncorr, token,
                               dtype, default, backend, path)

    collection_name = '-'.join(("create", name, "windows", token))
    layers = {(collection_name, 0): (_create_window, name,
                                     ntime, nchan, nbl, ncorr,
                                     dtype, default, token,
                                     backend, path)}

    graph = HighLevelGraph.from_collections(collection_name, layers, ())
    chunks = ((0,),)  # One chunk containing single zarr array object
    return da.Array(graph, collection_name, chunks, dtype=np.object)
def chan_metadata(row_chan_arrays, chan_arrays, chan_bin_size):
    """ Create a dask array with channel metadata for each channel chunk """
    chan_chunks = None

    for array in row_chan_arrays:
        if array is not None:
            chan_chunks = array.chunks[1]
            break

    if chan_chunks is None:
        for array in chan_arrays:
            if array is not None:
                chan_chunks = array.chunks[0]
                break

    if chan_chunks is None:
        return None

    # Create a dask channel mapping structure
    name = "channel-mapper-" + tokenize(chan_chunks, chan_bin_size)
    layers = {(name, i): (np_channel_mapper, c, chan_bin_size)
              for i, c in enumerate(chan_chunks)}
    graph = HighLevelGraph.from_collections(name, layers, ())
    chunks = (chan_chunks,)
    chan_mapper = da.Array(graph, name, chunks, dtype=np.object)

    return chan_mapper
def download_dask_array(self, object_name, dask_name='array'):
    """Downloads a split matrix as a ``dask.array.Array`` object

    This uses the stored object metadata to reconstruct the full
    n-dimensional array uploaded using ``upload_dask_array``.

    Examples
    --------
    >>> s3_response = cci.upload_dask_array('test_dim', arr, axis=-1)
    >>> dask_object = cci.download_dask_array('test_dim')
    >>> dask_object
    dask.array<array, shape=(100, 600, 1000), dtype=float64, chunksize=(100, 600, 100)>
    >>> dask_slice = dask_object[..., :200]
    >>> dask_slice
    dask.array<getitem..., shape=(100, 600, 200), dtype=float64, chunksize=(100, 600, 100)>
    >>> downloaded_data = np.asarray(dask_slice)  # this downloads the array
    >>> downloaded_data.shape
    (100, 600, 200)
    """
    from dask import array as da

    metadata = self.download_json(self.pathjoin(object_name, 'metadata.json'))
    chunks = metadata['chunks']
    shape = metadata['shape']
    dtype = np.dtype(metadata['dtype'])

    dask = {(dask_name,) + tuple(shape): (self.download_raw_array, part_name)
            for shape, part_name in metadata['dask']}

    return da.Array(dask, dask_name, chunks, shape=shape, dtype=dtype)
def wrapped(shape, *args, **kwargs):
    if isinstance(shape, collections.abc.Iterable):
        shape = tuple(int(s) for s in shape)
    else:
        shape = (int(shape),)

    # Estimate 100 Mi elements per block
    blocksize = int((100 * (2**20))**(1 / len(shape)))

    chunks = []
    for l in shape:
        chunks.append([])
        while l > 0:
            s = max(min(blocksize, l), 0)
            chunks[-1].append(s)
            l -= s

    name = func.__name__ + "-" + hex(random.randrange(2**64))

    dsk = {}
    with set_backend(self._inner):
        for chunk_id in itertools.product(
                *map(lambda x: range(len(x)), chunks)):
            shape = tuple(chunks[i][j] for i, j in enumerate(chunk_id))
            dsk[(name,) + chunk_id] = func(shape, *args, **kwargs)

        meta = func(tuple(0 for _ in shape), *args, **kwargs)

    dtype = str(meta.dtype)

    return da.Array(dsk, name, chunks, dtype=dtype, meta=meta)
def cascaded_compute(callback, arrays, optimize=True):
    """Dask helper function for iterating over computed dask arrays.

    Args:
        callback (callable): Called with a single numpy array computed from
            the provided dask arrays.
        arrays (list, tuple): Dask arrays to pass to callback.
        optimize (bool): Whether to try to optimize the dask graphs of the
            provided arrays.

    Returns:
        `dask.Delayed` object to be computed

    """
    if optimize:
        # optimize Dask graph over all objects
        dsk = da.Array.__dask_optimize__(
            # combine all Dask Array graphs
            dask.sharedict.merge(*[e.__dask_graph__() for e in arrays]),
            # get Dask Array keys in result
            list(dask.core.flatten([e.__dask_keys__() for e in arrays]))
        )
        # rebuild Dask Arrays
        arrays = [da.Array(dsk, e.name, e.chunks, e.dtype) for e in arrays]

    def _callback_wrapper(arr, cb=callback, previous_call=None):
        del previous_call  # used only for task ordering
        return cb(arr)

    current_write = None
    for dask_arr in arrays:
        current_write = dask.delayed(_callback_wrapper)(
            dask_arr, previous_call=current_write)
    return current_write
def read_band_blocks(self, blocksize=CHUNK_SIZE):
    """Read the band in native blocks."""
    # For sentinel 1 data, the blocks are 1 line, and dask seems to choke
    # on that.
    band = self.filehandle

    shape = band.shape
    token = tokenize(blocksize, band)
    name = 'read_band-' + token
    dskx = dict()
    if len(band.block_shapes) != 1:
        raise NotImplementedError('Bands with multiple shapes not supported.')
    else:
        chunks = band.block_shapes[0]

    def do_read(the_band, the_window, the_lock):
        with the_lock:
            return the_band.read(1, None, window=the_window)

    for ji, window in band.block_windows(1):
        dskx[(name, ) + ji] = (do_read, band, window, self.read_lock)

    res = da.Array(dskx, name, shape=list(shape),
                   chunks=chunks,
                   dtype=band.dtypes[0])
    return DataArray(res, dims=('y', 'x'))
def write_blocks(source, target, region: Optional[Tuple[slice, ...]]) -> da.Array:
    """
    Return a dask array in which each chunk contains the result of writing
    each chunk of `source` to `target`.
    """
    slices = slices_from_chunks(source.chunks)
    if region:
        slices = [fuse_slice(region, slc) for slc in slices]

    source_name = 'store-source-' + tokenize(source)
    store_name = 'store-' + tokenize(source)

    layers = {source_name: source.__dask_graph__()}
    deps = {source_name: set()}

    dsk = {}
    chunks = tuple((1,) * s for s in source.blocks.shape)

    for slice, key in zip(slices, flatten(source.__dask_keys__())):
        dsk[(store_name,) + key[1:]] = (ndwrapper,
                                        store_chunk,
                                        source.ndim,
                                        key,
                                        target,
                                        slice)

    layers[store_name] = dsk
    deps[store_name] = {source_name}
    store_dsk = HighLevelGraph(layers, deps)

    return da.Array(store_dsk,
                    store_name,
                    shape=source.blocks.shape,
                    chunks=chunks,
                    dtype=int)
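# --- Added illustration ---
# Quick look at slices_from_chunks (a long-standing helper in dask.array.core),
# which `write_blocks` above uses to pair each source block with the region of
# the target it should be written to.
from dask.array.core import slices_from_chunks

assert slices_from_chunks(((2, 2), (3,))) == [
    (slice(0, 2), slice(0, 3)),
    (slice(2, 4), slice(0, 3)),
]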
def concatenate_row_chunks(array, group_every=1000):
    """
    When averaging, the output arrays are substantially smaller, which
    can affect disk I/O since many small operations are submitted.
    This operation concatenates row chunks together so that more rows
    are submitted at once
    """

    # Single chunk already
    if len(array.chunks[0]) == 1:
        return array

    data = partial_reduce(np.concatenate,
                          array,
                          split_every={0: group_every},
                          reduced_meta=None,
                          keepdims=True)

    # NOTE(sjperkins)
    # partial_reduce sets the number of rows in each chunk
    # to 1, which is untrue. Correctly set the row chunks to nan,
    # steal the graph and recreate the array
    row_chunks = tuple(np.nan for _ in data.chunks[0])
    chunks = (row_chunks,) + data.chunks[1:]
    graph = data.__dask_graph__()

    return da.Array(graph, data.name, chunks, dtype=data.dtype)
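# --- Added illustration ---
# Stand-alone demo of the "steal the graph, relabel the chunks" trick described
# in the NOTE above: republish an existing array with unknown (NaN) row chunks
# while leaving its graph untouched.
import numpy as np
import dask.array as da

demo_x = da.ones((4, 3), chunks=(2, 3))
demo_unknown_rows = (tuple(np.nan for _ in demo_x.chunks[0]),) + demo_x.chunks[1:]
demo_y = da.Array(demo_x.__dask_graph__(), demo_x.name, demo_unknown_rows,
                  dtype=demo_x.dtype)
assert np.isnan(demo_y.shape[0]) and demo_y.compute().shape == (4, 3)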
def _imread(filenames, imread=None, preprocess=None):
    """ modified dask imread method, accepts list of file names
    instead of glob string """

    def add_leading_dimension(x):
        return x[None, ...]

    imread = imread or sk_imread
    name = 'imread-%s' % tokenize(filenames, map(os.path.getmtime, filenames))

    sample = imread(filenames[0])
    if preprocess:
        sample = preprocess(sample)

    keys = [(name, i) + (0,) * len(sample.shape)
            for i in range(len(filenames))]
    if preprocess:
        values = [(add_leading_dimension, (preprocess, (imread, fn)))
                  for fn in filenames]
    else:
        values = [(add_leading_dimension, (imread, fn))
                  for fn in filenames]
    dsk = dict(zip(keys, values))

    chunks = ((1,) * len(filenames),) + tuple((d,) for d in sample.shape)

    return da.Array(dsk, name, chunks, sample.dtype)
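# --- Added usage sketch ---
# Hedged example of calling `_imread` above without any image files on disk: a
# stand-in `imread` callable is injected, and the helper's own imports
# (da, tokenize, os, sk_imread) are assumed to be in scope.
import numpy as np

demo_files = ['a.png', 'b.png', 'c.png']

def demo_reader(fn):
    # Each "file" becomes a 4x5 image filled with its position in the list
    return np.full((4, 5), demo_files.index(fn), dtype=np.uint8)

demo_stack = _imread(demo_files, imread=demo_reader)
assert demo_stack.shape == (3, 4, 5)
assert (demo_stack.compute()[1] == 1).all()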
def _run_fornav_single(self, data, out_chunks, target_geo_def, fill_value, **kwargs):
    ll2cr_result = self.cache['ll2cr_result']
    ll2cr_blocks = self.cache['ll2cr_blocks'].items()
    ll2cr_numblocks = ll2cr_result.shape if isinstance(
        ll2cr_result, np.ndarray) else ll2cr_result.numblocks
    fornav_task_name = f"fornav-{data.name}-{ll2cr_result.name}"
    maximum_weight_mode = kwargs.setdefault('maximum_weight_mode', False)
    weight_sum_min = kwargs.setdefault('weight_sum_min', -1.0)
    output_stack = self._generate_fornav_dask_tasks(out_chunks,
                                                    ll2cr_blocks,
                                                    fornav_task_name,
                                                    data.name,
                                                    target_geo_def,
                                                    fill_value,
                                                    kwargs)

    dsk_graph = HighLevelGraph.from_collections(fornav_task_name, output_stack,
                                                dependencies=[data, ll2cr_result])
    stack_chunks = ((1,) * (ll2cr_numblocks[0] * ll2cr_numblocks[1]),) + out_chunks
    out_stack = da.Array(dsk_graph, fornav_task_name, stack_chunks, data.dtype)
    combine_fornav_with_kwargs = partial(
        _combine_fornav, maximum_weight_mode=maximum_weight_mode)
    average_fornav_with_kwargs = partial(
        _average_fornav,
        maximum_weight_mode=maximum_weight_mode,
        weight_sum_min=weight_sum_min,
        dtype=data.dtype,
        fill_value=fill_value)
    out = da.reduction(out_stack, _chunk_callable, average_fornav_with_kwargs,
                       combine=combine_fornav_with_kwargs, axis=(0,),
                       dtype=data.dtype, concatenate=False)
    return out
def _dask_array(self, nfacet, varname, iters, klevels, k_chunksize):
    # return a dask array for a single facet
    facet_shape = _facet_shape(nfacet, self.nx)
    time_chunks = (len(iters) * (1,),) if iters is not None else ()
    k_chunks = (tuple([len(c) for c in _chunks(klevels, k_chunksize)]),)
    chunks = time_chunks + k_chunks + tuple([(s,) for s in facet_shape])

    # manually build dask graph
    dsk = {}
    token = tokenize(varname, self.store, nfacet)
    name = '-'.join([varname, token])

    # iters == None for grid variables
    if iters is not None:
        for n_iter, iternum in enumerate(iters):
            for n_k, these_klevels in enumerate(_chunks(klevels, k_chunksize)):
                key = name, n_iter, n_k, 0, 0, 0
                task = (_get_facet_chunk, self.store, varname, iternum,
                        nfacet, these_klevels, self.nx, self.nz, self.dtype,
                        self.mask_override)
                dsk[key] = task
    else:
        for n_k, these_klevels in enumerate(_chunks(klevels, k_chunksize)):
            key = name, n_k, 0, 0, 0
            task = (_get_facet_chunk, self.store, varname, None,
                    nfacet, these_klevels, self.nx, self.nz, self.dtype,
                    self.mask_override)
            dsk[key] = task

    return dsa.Array(dsk, name, chunks, self.dtype)
def _make_dask_array(sources, geobox, measurement,
                     skip_broken_datasets=False,
                     fuse_func=None,
                     dask_chunks=None):
    dsk_name = 'datacube_' + measurement['name']

    irr_chunks, grid_chunks = _calculate_chunk_sizes(sources, geobox, dask_chunks)
    sliced_irr_chunks = (1,) * sources.ndim

    dsk = {}
    geobox_subsets = _chunk_geobox(geobox, grid_chunks)

    for irr_index, datasets in numpy.ndenumerate(sources.values):
        for grid_index, subset_geobox in geobox_subsets.items():
            dsk[(dsk_name,) + irr_index + grid_index] = (fuse_lazy,
                                                         datasets, subset_geobox,
                                                         measurement,
                                                         skip_broken_datasets,
                                                         fuse_func, sources.ndim)

    data = da.Array(dsk, dsk_name,
                    chunks=(sliced_irr_chunks + grid_chunks),
                    dtype=measurement['dtype'],
                    shape=(sources.shape + geobox.shape))

    if irr_chunks != sliced_irr_chunks:
        data = data.rechunk(chunks=(irr_chunks + grid_chunks))
    return data
def _rechunk_2x2(xx, name="2x2"):
    """
    this is for testing only, ignore it, it's not robust
    """
    assert xx.ndim == 2
    name = randomize(name)
    ny, nx = (len(ch) // 2 for ch in xx.chunks[:2])

    dsk = {}
    chunks = _chunk_getter(xx)

    for r, c in np.ndindex((ny, nx)):
        r2 = r * 2
        c2 = c * 2
        ch_idx = np.s_[r2:r2 + 2, c2:c2 + 2]
        _xx = chunks(ch_idx)
        dsk[(name, r, c)] = (_stack_2d_np, (2, 2), *_xx)

    chy = tuple(xx.chunks[0][i * 2] + xx.chunks[0][i * 2 + 1] for i in range(ny))
    chx = tuple(xx.chunks[1][i * 2] + xx.chunks[1][i * 2 + 1] for i in range(nx))
    chunks = (chy, chx)

    dsk = HighLevelGraph.from_collections(name, dsk, dependencies=(xx,))
    return da.Array(dsk, name, chunks=chunks, dtype=xx.dtype, shape=xx.shape)
def _get_solar_flux_old(self, band):
    # TODO: this could be replaced with vectorized indexing in the future.
    from dask.base import tokenize
    blocksize = CHUNK_SIZE

    solar_flux = self.cal['solar_flux'].isel(bands=band).values
    d_index = self.cal['detector_index'].fillna(0).astype(int)

    shape = d_index.shape
    vchunks = range(0, shape[0], blocksize)
    hchunks = range(0, shape[1], blocksize)

    token = tokenize(band, d_index, solar_flux)
    name = 'solar_flux_' + token

    def get_items(array, slices):
        return solar_flux[d_index[slices].values]

    dsk = {(name, i, j): (get_items,
                          d_index,
                          (slice(vcs, min(vcs + blocksize, shape[0])),
                           slice(hcs, min(hcs + blocksize, shape[1]))))
           for i, vcs in enumerate(vchunks)
           for j, hcs in enumerate(hchunks)}

    res = da.Array(dsk, name, shape=shape,
                   chunks=(blocksize, blocksize),
                   dtype=solar_flux.dtype)
    return res
def _make_dask_array(sources, geobox, measurement,
                     skip_broken_datasets=False,
                     fuse_func=None,
                     dask_chunks=None):
    dsk_name = 'datacube_load_{name}-{token}'.format(name=measurement['name'],
                                                     token=uuid.uuid4().hex)

    irr_chunks, grid_chunks = _calculate_chunk_sizes(sources, geobox, dask_chunks)
    sliced_irr_chunks = (1,) * sources.ndim

    dsk = {}
    geobox_subsets = _chunk_geobox(geobox, grid_chunks)

    for irr_index, datasets in numpy.ndenumerate(sources.values):
        for dataset in datasets:
            ds_token = _tokenize_dataset(dataset)
            dsk[ds_token] = dataset

        for grid_index, subset_geobox in geobox_subsets.items():
            dataset_keys = [_tokenize_dataset(d) for d in
                            select_datasets_inside_polygon(datasets,
                                                           subset_geobox.extent)]
            dsk[(dsk_name,) + irr_index + grid_index] = (fuse_lazy,
                                                         dataset_keys, subset_geobox,
                                                         measurement,
                                                         skip_broken_datasets,
                                                         fuse_func, sources.ndim)

    data = da.Array(dsk, dsk_name,
                    chunks=(sliced_irr_chunks + grid_chunks),
                    dtype=measurement['dtype'],
                    shape=(sources.shape + geobox.shape))

    if irr_chunks != sliced_irr_chunks:
        data = data.rechunk(chunks=(irr_chunks + grid_chunks))
    return data
def interpolate_xarray(xpoints, ypoints, values, shape, kind='cubic',
                       blocksize=CHUNK_SIZE):
    """Interpolate, generating a dask array."""
    vchunks = range(0, shape[0], blocksize)
    hchunks = range(0, shape[1], blocksize)

    token = tokenize(blocksize, xpoints, ypoints, values, kind, shape)
    name = 'interpolate-' + token

    from scipy.interpolate import interp2d
    interpolator = interp2d(xpoints, ypoints, values, kind=kind)

    dskx = {(name, i, j): (interpolate_slice,
                           slice(vcs, min(vcs + blocksize, shape[0])),
                           slice(hcs, min(hcs + blocksize, shape[1])),
                           interpolator)
            for i, vcs in enumerate(vchunks)
            for j, hcs in enumerate(hchunks)}

    res = da.Array(dskx, name, shape=list(shape),
                   chunks=(blocksize, blocksize),
                   dtype=values.dtype)
    return DataArray(res, dims=('y', 'x'))
def as_daskarray(self):
    return da.Array(self.dask, self.name, self.chunks, self.dtype, self.shape)