def dask_hist2d(x: da.Array, y: da.Array, bins: int, range, density=False):
    if x.shape != y.shape:
        raise ValueError(
            f"Mismatch in argument shapes: x.shape == {x.shape}; y.shape == {y.shape}"
        )
    token = tokenize(x, y, bins, range, density)
    name = "histogram2d-sum-" + token
    x_keys = flatten(x.__dask_keys__())
    y_keys = flatten(y.__dask_keys__())
    dsk = {
        (name, i, 0, 0): (_block_fast_hist2d, xi, yi, bins, range)
        for i, (xi, yi) in enumerate(zip(x_keys, y_keys))
    }
    dtype = np.histogram2d([], [])[0].dtype
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=(x, y))
    # Turn the graph into a 3D array of shape (nchunks, nbins, nbins)
    nchunks = len(list(flatten(x.__dask_keys__())))
    chunks = ((1,) * nchunks, (bins,), (bins,))
    mapped = Array(graph, name, chunks, dtype=dtype)
    n = mapped.sum(axis=0)
    return n
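# A hedged usage sketch of the helper above, assuming dask and numpy are
# available and that `_block_fast_hist2d` (defined elsewhere) computes a
# per-chunk np.histogram2d with the given bins/range.
import dask.array as da

x = da.random.normal(size=100_000, chunks=10_000)
y = da.random.normal(size=100_000, chunks=10_000)

# Each chunk pair yields one (bins, bins) partial histogram; summing over
# the chunk axis produces the full 2D histogram.
counts = dask_hist2d(x, y, bins=64, range=((-4, 4), (-4, 4)))
result = counts.compute()  # ndarray of shape (64, 64)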
def persist(self, collections):
    """ Persist dask collections on cluster

    Starts computation of the collection on the cluster in the background.
    Provides a new dask collection that is semantically identical to the
    previous one, but now based off of futures currently in execution.

    Parameters
    ----------
    collections: sequence or single dask object
        Collections like dask.array or dataframe or dask.value objects

    Returns
    -------
    List of collections, or single collection, depending on type of input.

    Examples
    --------
    >>> xx = executor.persist(x)  # doctest: +SKIP
    >>> xx, yy = executor.persist([x, y])  # doctest: +SKIP

    See Also
    --------
    Executor.compute
    """
    if isinstance(collections, (tuple, list, set, frozenset)):
        singleton = False
    else:
        singleton = True
        collections = [collections]

    assert all(isinstance(c, Base) for c in collections)

    groups = groupby(lambda x: x._optimize, collections)
    dsk = merge([opt(merge([v.dask for v in val]),
                     [v._keys() for v in val])
                 for opt, val in groups.items()])

    d = {k: unpack_remotedata(v) for k, v in dsk.items()}
    dsk2 = {k: v[0] for k, v in d.items()}
    dependencies = {k: v[1] for k, v in d.items()}

    for k, v in dsk2.items():
        dependencies[k] |= set(_deps(dsk, v))

    names = list({k for c in collections for k in flatten(c._keys())})

    self._send_to_scheduler({'op': 'update-graph',
                             'tasks': valmap(dumps_task, dsk2),
                             'dependencies': dependencies,
                             'keys': names,
                             'client': self.id})
    result = [redict_collection(c, {k: Future(k, self)
                                    for k in flatten(c._keys())})
              for c in collections]
    if singleton:
        return first(result)
    else:
        return result
def test_inlined_array():
    A = da.ones((10, 10), chunks=(2, 2), dtype=np.float64)
    B = da.full((10, 10), np.float64(2), chunks=(2, 2))
    C = A + B
    E = C + 1

    D = inlined_array(C)
    assert len(C.__dask_graph__().layers) == 3
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten(D.__dask_keys__()))
    assert_array_equal(D, C)

    D = inlined_array(C, [A, B])
    assert len(D.__dask_graph__().layers) == 1
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten(D.__dask_keys__()))
    assert_array_equal(D, C)

    D = inlined_array(C, [A])
    assert len(D.__dask_graph__().layers) == 2
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, B]]))
    assert_array_equal(D, C)

    D = inlined_array(C, [B])
    assert len(D.__dask_graph__().layers) == 2
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, A]]))
    assert_array_equal(D, C)

    D = inlined_array(E, [A])
    assert len(D.__dask_graph__().layers) == 3
    assert D.name == E.name
    assert D.name in D.__dask_graph__().layers
    assert B.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert C.name in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, B, C]]))
    assert_array_equal(D, E)
def fit(self, columns: ColumnNames, ddf: dd.DataFrame):
    # User passed in a list of column groups. We need to figure out
    # if this list contains any multi-column groups, and if there
    # are any (obvious) problems with these groups
    columns_uniq = list(set(flatten(columns, container=tuple)))
    columns_all = list(flatten(columns, container=tuple))
    if sorted(columns_all) != sorted(columns_uniq) and self.encode_type == "joint":
        # If we are doing "joint" encoding, there must be a unique mapping
        # between input column names and column groups. Otherwise, more
        # than one unique-value table could be used to encode the same
        # column.
        raise ValueError("Same column name included in multiple groups.")

    for group in columns:
        if isinstance(group, tuple) and len(group) > 1:
            # For multi-column groups, we concatenate column names
            # to get the "group" name.
            name = _make_name(*group, sep=self.name_sep)
            for col in group:
                self.storage_name[col] = name

    # Check metadata type to reset on_host and cat_cache if the
    # underlying ddf is already a pandas-backed collection
    if isinstance(ddf._meta, pd.DataFrame):
        self.on_host = False
        # Cannot use "device" caching if the data is pandas-backed
        self.cat_cache = "host" if self.cat_cache == "device" else self.cat_cache
        if self.search_sorted:
            # Pandas' searchsorted only works with Series.
            # For now, it is safest to disallow this option.
            self.search_sorted = False
            warnings.warn("Cannot use `search_sorted=True` for pandas-backed data.")

    # Convert tuples to lists
    columns = [list(c) if isinstance(c, tuple) else c for c in columns]
    dsk, key = _category_stats(
        ddf,
        columns,
        [],
        [],
        self.out_path,
        self.freq_threshold,
        self.tree_width,
        self.on_host,
        concat_groups=self.encode_type == "joint",
        name_sep=self.name_sep,
        max_size=self.max_size,
        num_buckets=self.num_buckets,
    )
    # TODO: We can't check the dtypes on the ddf here since they are incorrect
    # for cudf's list type, so we're checking the partitions instead. Fix.
    return Delayed(key, dsk), ddf.map_partitions(lambda df: _is_list_dtype(df))
def __init__(
    self,
    cont_names=None,
    stats=None,
    columns=None,
    fold_groups=None,
    tree_width=None,
    out_path=None,
    on_host=True,
    freq_threshold=None,
    stat_name=None,
    concat_groups=False,
    name_sep="_",
    fold_name="__fold__",
    fold_seed=42,
    kfold=None,
):
    # Set column_groups if the user has passed in a list of columns
    self.column_groups = None
    if isinstance(columns, str):
        columns = [columns]
    if isinstance(columns, list):
        self.column_groups = columns
        columns = list(set(flatten(columns, container=list)))

    # Add fold_groups to columns
    if fold_groups and kfold > 1:
        fold_groups = [fold_groups] if isinstance(fold_groups, str) else fold_groups
        columns = columns or []
        self.column_groups = self.column_groups or []
        for col in list(set(flatten(fold_groups, container=list))):
            if col not in columns:
                columns.append(col)

    super(GroupbyStatistics, self).__init__(columns)
    self.cont_names = cont_names or []
    self.stats = stats or []
    self.categories = {}
    self.tree_width = tree_width or 8
    self.on_host = on_host
    self.freq_threshold = freq_threshold
    self.out_path = out_path or "./"
    self.stat_name = stat_name or "categories"
    self.op_name = "GroupbyStatistics-" + self.stat_name
    self.concat_groups = concat_groups
    self.name_sep = name_sep
    self.kfold = kfold or 3
    self.fold_name = fold_name
    self.fold_seed = fold_seed
    self.fold_groups = fold_groups
def inlined_array(a, inline_arrays=None):
    """ Flatten the underlying graph """
    agraph = a.__dask_graph__()
    akeys = set(flatten(a.__dask_keys__()))

    # Inline everything except the output keys
    if inline_arrays is None:
        inline_keys = set(agraph.keys()) - akeys
        dsk2 = inline(agraph, keys=inline_keys, inline_constants=True)
        dsk3, _ = cull(dsk2, akeys)

        graph = HighLevelGraph.from_collections(a.name, dsk3, [])
        return da.Array(graph, a.name, a.chunks, dtype=a.dtype)

    # We're given specific arrays to inline; promote to a list
    if isinstance(inline_arrays, da.Array):
        inline_arrays = [inline_arrays]
    elif isinstance(inline_arrays, tuple):
        inline_arrays = list(inline_arrays)

    if not isinstance(inline_arrays, list):
        raise TypeError("Invalid inline_arrays, must be "
                        "(None, list, tuple, dask.array.Array)")

    layers = agraph.layers.copy()
    deps = agraph.dependencies.copy()
    inline_keys = set()
    dsk = dict(layers[a.name])

    # Inline specified arrays
    for array in inline_arrays:
        # Remove the array from the layers and dependencies
        try:
            dsk.update(layers.pop(array.name))
            del deps[array.name]
        except KeyError:
            raise ValueError("%s is not a valid dependency of the "
                             "input array" % array.name)

        # Record keys to inline
        inline_keys.update(flatten(array.__dask_keys__()))

    dsk2 = inline(dsk, keys=inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, akeys)

    layers[a.name] = dsk3
    graph = HighLevelGraph(layers, deps)

    return da.Array(graph, a.name, a.chunks, a.dtype)
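# A hedged usage sketch, mirroring the cases exercised by test_inlined_array
# above: inline everything beneath C into one layer, or only a chosen root,
# and verify the values are unchanged.
import dask.array as da
import numpy as np

A = da.ones((10, 10), chunks=(5, 5))
B = da.full((10, 10), 2.0, chunks=(5, 5))
C = A + B

C_flat = inlined_array(C)          # everything fused into C's layer
C_partial = inlined_array(C, [A])  # inline only A; B remains its own layer

np.testing.assert_array_equal(C_flat.compute(), C.compute())
np.testing.assert_array_equal(C_partial.compute(), C.compute())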
def transform(self, columns: ColumnNames, df: DataFrameType) -> DataFrameType:
    new_df = df.copy(deep=False)
    if isinstance(self.freq_threshold, dict):
        assert all(x in self.freq_threshold for x in columns)

    if self.encode_type == "combo":
        # Case (3) - We want to track multi- and single-column groups separately
        # when we are NOT performing a joint encoding. This is because
        # there is not a 1-to-1 mapping for columns in multi-col groups.
        # We use `multi_col_group` to preserve the list format of
        # multi-column groups only, and use `cat_names` to store the
        # string representation of both single- and multi-column groups.
        cat_names, multi_col_group = _get_multicolumn_names(columns, df.columns, self.name_sep)
    else:
        # Case (1) & (2) - Simple 1-to-1 mapping
        multi_col_group = {}
        cat_names = list(flatten(columns, container=tuple))

    # Encode each column-group separately
    for name in cat_names:
        try:
            # Use the column-group `list` directly (not the string name)
            use_name = multi_col_group.get(name, name)

            # Storage name may be different than group for case (2).
            # Only use the "aliased" `storage_name` if we are dealing with
            # a multi-column group, or if we are doing joint encoding.
            if use_name != name or self.encode_type == "joint":
                storage_name = self.storage_name.get(name, name)
            else:
                storage_name = name

            if isinstance(use_name, tuple):
                use_name = list(use_name)

            path = self.categories[storage_name]
            new_df[name] = _encode(
                use_name,
                storage_name,
                path,
                df,
                self.cat_cache,
                na_sentinel=self.na_sentinel,
                freq_threshold=self.freq_threshold[name]
                if isinstance(self.freq_threshold, dict)
                else self.freq_threshold,
                search_sorted=self.search_sorted,
                buckets=self.num_buckets,
                encode_type=self.encode_type,
                cat_names=cat_names,
                max_size=self.max_size,
            )
            if self.dtype:
                new_df[name] = new_df[name].astype(self.dtype, copy=False)
        except Exception as e:
            raise RuntimeError(f"Failed to categorically encode column {name}") from e

    return new_df
def optimize(dsk, keys, **kwargs):
    if not isinstance(keys, (list, set)):
        keys = [keys]

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())

    dsk = dsk.cull(set(flatten(keys)))
    return dsk
def report(report_queue, scheduler_queue, who_has, dsk, result):
    """ Report to outside world

    For a normal get function this coroutine is almost non-essential.
    It just starts and stops the scheduler coroutine.
    """
    if isinstance(result, list):
        result_flat = set(flatten(result))
    else:
        result_flat = set([result])
    out_keys = set(result_flat)

    scheduler_queue.put_nowait({'op': 'update-graph',
                                'dsk': dsk,
                                'keys': out_keys})

    finished_results = {k for k in out_keys if k in who_has}

    while finished_results != out_keys:
        msg = yield report_queue.get()
        if msg['op'] == 'task-finished':
            if msg['key'] in out_keys:
                finished_results.add(msg['key'])
        if msg['op'] == 'lost-data':
            if msg['key'] in finished_results:
                finished_results.remove(msg['key'])
        if msg['op'] == 'task-erred':
            scheduler_queue.put_nowait({'op': 'close'})
            raise msg['exception']

    scheduler_queue.put_nowait({'op': 'close'})
    raise Return(out_keys)
def cached_array(array):
    """
    Return a new array that functionally has the same values as array,
    but flattens the underlying graph and introduces a cache lookup
    when the individual array chunks are accessed.

    Useful for caching data that can fit in-memory for the duration
    of the graph's execution.
    """
    dsk = dict(array.__dask_graph__())
    keys = set(flatten(array.__dask_keys__()))

    # Inline + cull everything except the current array
    inline_keys = set(dsk.keys() - keys)
    dsk2 = inline(dsk, inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, keys)

    # Create a cache used to store array values
    cache = ArrayCache(uuid.uuid4().hex)

    for k in keys:
        dsk3[k] = (cache_entry, cache, Key(k), dsk3.pop(k))

    graph = HighLevelGraph.from_collections(array.name, dsk3, [])

    return da.Array(graph, array.name, array.chunks, array.dtype)
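# A hedged usage sketch, assuming the ArrayCache/cache_entry/Key helpers used
# above are importable from this module. Within a single computation, every
# consumer of a cached chunk reads the stored value instead of recomputing it.
import dask.array as da

expensive = da.random.random((1_000, 1_000), chunks=(250, 250)) ** 2
cached = cached_array(expensive)

# Both terms touch the same underlying chunks; the cache is hit on reuse.
result = (cached + cached.T).sum().compute()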
def get_futures(self, arr):
    """
    Return the list of futures of a Dask array associated to a
    cartesian communicator.

    Parameters
    ----------
    arr: Dask Array
        A Dask array distributed according to the cartesian communicator.
        Note: the array can have more axes than the communicator,
        as long as they are not distributed.
    """
    if not isinstance(arr, Array):
        raise TypeError(f"Expected a Dask Array; got {type(arr)}.")

    self.check_dims(tuple(len(chunks) for chunks in arr.chunks))

    idxs, _ = zip(*self.normalize_dims())
    coords = self.normalize_coords()

    keys = tuple(flatten(arr.__dask_keys__()))
    key_idx = {}
    for key in keys:
        coord = tuple(key[_i + 1] for _i in idxs)
        key_idx[key] = coords.index(coord)

    keys = sorted(keys, key=key_idx.__getitem__)
    restrictions = {KeyPatch(key): worker for key, worker in zip(keys, self.workers)}

    arr = arr.persist(workers=restrictions)
    assert len(self) == len(arr.dask.values())
    return list(arr.dask[key] for key in keys)
def optimize(dsk, keys, **kwargs):
    flatkeys = list(flatten(keys)) if isinstance(keys, list) else [keys]
    dsk, dependencies = cull(dsk, flatkeys)
    dsk, dependencies = fuse(dsk, keys, dependencies=dependencies,
                             ave_width=_globals.get('fuse_ave_width', 1))
    dsk, _ = cull(dsk, keys)
    return dsk
def _transform_ddf(ddf, column_groups, meta=None):
    if isinstance(column_groups, ColumnGroup):
        column_groups = [column_groups]

    columns = list(flatten(cg.flattened_columns for cg in column_groups))

    # Check if we are only selecting columns (no transforms).
    # If so, we should perform column selection at the ddf level.
    # Otherwise, Dask will not push the column selection into the
    # IO function.
    if all((c.op is None and not c.parents) for c in column_groups):
        return ddf[_get_unique(columns)]

    if isinstance(meta, dict) and isinstance(ddf._meta, pd.DataFrame):
        dtypes = meta
        meta = type(ddf._meta)({k: [] for k in columns})
        for column, dtype in dtypes.items():
            meta[column] = meta[column].astype(dtype)
    elif not meta:
        # TODO: Constructing meta like this loses dtype information on the ddf
        # and sets it all to 'float64'. We should propagate dtype information
        # along with column names in the columngroup graph. This currently only
        # happens during intermediate 'fit' transforms, so as long as statoperators
        # don't require dtype information on the DDF this doesn't matter all that much.
        meta = type(ddf._meta)({k: [] for k in columns})

    return ddf.map_partitions(
        _transform_partition,
        column_groups,
        meta=meta,
    )
def _get(self, dsk, keys, restrictions=None, raise_on_error=True):
    flatkeys = list(flatten([keys]))
    futures = {key: Future(key, self) for key in flatkeys}

    d = {k: unpack_remotedata(v) for k, v in dsk.items()}
    dsk2 = {k: v[0] for k, v in d.items()}
    dsk3 = {k: v for k, v in dsk2.items() if (k == v) is not True}
    dependencies = {k: v[1] for k, v in d.items()}

    for k, v in dsk3.items():
        dependencies[k] |= set(_deps(dsk, v))

    self._send_to_scheduler({'op': 'update-graph',
                             'tasks': valmap(dumps_task, dsk3),
                             'dependencies': dependencies,
                             'keys': flatkeys,
                             'restrictions': restrictions or {},
                             'client': self.id})

    packed = pack_data(keys, futures)
    if raise_on_error:
        result = yield self._gather(packed)
    else:
        try:
            result = yield self._gather(packed)
            result = 'OK', result
        except Exception as e:
            result = 'error', e
    raise gen.Return(result)
def __init__(
    self,
    cont_names=None,
    stats=None,
    columns=None,
    tree_width=None,
    cat_cache="host",
    out_path=None,
    on_host=True,
    name_sep="_",
    stat_name=None,
):
    self.column_groups = None
    self.storage_name = {}
    self.name_sep = name_sep
    if isinstance(columns, str):
        columns = [columns]
    if isinstance(columns, list):
        self.column_groups = columns
        columns = list(set(flatten(columns, container=list)))
        for group in self.column_groups:
            if isinstance(group, list) and len(group) > 1:
                name = nvt_cat._make_name(*group, sep=self.name_sep)
                for col in group:
                    self.storage_name[col] = name

    super().__init__(columns=columns, replace=False)
    self.cont_names = cont_names
    # Resolve the default here to avoid a mutable default argument
    self.stats = stats if stats is not None else ["count"]
    self.tree_width = tree_width
    self.out_path = out_path
    self.on_host = on_host
    self.cat_cache = cat_cache
    self.stat_name = stat_name or "gb_categories"
def __init__(self, columns=None, num_buckets=None, freq_limit=0, encode_type="joint"):
    if isinstance(columns, list):
        columns = list(set(flatten(columns, container=list)))
    super().__init__(columns=columns)
    self.num_buckets = num_buckets
    self.freq_limit = freq_limit
    self.encode_type = encode_type
def inline_pattern(dsk: dict, pat_ls: List[str], inline_constants: bool) -> dict:
    """
    Inline tasks whose keys match certain patterns.

    Parameters
    ----------
    dsk : dict
        Input dask graph.
    pat_ls : List[str]
        List of patterns to check.
    inline_constants : bool
        Whether to inline constants.

    Returns
    -------
    dsk : dict
        Dask graph with keys inlined.

    See Also
    --------
    dask.optimization.inline
    """
    keys = [k for k in dsk.keys() if check_pat(k, pat_ls)]
    if keys:
        dsk = inline(dsk, keys, inline_constants=inline_constants)
        for k in keys:
            del dsk[k]
        if inline_constants:
            dsk, dep = cull(dsk, set(list(flatten(keys))))
    return dsk
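# A hedged usage sketch. It assumes `check_pat(key, patterns)` (defined
# elsewhere in this module) returns True when the key matches one of the
# given patterns; the graph keys below are illustrative.
import operator

dsk = {
    "data": [10, 20],
    "getitem-1": (operator.getitem, "data", 0),
    "out": (operator.neg, "getitem-1"),
}
# Fold the matching "getitem-1" task into its consumer "out":
dsk2 = inline_pattern(dsk, ["getitem"], inline_constants=False)
# dsk2 == {"data": [10, 20], "out": (operator.neg, (operator.getitem, "data", 0))}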
def downscale_dask(
    array: Any,
    reduction: Callable[[NDArray[Any], Tuple[int, ...]], NDArray[Any]],
    scale_factors: Union[int, Sequence[int], Dict[int, int]],
    **kwargs: Any,
) -> Any:
    if not np.all((np.array(array.shape) % np.array(scale_factors)) == 0):
        raise ValueError(
            f"Coarsening factors {scale_factors} do not align with array shape {array.shape}."
        )

    array = align_chunks(array, scale_factors)
    name = "downscale-" + tokenize(reduction, array, scale_factors)
    dsk = {
        (name,) + key[1:]: (apply, reduction, [key, scale_factors], kwargs)
        for key in flatten(array.__dask_keys__())
    }
    chunks = tuple(
        tuple(int(size // scale_factors[axis]) for size in sizes)
        for axis, sizes in enumerate(array.chunks)
    )

    meta = reduction(np.empty(scale_factors, dtype=array.dtype), scale_factors, **kwargs)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[array])
    return Array(graph, name, chunks, meta=meta)
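# A hedged usage sketch with a block-mean reduction. `align_chunks` (defined
# elsewhere) is assumed to rechunk so every chunk is divisible by the scale
# factors; `windowed_mean` here is an illustrative reduction.
import numpy as np
import dask.array as da

def windowed_mean(block, factors):
    # Split each axis into (new_size, factor) pairs and average over the factors.
    shape = []
    for size, f in zip(block.shape, factors):
        shape.extend([size // f, f])
    return block.reshape(shape).mean(axis=tuple(range(1, 2 * block.ndim, 2)))

arr = da.ones((8, 8), chunks=(4, 4))
down = downscale_dask(arr, windowed_mean, (2, 2))  # lazily yields a 4x4 result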
def argtopk(a_plus_idx, k, axis, keepdims):
    """Chunk and combine function of argtopk

    Extract the indices of the k largest elements from a on the given axis.
    If k is negative, extract the indices of the -k smallest elements instead.
    Note that, unlike in the parent function, the returned elements
    are not sorted internally.
    """
    assert keepdims is True
    axis = axis[0]

    if isinstance(a_plus_idx, list):
        a_plus_idx = list(flatten(a_plus_idx))
        a = np.concatenate([ai for ai, _ in a_plus_idx], axis)
        idx = np.concatenate(
            [np.broadcast_to(idxi, ai.shape) for ai, idxi in a_plus_idx], axis
        )
    else:
        a, idx = a_plus_idx

    if abs(k) >= a.shape[axis]:
        return a_plus_idx

    idx2 = np.argpartition(a, -k, axis=axis)
    k_slice = slice(-k, None) if k > 0 else slice(-k)
    idx2 = idx2[tuple(k_slice if i == axis else slice(None) for i in range(a.ndim))]
    return np.take_along_axis(a, idx2, axis), np.take_along_axis(idx, idx2, axis)
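# A hedged sketch of how this chunk function is fed: values arrive paired
# with their global indices along the reduced axis (the inputs below are
# illustrative, not a real tree-reduction stage).
import numpy as np

a = np.array([[5.0, 1.0, 9.0, 3.0]])
idx = np.arange(4)[None, :]  # global indices, broadcastable to a's shape

vals, positions = argtopk((a, idx), k=2, axis=(1,), keepdims=True)
# vals      -> the two largest values per row, unsorted (here 5. and 9.)
# positions -> their original indices along axis 1 (here 0 and 2)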
def write_blocks(source, target, region: Optional[Tuple[slice, ...]]) -> da.Array:
    """
    Return a dask array where each chunk contains the result of writing
    each chunk of `source` to `target`.
    """
    slices = slices_from_chunks(source.chunks)
    if region:
        slices = [fuse_slice(region, slc) for slc in slices]

    source_name = 'store-source-' + tokenize(source)
    store_name = 'store-' + tokenize(source)

    layers = {source_name: source.__dask_graph__()}
    deps = {source_name: set()}

    dsk = {}
    chunks = tuple((1,) * s for s in source.blocks.shape)

    for slc, key in zip(slices, flatten(source.__dask_keys__())):
        dsk[(store_name,) + key[1:]] = (ndwrapper, store_chunk, source.ndim,
                                        key, target, slc)

    layers[store_name] = dsk
    deps[store_name] = {source_name}
    store_dsk = HighLevelGraph(layers, deps)

    return da.Array(store_dsk,
                    store_name,
                    shape=source.blocks.shape,
                    chunks=chunks,
                    dtype=int)
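# A hedged usage sketch, assuming the ndwrapper/store_chunk helpers used
# above perform the actual slice assignment, as in dask's store machinery.
# Computing the returned array triggers the per-chunk writes.
import numpy as np
import dask.array as da

source = da.arange(16, chunks=4)
target = np.empty(16, dtype=source.dtype)

written = write_blocks(source, target, region=None)
written.compute()
assert (target == np.arange(16)).all()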
def optimize(dsk, keys, **kwargs):
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(core.flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    else:
        # Perform Blockwise optimizations for HLG input
        dsk = optimize_dataframe_getitem(dsk, keys=keys)
        dsk = optimize_blockwise(dsk, keys=keys)
        dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Do not perform low-level fusion unless the user has
    # specified True explicitly. The configuration will
    # be None by default.
    if not config.get("optimization.fuse.active"):
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    fuse_subgraphs = config.get("optimization.fuse.subgraphs")
    if fuse_subgraphs is None:
        fuse_subgraphs = True
    dsk, _ = fuse(
        dsk,
        keys,
        dependencies=dependencies,
        fuse_subgraphs=fuse_subgraphs,
    )
    dsk, _ = cull(dsk, keys)
    return dsk
def test_TaskGraph_complex(c, s, a, b):
    da = pytest.importorskip("dask.array")
    gp = TaskGraph(s)
    x = da.random.random((2000, 2000), chunks=(1000, 1000))
    y = ((x + x.T) - x.mean(axis=0)).persist()
    yield wait(y)
    gp.update()
    assert len(gp.layout.index) == len(gp.node_source.data["x"])
    assert len(gp.layout.index) == len(s.tasks)
    z = (x - y).sum().persist()
    yield wait(z)
    gp.update()
    assert len(gp.layout.index) == len(gp.node_source.data["x"])
    assert len(gp.layout.index) == len(s.tasks)
    del z
    yield gen.sleep(0.2)
    gp.update()
    assert len(gp.layout.index) == sum(v == "True" for v in gp.node_source.data["visible"])
    assert len(gp.layout.index) == len(s.tasks)
    assert max(gp.layout.index.values()) < len(gp.node_source.data["visible"])
    assert gp.layout.next_index == len(gp.node_source.data["visible"])
    gp.update()
    assert set(gp.layout.index.values()) == set(range(len(gp.layout.index)))

    visible = gp.node_source.data["visible"]
    keys = list(map(tokey, flatten(y.__dask_keys__())))
    assert all(visible[gp.layout.index[key]] == "True" for key in keys)
def _transform_ddf(ddf, workflow_nodes, meta=None, additional_columns=None):
    # Check if we are only selecting columns (no transforms).
    # If so, we should perform column selection at the ddf level.
    # Otherwise, Dask will not push the column selection into the
    # IO function.
    if not workflow_nodes:
        return ddf[_get_unique(additional_columns)] if additional_columns else ddf

    if isinstance(workflow_nodes, WorkflowNode):
        workflow_nodes = [workflow_nodes]

    columns = list(flatten(wfn.output_columns.names for wfn in workflow_nodes))
    columns += additional_columns if additional_columns else []

    if isinstance(meta, dict) and isinstance(ddf._meta, pd.DataFrame):
        dtypes = meta
        meta = type(ddf._meta)({k: [] for k in columns})
        for column, dtype in dtypes.items():
            meta[column] = meta[column].astype(dtype)
    elif not meta:
        # TODO: Constructing meta like this loses dtype information on the ddf
        # and sets it all to 'float64'. We should propagate dtype information
        # along with column names in the columngroup graph. This currently only
        # happens during intermediate 'fit' transforms, so as long as statoperators
        # don't require dtype information on the DDF this doesn't matter all that much.
        meta = type(ddf._meta)({k: [] for k in columns})

    return ddf.map_partitions(
        _transform_partition,
        workflow_nodes,
        additional_columns=additional_columns,
        meta=meta,
        enforce_metadata=False,
    )
def _build_map_layer(
    func: Callable,
    prev_name: str,
    new_name: str,
    collection,
    dependencies: tuple[Delayed, ...] = (),
) -> Layer:
    """Apply func to all keys of collection.

    Create a Blockwise layer whenever possible; fall back to
    MaterializedLayer otherwise.

    Parameters
    ----------
    func
        Callable to be invoked on the graph node
    prev_name : str
        Name of the layer to map from; in case of dask base collections,
        this is the collection name. Note how third-party collections,
        e.g. xarray.Dataset, can have multiple names.
    new_name : str
        Name of the layer to map to
    collection
        Arbitrary dask collection
    dependencies
        Zero or more Delayed objects, which will be passed as arbitrary
        variadic args to func after the collection's chunk
    """
    if _can_apply_blockwise(collection):
        # Use a Blockwise layer
        try:
            numblocks = collection.numblocks
        except AttributeError:
            numblocks = (collection.npartitions,)
        indices = tuple(i for i, _ in enumerate(numblocks))
        kwargs = {"_deps": [d.key for d in dependencies]} if dependencies else {}

        return blockwise(
            func,
            new_name,
            indices,
            prev_name,
            indices,
            numblocks={prev_name: numblocks},
            dependencies=dependencies,
            **kwargs,
        )
    else:
        # Delayed, bag.Item, dataframe.core.Scalar, or third-party collection;
        # fall back to MaterializedLayer
        dep_keys = tuple(d.key for d in dependencies)
        return MaterializedLayer(
            {
                replace_name_in_key(k, {prev_name: new_name}): (func, k) + dep_keys
                for k in flatten(collection.__dask_keys__())
                if get_name_from_key(k) == prev_name
            }
        )
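# A hedged usage sketch (the layer name and increment function are
# illustrative): graft a layer that adds one to every chunk of `x`.
import dask.array as da
from dask.highlevelgraph import HighLevelGraph

x = da.ones((4, 4), chunks=(2, 2))
new_name = "plus-one-" + x.name
layer = _build_map_layer(lambda chunk: chunk + 1, x.name, new_name, x)
graph = HighLevelGraph.from_collections(new_name, layer, dependencies=[x])
y = da.Array(graph, new_name, x.chunks, meta=x._meta)
assert (y.compute() == 2).all()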
def optimize(
    dsk,
    keys,
    fuse_keys=None,
    fast_functions=None,
    inline_functions_fast_functions=(getter_inline,),
    rename_fused_keys=True,
    **kwargs,
):
    """Optimize dask for array computation

    1.  Cull tasks not necessary to evaluate keys
    2.  Remove full slicing, e.g. x[:]
    3.  Inline fast functions like getitem and np.transpose
    """
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())

    dsk = optimize_blockwise(dsk, keys=keys)
    dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Perform low-level fusion unless the user has
    # specified False explicitly.
    if config.get("optimization.fuse.active") is False:
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    # Low level task optimizations
    if fast_functions is not None:
        inline_functions_fast_functions = fast_functions

    hold = hold_keys(dsk, dependencies)

    dsk, dependencies = fuse(
        dsk,
        hold + keys + (fuse_keys or []),
        dependencies,
        rename_keys=rename_fused_keys,
    )
    if inline_functions_fast_functions:
        dsk = inline_functions(
            dsk,
            keys,
            dependencies=dependencies,
            fast_functions=inline_functions_fast_functions,
        )

    return optimize_slices(dsk)
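# A hedged sketch of the effect on a tiny graph: full slices like x[:] are
# removed and fast getters are inlined, so the optimized graph has no more
# tasks than the raw one.
import dask.array as da

x = da.ones((8, 8), chunks=(4, 4))
y = (x[:] + 1).sum()
raw = dict(y.__dask_graph__())
optimized = dict(optimize(y.__dask_graph__(), y.__dask_keys__()))
assert len(optimized) <= len(raw)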
def reproject_band(band, geobox, resampling, dims, dask_chunks=None):
    """ Reproject a single measurement to the geobox. """
    if not hasattr(band.data, 'dask') or dask_chunks is None:
        data = reproject_array(band.data, band.nodata, band.geobox, geobox, resampling)
        return wrap_in_dataarray(data, band, geobox, dims)

    dask_name = 'warp_{name}-{token}'.format(name=band.name, token=uuid.uuid4().hex)
    dependencies = [band.data]

    spatial_chunks = tuple(dask_chunks.get(k, geobox.shape[i])
                           for i, k in enumerate(geobox.dims))

    gt = GeoboxTiles(geobox, spatial_chunks)
    new_layer = {}

    for tile_index in numpy.ndindex(gt.shape):
        sub_geobox = gt[tile_index]
        # Find the input array slice from the output geobox
        reproject_roi = compute_reproject_roi(band.geobox, sub_geobox, padding=1)
        # Find the chunk from the input array with the slice index
        subset_band = band[(...,) + reproject_roi.roi_src].chunk(-1)

        if min(subset_band.shape) == 0:
            # Pad the empty chunk
            new_layer[(dask_name,) + tile_index] = (numpy.full, sub_geobox.shape,
                                                    band.nodata, band.dtype)
        else:
            # The next three lines generate the new graph
            dependencies.append(subset_band.data)
            # Get the input dask array for the function `reproject_array`
            band_key = list(flatten(subset_band.data.__dask_keys__()))[0]
            # Generate a new layer of the dask graph with reproject
            new_layer[(dask_name,) + tile_index] = (reproject_array, band_key,
                                                    band.nodata, subset_band.geobox,
                                                    sub_geobox, resampling)

    # Create a new graph with the additional layer and pack the graph into a
    # dask.array. Since only regular chunking is allowed at the higher-level
    # dask.array interface, manipulating the graph directly seems to be the
    # easiest way to obtain a dask.array with irregular chunks after reproject.
    data = dask.array.Array(band.data.dask.from_collections(dask_name, new_layer,
                                                            dependencies=dependencies),
                            dask_name,
                            chunks=spatial_chunks,
                            dtype=band.dtype,
                            shape=gt.base.shape)
    return wrap_in_dataarray(data, band, geobox, dims)
def modf(x):
    # Not actually object dtype, just need to specify something
    tmp = elemwise(np.modf, x, dtype=object)
    left = "modf1-" + tmp.name
    right = "modf2-" + tmp.name
    ldsk = {
        (left,) + key[1:]: (getitem, key, 0)
        for key in core.flatten(tmp.__dask_keys__())
    }
    rdsk = {
        (right,) + key[1:]: (getitem, key, 1)
        for key in core.flatten(tmp.__dask_keys__())
    }

    a = np.empty_like(getattr(x, "_meta", x), shape=(1,) * x.ndim, dtype=x.dtype)
    l, r = np.modf(a)

    graph = HighLevelGraph.from_collections(left, ldsk, dependencies=[tmp])
    L = Array(graph, left, chunks=tmp.chunks, meta=l)
    graph = HighLevelGraph.from_collections(right, rdsk, dependencies=[tmp])
    R = Array(graph, right, chunks=tmp.chunks, meta=r)
    return L, R
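# A hedged usage sketch: like np.modf, but lazy and chunkwise, returning the
# fractional and integral parts as two separate dask arrays.
import numpy as np
import dask.array as da

x = da.from_array(np.array([1.5, -2.25, 3.0]), chunks=2)
frac, whole = modf(x)
# frac.compute()  -> array([ 0.5 , -0.25,  0.  ])
# whole.compute() -> array([ 1., -2.,  3.])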
def dataframe_optimize(dsk, keys, **kwargs):
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(core.flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())

    dsk = rewrite_simple_shuffle_layer(dsk, keys=keys)
    return optimize(dsk, keys, **kwargs)
def test_persist(c, s, a, b):
    da = pytest.importorskip('dask.array')
    x = da.random.random((10, 10), chunks=(5, 5))
    y = da.random.random((10, 10), chunks=(5, 5))

    low = x.persist(priority=-1)
    futures = c.map(slowinc, range(10), delay=0.1)
    high = y.persist(priority=1)

    yield wait(high)
    assert all(s.processing.values())
    assert all(s.tasks[tokey(k)].state in ('processing', 'waiting')
               for k in flatten(low.__dask_keys__()))
def fit(self, columns: ColumnNames, ddf: dask_cudf.DataFrame):
    # User passed in a list of column groups. We need to figure out
    # if this list contains any multi-column groups, and if there
    # are any (obvious) problems with these groups
    columns_uniq = list(set(flatten(columns, container=tuple)))
    columns_all = list(flatten(columns, container=tuple))
    if sorted(columns_all) != sorted(columns_uniq) and self.encode_type == "joint":
        # If we are doing "joint" encoding, there must be a unique mapping
        # between input column names and column groups. Otherwise, more
        # than one unique-value table could be used to encode the same
        # column.
        raise ValueError("Same column name included in multiple groups.")

    for group in columns:
        if isinstance(group, tuple) and len(group) > 1:
            # For multi-column groups, we concatenate column names
            # to get the "group" name.
            name = _make_name(*group, sep=self.name_sep)
            for col in group:
                self.storage_name[col] = name

    # Convert tuples to lists
    columns = [list(c) if isinstance(c, tuple) else c for c in columns]
    dsk, key = _category_stats(
        ddf,
        columns,
        [],
        [],
        self.out_path,
        self.freq_threshold,
        self.tree_width,
        self.on_host,
        concat_groups=self.encode_type == "joint",
        name_sep=self.name_sep,
    )
    # TODO: We can't use the dtypes on the ddf here since they are incorrect,
    # so we're loading them from the partitions instead. Fix.
    return Delayed(key, dsk), ddf.map_partitions(lambda gdf: gdf.dtypes)
async def test_persist(c, s):
    da = pytest.importorskip("dask.array")
    x = da.random.random((10, 10), chunks=(5, 5))
    y = da.random.random((10, 10), chunks=(5, 5))

    low = x.persist(priority=-1)
    futures = c.map(slowinc, range(10), delay=0.1)
    high = y.persist(priority=1)
    async with Worker(s.address, nthreads=1):
        await wait(high)
        assert all(s.processing.values())
        assert all(s.tasks[stringify(k)].state in ("processing", "waiting")
                   for k in flatten(low.__dask_keys__()))
def _get(self, dsk, keys, restrictions=None):
    flatkeys = list(flatten(keys))
    for key in flatkeys:
        if key not in self.futures:
            self.futures[key] = {'event': Event(), 'status': None}
    futures = {key: Future(key, self) for key in flatkeys}

    self.scheduler_queue.put_nowait({'op': 'update-graph',
                                     'dsk': dsk,
                                     'keys': flatkeys,
                                     'restrictions': restrictions or {}})

    packed = pack_data(keys, futures)
    result = yield self._gather(packed)
    raise gen.Return(result)
def _get(self, dsk, keys, restrictions=None, raise_on_error=True):
    flatkeys = list(flatten([keys]))
    futures = {key: Future(key, self) for key in flatkeys}

    self.loop.add_callback(
        self.scheduler_queue.put_nowait,
        {"op": "update-graph",
         "dsk": dsk,
         "keys": flatkeys,
         "restrictions": restrictions or {}},
    )

    packed = pack_data(keys, futures)
    if raise_on_error:
        result = yield self._gather(packed)
    else:
        try:
            result = yield self._gather(packed)
            result = "OK", result
        except Exception as e:
            result = "error", e
    raise gen.Return(result)
def progress(*futures, **kwargs):
    """ Track progress of futures

    This operates differently in the notebook and the console

    *  Notebook: This returns immediately, leaving an IPython widget on screen
    *  Console: This blocks until the computation completes

    Parameters
    ----------
    futures: Futures
        A list of futures or keys to track
    notebook: bool (optional)
        Running in the notebook or not (defaults to guess)
    multi: bool (optional)
        Track different functions independently (defaults to True)
    complete: bool (optional)
        Track all keys (True) or only keys that have not yet run (False)
        (defaults to True)

    Examples
    --------
    >>> progress(futures)  # doctest: +SKIP
    [########################################] | 100% Completed |  1.7s
    """
    notebook = kwargs.pop('notebook', None)
    multi = kwargs.pop('multi', True)
    complete = kwargs.pop('complete', True)
    assert not kwargs

    futures = list(flatten(list(futures)))
    if not isinstance(futures, (set, list)):
        futures = [futures]
    if notebook is None:
        notebook = is_kernel()  # often but not always correct assumption
    if notebook:
        if multi:
            bar = MultiProgressWidget(futures, complete=complete)
        else:
            bar = ProgressWidget(futures, complete=complete)
        return bar
    else:
        TextProgressBar(futures, complete=complete)
def _get(self, dsk, keys, restrictions=None, raise_on_error=True):
    flatkeys = list(flatten([keys]))
    futures = {key: Future(key, self) for key in flatkeys}

    self.send_to_scheduler({'op': 'update-graph',
                            'dsk': dsk,
                            'keys': flatkeys,
                            'restrictions': restrictions or {}})

    packed = pack_data(keys, futures)
    if raise_on_error:
        result = yield self._gather(packed)
    else:
        try:
            result = yield self._gather(packed)
            result = 'OK', result
        except Exception as e:
            result = 'error', e
    raise gen.Return(result)
def _get(self, dsk, keys, restrictions=None, raise_on_error=True):
    flatkeys = list(flatten([keys]))
    futures = {key: Future(key, self) for key in flatkeys}
    dsk2 = {k: unpack_remotedata(v)[0] for k, v in dsk.items()}
    dsk3 = {k: v for k, v in dsk2.items() if (k == v) is not True}

    self._send_to_scheduler({'op': 'update-graph',
                             'dsk': dsk3,
                             'keys': flatkeys,
                             'restrictions': restrictions or {}})

    packed = pack_data(keys, futures)
    if raise_on_error:
        result = yield self._gather(packed)
    else:
        try:
            result = yield self._gather(packed)
            result = 'OK', result
        except Exception as e:
            result = 'error', e
    raise gen.Return(result)
def test_flatten():
    assert list(flatten(())) == []
    assert list(flatten('foo')) == ['foo']
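# A brief sketch of the semantics the test relies on: dask.core.flatten
# descends through lists (its default container) while leaving tuples,
# i.e. task keys, intact.
from dask.core import flatten

assert list(flatten([[("x", 0), ("x", 1)], [("x", 2)]])) == [("x", 0), ("x", 1), ("x", 2)]
assert list(flatten([1, [2, [3]]])) == [1, 2, 3]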