def test_stringify():
    obj = "Hello"
    assert stringify(obj) is obj
    obj = b"Hello"
    assert stringify(obj) is obj

    dsk = {"x": 1}
    assert stringify(dsk) == str(dsk)
    assert stringify(dsk, exclusive=()) == dsk

    dsk = {("x", 1): (inc, 1)}
    assert stringify(dsk) == str({("x", 1): (inc, 1)})
    assert stringify(dsk, exclusive=()) == {("x", 1): (inc, 1)}

    dsk = {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))}
    assert stringify(dsk, exclusive=dsk) == {
        ("x", 1): (inc, 1),
        ("x", 2): (inc, str(("x", 1))),
    }

    dsks = [
        {"x": 1},
        {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))},
        {("x", 1): (sum, [1, 2, 3]), ("x", 2): (sum, [("x", 1), ("x", 1)])},
    ]
    for dsk in dsks:
        sdsk = {stringify(k): stringify(v, exclusive=dsk) for k, v in dsk.items()}
        keys = list(dsk)
        skeys = [str(k) for k in keys]
        assert all(isinstance(k, str) for k in sdsk)
        assert get(dsk, keys) == get(sdsk, skeys)

    dsk = {
        ("y", 1): (
            SubgraphCallable({"x": ("y", 1)}, "x", (("y", 1),)),
            (("z", 1),),
        )
    }
    dsk = stringify(dsk, exclusive=set(dsk) | {("z", 1)})
    assert dsk[("y", 1)][0].dsk["x"] == "('y', 1)"
    assert dsk[("y", 1)][1][0] == "('z', 1)"
async def test_resources_str(c, s, a, b):
    pd = pytest.importorskip("pandas")
    dd = pytest.importorskip("dask.dataframe")

    await a.set_resources(MyRes=1)

    x = dd.from_pandas(pd.DataFrame({"A": [1, 2], "B": [3, 4]}), npartitions=1)
    y = x.apply(lambda row: row.sum(), axis=1, meta=(None, "int64"))
    yy = y.persist(resources={"MyRes": 1})
    await wait(yy)

    ts_first = s.tasks[stringify(y.__dask_keys__()[0])]
    assert ts_first.resource_restrictions == {"MyRes": 1}
    ts_last = s.tasks[stringify(y.__dask_keys__()[-1])]
    assert ts_last.resource_restrictions == {"MyRes": 1}
def cause_of_failure(self, *args, keys=(), **kwargs):
    """
    Return details of the first failed task required by a set of keys

    Parameters
    ----------
    keys : list of keys known to the scheduler

    Returns
    -------
    Dictionary with:
    cause: the key that failed
    task: the definition of that key
    deps: keys that the task depends on
    """
    for key in keys:
        if isinstance(key, list):
            key = tuple(key)  # ensure not a list from msgpack
        key = stringify(key)
        ts = self.scheduler.tasks.get(key)
        if ts is not None and ts.exception_blame is not None:
            cause = ts.exception_blame
            # NOTE: cannot serialize sets
            return {
                "deps": [dts.key for dts in cause.dependencies],
                "cause": cause.key,
                "task": cause.run_spec,
            }
def __dask_distributed_annotations_pack__(
    self, annotations: Mapping[str, Any] | None = None
) -> Mapping[str, Any] | None:
    """Packs Layer annotations for transmission to the scheduler

    Callable annotations are fully expanded over the Layer's keys, while
    other values are transmitted as-is.

    Parameters
    ----------
    annotations : Mapping[str, Any], optional
        Top-level annotations.

    Returns
    -------
    packed_annotations : dict
        Packed annotations.
    """
    annotations = cast(
        "dict[str, Any]", toolz.merge(self.annotations or {}, annotations or {})
    )
    packed = {}
    for a, v in annotations.items():
        if callable(v):
            packed[a] = {stringify(k): v(k) for k in self}
            packed[a]["__expanded_annotations__"] = True
        else:
            packed[a] = v
    return packed
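# Hedged illustration (not part of the method above): a minimal sketch of what
# the packing produces when one annotation is a callable and another is a plain
# value. The toy keys and the local ``_stringify`` stand-in are assumptions for
# this example only; the real code uses dask's ``stringify`` helper.
def _example_annotations_pack():
    _stringify = str  # stand-in for stringify() on simple tuple keys
    layer_keys = [("x", 0), ("x", 1)]
    annotations = {"priority": lambda key: key[1], "retries": 2}

    packed = {}
    for name, value in annotations.items():
        if callable(value):
            # Callables are expanded over every key and tagged so the unpack
            # side can recognize the per-key form.
            packed[name] = {_stringify(k): value(k) for k in layer_keys}
            packed[name]["__expanded_annotations__"] = True
        else:
            packed[name] = value

    assert packed["retries"] == 2
    assert packed["priority"]["('x', 0)"] == 0
    assert packed["priority"]["__expanded_annotations__"] is True
    return packed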
async def _set(self, value):
    if isinstance(value, Future):
        await self.client.scheduler.variable_set(
            key=stringify(value.key), name=self.name
        )
    else:
        await self.client.scheduler.variable_set(data=value, name=self.name)
async def _put(self, value, timeout=None):
    if isinstance(value, Future):
        await self.client.scheduler.queue_put(
            key=stringify(value.key), timeout=timeout, name=self.name
        )
    else:
        await self.client.scheduler.queue_put(
            data=value, timeout=timeout, name=self.name
        )
async def test_compute(c, s):
    da = pytest.importorskip("dask.array")
    x = da.random.random((10, 10), chunks=(5, 5))
    y = da.random.random((10, 10), chunks=(5, 5))

    low = c.compute(x, priority=-1)
    futures = c.map(slowinc, range(10), delay=0.1)
    high = c.compute(y, priority=1)

    async with Worker(s.address, nthreads=1):
        await wait(high)
        assert all(s.processing.values())
        assert s.tasks[stringify(low.key)].state in ("processing", "waiting")
def __init__(self, keys, scheduler, minimum=0, dt=0.1, complete=False):
    self.keys = {k.key if hasattr(k, "key") else k for k in keys}
    self.keys = {stringify(k) for k in self.keys}
    self.scheduler = scheduler
    self.complete = complete
    self._minimum = minimum
    self._dt = dt
    self.last_duration = 0
    self._start_time = default_timer()
    self._running = False
    self.status = None
    self.extra = {}
async def test_persist_collections(c, s, a, b):
    da = pytest.importorskip("dask.array")
    x = da.arange(10, chunks=(5,))
    y = x.map_blocks(lambda x: x + 1)
    z = y.map_blocks(lambda x: 2 * x)
    w = z.sum()

    ww, yy = c.persist([w, y], resources={tuple(y.__dask_keys__()): {"A": 1}})
    await wait([ww, yy])

    assert all(stringify(key) in a.data for key in y.__dask_keys__())
async def test_persist_collections(c, s, a, b):
    da = pytest.importorskip("dask.array")
    x = da.arange(10, chunks=(5,))
    with dask.annotate(resources={"A": 1}):
        y = x.map_blocks(lambda x: x + 1)
    z = y.map_blocks(lambda x: 2 * x)
    w = z.sum()

    ww, yy = c.persist([w, y], optimize_graph=False)
    await wait([ww, yy])

    assert all(stringify(key) in a.data for key in y.__dask_keys__())
async def test_persist(c, s):
    da = pytest.importorskip("dask.array")
    x = da.random.random((10, 10), chunks=(5, 5))
    y = da.random.random((10, 10), chunks=(5, 5))

    low = x.persist(priority=-1)
    futures = c.map(slowinc, range(10), delay=0.1)
    high = y.persist(priority=1)

    async with Worker(s.address, nthreads=1):
        await wait(high)
        assert all(s.processing.values())
        assert all(
            s.tasks[stringify(k)].state in ("processing", "waiting")
            for k in flatten(low.__dask_keys__())
        )
def _materialized_layer_pack(
    layer: Layer,
    all_keys,
    known_key_dependencies,
    client,
    client_keys,
):
    from ..client import Future

    dsk = dict(layer)

    # Find aliases not in `client_keys` and substitute all matching keys
    # with their Futures
    values = {
        k: v for k, v in dsk.items() if isinstance(v, Future) and k not in client_keys
    }
    if values:
        dsk = subs_multiple(dsk, values)

    # Unpack remote data and record its dependencies
    dsk = {k: unpack_remotedata(v, byte_keys=True) for k, v in layer.items()}
    unpacked_futures = set.union(*[v[1] for v in dsk.values()]) if dsk else set()
    for future in unpacked_futures:
        if future.client is not client:
            raise ValueError(
                "Inputs contain futures that were created by another client."
            )
        if stringify(future.key) not in client.futures:
            raise CancelledError(stringify(future.key))
    unpacked_futures_deps = {}
    for k, v in dsk.items():
        if len(v[1]):
            unpacked_futures_deps[k] = {f.key for f in v[1]}
    dsk = {k: v[0] for k, v in dsk.items()}

    # Calculate dependencies without re-calculating already known dependencies
    missing_keys = set(dsk.keys()).difference(known_key_dependencies.keys())
    dependencies = {
        k: keys_in_tasks(all_keys, [dsk[k]], as_list=False) for k in missing_keys
    }
    for k, v in unpacked_futures_deps.items():
        dependencies[k] = set(dependencies.get(k, ())) | v

    # The scheduler expects all keys to be strings
    dependencies = {
        stringify(k): [stringify(dep) for dep in deps]
        for k, deps in dependencies.items()
    }

    all_keys = all_keys.union(dsk)
    dsk = {stringify(k): stringify(v, exclusive=all_keys) for k, v in dsk.items()}
    dsk = valmap(dumps_task, dsk)
    return {"dsk": dsk, "dependencies": dependencies}
def put(self, comm=None, keys=None, data=None, name=None, override=False, client=None):
    with log_errors():
        if not override and name in self.datasets:
            raise KeyError("Dataset %s already exists" % name)
        self.scheduler.client_desires_keys(keys, "published-%s" % stringify(name))
        self.datasets[name] = {"data": data, "keys": keys}
        return {"status": "OK", "name": name}
def __dask_distributed_annotations_unpack__(
    annotations: MutableMapping[str, Any],
    new_annotations: Mapping[str, Any] | None,
    keys: Iterable[Hashable],
) -> None:
    """
    Unpack a set of layer annotations across a set of keys, then merge those
    expanded annotations for the layer into an existing annotations mapping.

    This is not a simple shallow merge because some annotations like retries,
    priority, workers, etc. need to be able to retain keys from different layers.

    Parameters
    ----------
    annotations: MutableMapping[str, Any], input/output
        Already unpacked annotations, which are to be updated with the new
        unpacked annotations
    new_annotations: Mapping[str, Any], optional
        New annotations to be unpacked into `annotations`
    keys: Iterable
        All keys in the layer.
    """
    if new_annotations is None:
        return

    expanded = {}
    keys_stringified = False

    # Expand the new annotations across the keyset
    for a, v in new_annotations.items():
        if type(v) is dict and "__expanded_annotations__" in v:
            # Maybe do a destructive update for efficiency?
            v = v.copy()
            del v["__expanded_annotations__"]
            expanded[a] = v
        else:
            if not keys_stringified:
                keys = [stringify(k) for k in keys]
                keys_stringified = True
            expanded[a] = dict.fromkeys(keys, v)

    # Merge the expanded annotations with the existing annotations mapping
    for k, v in expanded.items():
        v.update(annotations.get(k, {}))
    annotations.update(expanded)
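# Hedged illustration (not part of the function above): a minimal sketch of the
# merge semantics, showing that values already present in ``annotations`` win
# over newly expanded ones for the same key, while plain values are expanded
# over the whole keyset. The toy data is an assumption for this example only.
def _example_annotations_unpack():
    annotations = {"priority": {"('x', 0)": 5}}
    new_annotations = {
        "priority": {"('x', 0)": 0, "('x', 1)": 1, "__expanded_annotations__": True},
        "retries": 2,
    }
    keys = ["('x', 0)", "('x', 1)"]

    expanded = {}
    for a, v in new_annotations.items():
        if type(v) is dict and "__expanded_annotations__" in v:
            v = v.copy()
            del v["__expanded_annotations__"]
            expanded[a] = v
        else:
            expanded[a] = dict.fromkeys(keys, v)

    for k, v in expanded.items():
        v.update(annotations.get(k, {}))
    annotations.update(expanded)

    assert annotations["priority"] == {"('x', 0)": 5, "('x', 1)": 1}
    assert annotations["retries"] == {"('x', 0)": 2, "('x', 1)": 2}
    return annotations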
def delete(self, comm=None, name=None):
    with log_errors():
        out = self.datasets.pop(name, {"keys": []})
        self.scheduler.client_releases_keys(
            out["keys"], "published-%s" % stringify(name)
        )
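# Hedged usage sketch (an assumption, not taken from this module): the put()
# and delete() handlers above back the standard distributed.Client publish
# APIs. Requires a running client/cluster, so this is illustrative rather
# than a self-checking test.
def _example_publish_roundtrip(client):
    future = client.submit(sum, [1, 2, 3])
    client.publish_dataset(future, name="total")   # routed to put()
    assert "total" in client.list_datasets()
    result = client.get_dataset("total")
    client.unpublish_dataset("total")              # routed to delete()
    return result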
def unpack_remotedata(o, byte_keys=False, myset=None):
    """Unpack WrappedKey objects from collection

    Returns original collection and set of all found WrappedKey objects

    Examples
    --------
    >>> rd = WrappedKey('mykey')
    >>> unpack_remotedata(1)
    (1, set())
    >>> unpack_remotedata(())
    ((), set())
    >>> unpack_remotedata(rd)
    ('mykey', {WrappedKey('mykey')})
    >>> unpack_remotedata([1, rd])
    ([1, 'mykey'], {WrappedKey('mykey')})
    >>> unpack_remotedata({1: rd})
    ({1: 'mykey'}, {WrappedKey('mykey')})
    >>> unpack_remotedata({1: [rd]})
    ({1: ['mykey']}, {WrappedKey('mykey')})

    Use the ``byte_keys=True`` keyword to force string keys

    >>> rd = WrappedKey(('x', 1))
    >>> unpack_remotedata(rd, byte_keys=True)
    ("('x', 1)", {WrappedKey('('x', 1)')})
    """
    if myset is None:
        # Top-level call: collect WrappedKeys into a fresh set and return both
        myset = set()
        out = unpack_remotedata(o, byte_keys, myset)
        return out, myset

    typ = type(o)

    if typ is tuple:
        if not o:
            return o
        if type(o[0]) is SubgraphCallable:
            sc = o[0]
            futures = set()
            dsk = {
                k: unpack_remotedata(v, byte_keys, futures) for k, v in sc.dsk.items()
            }
            args = tuple(unpack_remotedata(i, byte_keys, futures) for i in o[1:])
            if futures:
                myset.update(futures)
                futures = (
                    tuple(stringify(f.key) for f in futures)
                    if byte_keys
                    else tuple(f.key for f in futures)
                )
                inkeys = sc.inkeys + futures
                return (
                    (SubgraphCallable(dsk, sc.outkey, inkeys, sc.name),)
                    + args
                    + futures
                )
            else:
                return o
        else:
            return tuple(unpack_remotedata(item, byte_keys, myset) for item in o)
    if typ in collection_types:
        if not o:
            return o
        outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
        return typ(outs)
    elif typ is dict:
        if o:
            return {k: unpack_remotedata(v, byte_keys, myset) for k, v in o.items()}
        else:
            return o
    elif issubclass(typ, WrappedKey):  # TODO use type is Future
        k = o.key
        if byte_keys:
            k = stringify(k)
        myset.add(o)
        return k
    else:
        return o
def _process_key(self, key):
    if isinstance(key, list):
        key = tuple(key)  # ensure not a list from msgpack
    key = stringify(key)
    return key
def __dask_distributed_pack__(
    self,
    all_hlg_keys: Iterable[Hashable],
    known_key_dependencies: Mapping[Hashable, Set],
    client,
    client_keys: Iterable[Hashable],
) -> Any:
    """Pack the layer for scheduler communication in Distributed

    This method should pack its current state and is called by the Client when
    communicating with the Scheduler. The Scheduler will then use
    .__dask_distributed_unpack__(data, ...) to unpack the state, materialize
    the layer, and merge it into the global task graph.

    The returned state must be compatible with Distributed's scheduler, which
    means it must obey the following:
      - Serializable by msgpack (notice, msgpack converts lists to tuples)
      - All remote data must be unpacked (see unpack_remotedata())
      - All keys must be converted to strings now or when unpacking
      - All tasks must be serialized (see dumps_task())

    The default implementation materializes the layer, thus layers such as
    Blockwise and ShuffleLayer should implement a specialized pack and unpack
    function in order to avoid materialization.

    Parameters
    ----------
    all_hlg_keys: Iterable[Hashable]
        All keys in the high level graph
    known_key_dependencies: Mapping[Hashable, Set]
        Already known dependencies
    client: distributed.Client
        The client calling this function.
    client_keys : Iterable[Hashable]
        List of keys requested by the client.

    Returns
    -------
    state: Object serializable by msgpack
        Scheduler compatible state of the layer
    """
    from distributed.client import Future
    from distributed.utils import CancelledError
    from distributed.utils_comm import subs_multiple, unpack_remotedata
    from distributed.worker import dumps_task

    dsk = dict(self)

    # Find aliases not in `client_keys` and substitute all matching keys
    # with their Futures
    future_aliases = {
        k: v
        for k, v in dsk.items()
        if isinstance(v, Future) and k not in client_keys
    }
    if future_aliases:
        dsk = subs_multiple(dsk, future_aliases)

    # Remove `Future` objects from graph and note any future dependencies
    dsk2 = {}
    fut_deps = {}
    for k, v in dsk.items():
        dsk2[k], futs = unpack_remotedata(v, byte_keys=True)
        if futs:
            fut_deps[k] = futs
    dsk = dsk2

    # Check that any collected futures are valid
    unpacked_futures = set.union(*fut_deps.values()) if fut_deps else set()
    for future in unpacked_futures:
        if future.client is not client:
            raise ValueError(
                "Inputs contain futures that were created by another client."
            )
        if stringify(future.key) not in client.futures:
            raise CancelledError(stringify(future.key))

    # Calculate dependencies without re-calculating already known dependencies
    # - Start with known dependencies
    dependencies = ensure_dict(known_key_dependencies, copy=True)

    # - Remove aliases for any tasks that depend on both an alias and a future.
    #   These can only be found in the known_key_dependencies cache, since
    #   any dependencies computed in this method would have already had the
    #   aliases removed.
    if future_aliases:
        alias_keys = set(future_aliases)
        dependencies = {k: v - alias_keys for k, v in dependencies.items()}

    # - Add in deps for any missing keys
    missing_keys = dsk.keys() - dependencies.keys()
    dependencies.update(
        (k, keys_in_tasks(all_hlg_keys, [dsk[k]], as_list=False))
        for k in missing_keys
    )

    # - Add in deps for any tasks that depend on futures
    for k, futures in fut_deps.items():
        if futures:
            d = ensure_set(dependencies[k], copy=True)
            d.update(f.key for f in futures)
            dependencies[k] = d

    # The scheduler expects all keys to be strings
    dependencies = {
        stringify(k): {stringify(dep) for dep in deps}
        for k, deps in dependencies.items()
    }

    merged_hlg_keys = all_hlg_keys | dsk.keys()
    dsk = {
        stringify(k): stringify(v, exclusive=merged_hlg_keys)
        for k, v in dsk.items()
    }
    dsk = toolz.valmap(dumps_task, dsk)
    return {"dsk": dsk, "dependencies": dependencies}
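# Hedged sketch (toy data; an assumption for illustration, not library code):
# the tail of __dask_distributed_pack__ boils down to stringifying every key,
# stringifying task references only for keys that belong to the graph, and
# reporting dependencies as sets of strings. With ``str`` standing in for
# dask's ``stringify`` on simple tuple keys:
def _example_packed_state_shape():
    dsk = {("x", 0): 1, ("x", 1): (sum, [("x", 0), 1])}
    dependencies = {("x", 0): set(), ("x", 1): {("x", 0)}}

    packed_deps = {str(k): {str(d) for d in deps} for k, deps in dependencies.items()}
    assert packed_deps == {"('x', 0)": set(), "('x', 1)": {"('x', 0)"}}
    # The real method additionally runs each task through
    # stringify(..., exclusive=graph_keys) and dumps_task() before returning
    # {"dsk": ..., "dependencies": ...}.
    return {"dsk": dsk, "dependencies": packed_deps}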