def __dask_distributed_unpack__(packed_hlg, annotations: Mapping[str, Any]) -> Dict:
    """Rebuild a low-level graph from a packed high level graph

    The packed graph is decoded with ``loads_msgpack`` and every layer
    is then unpacked by the class it names, through that class'
    ``__dask_distributed_unpack__`` and
    ``__dask_distributed_annotations_unpack__`` hooks. A layer whose
    ``"__module__"`` entry is ``None`` uses the hooks of ``Layer``.

    Parameters
    ----------
    packed_hlg : list of header and payload
        Packed high level graph serialized by dumps_msgpack
    annotations : dict
        Top-level annotations. They are combined with each layer's own
        annotations before being handed to the layer's annotation
        hook; on a key conflict the top-level value takes precedence.

    Returns
    -------
    unpacked-graph: dict
        dsk: Dict[str, Any]
            Materialized (stringified) graph of all nodes in the high level graph
        deps: Dict[str, set]
            Dependencies of each key in `dsk`
        annotations: Dict[str, Any]
            Annotations for `dsk`
    """
    from distributed.protocol.core import loads_msgpack
    from distributed.protocol.serialize import import_allowed_module

    hlg = loads_msgpack(*packed_hlg)

    graph = {}
    dependencies = {}
    unpacked_annotations = {}

    # Layers are expected to arrive in topological order
    for layer in hlg["layers"]:
        # Resolve the class that provides the unpack hooks
        module_name = layer["__module__"]
        if module_name is None:
            layer_cls = Layer  # Default implementation
        else:
            layer_cls = getattr(import_allowed_module(module_name), layer["__name__"])
        unpack_state = layer_cls.__dask_distributed_unpack__
        unpack_anno = layer_cls.__dask_distributed_annotations_unpack__

        # Turn the packed state into tasks and key dependencies
        unpacked = unpack_state(layer["state"], graph, dependencies)
        layer_graph = unpacked["dsk"]
        graph.update(layer_graph)
        for key, new_deps in unpacked["deps"].items():
            known = dependencies.get(key, set())
            dependencies[key] = known | new_deps

        # Combine top-level and per-layer annotations
        own_annotations = layer["annotations"]
        if not annotations:
            combined = own_annotations or None
        elif not own_annotations:
            combined = annotations
        else:
            combined = {**own_annotations, **annotations}
        unpack_anno(unpacked_annotations, combined, layer_graph.keys())

    return {
        "dsk": graph,
        "deps": dependencies,
        "annotations": unpacked_annotations,
    }
def __dask_distributed_unpack__(hlg: dict) -> dict:
    """Rebuild a low-level graph from packed high level graph layers

    Each entry of ``hlg["layers"]`` is unpacked by the class it names,
    through that class' ``__dask_distributed_unpack__`` and
    ``__dask_distributed_annotations_unpack__`` hooks. A layer whose
    ``"__module__"`` entry is ``None`` uses the hooks of ``Layer``.

    Parameters
    ----------
    hlg: dict
        Packed high level graph layers

    Returns
    -------
    unpacked-graph: dict
        dsk: Dict[str, Any]
            Materialized (stringified) graph of all nodes in the high level graph
        deps: Dict[str, set]
            Dependencies of each key in `dsk`
        annotations: Dict[str, Any]
            Annotations for `dsk`
    """
    from distributed.protocol.serialize import import_allowed_module

    graph = {}
    dependencies = {}
    unpacked_annotations = {}

    # Layers are expected to arrive in topological order
    for layer in hlg["layers"]:
        # Resolve the class that provides the unpack hooks
        module_name = layer["__module__"]
        if module_name is None:
            layer_cls = Layer  # Default implementation
        else:
            layer_cls = getattr(import_allowed_module(module_name), layer["__name__"])
        unpack_state = layer_cls.__dask_distributed_unpack__
        unpack_anno = layer_cls.__dask_distributed_annotations_unpack__

        # Turn the packed state into tasks and key dependencies
        unpacked = unpack_state(layer["state"], graph, dependencies)
        layer_graph = unpacked["dsk"]
        graph.update(layer_graph)
        for key, new_deps in unpacked["deps"].items():
            known = dependencies.get(key, set())
            dependencies[key] = known | new_deps

        # Let the layer register its annotations for the keys it produced
        unpack_anno(unpacked_annotations, layer["annotations"], layer_graph.keys())

    return {
        "dsk": graph,
        "deps": dependencies,
        "annotations": unpacked_annotations,
    }
def make_blockwise_graph(
    func,
    output,
    out_indices,
    *arrind_pairs,
    numblocks=None,
    concatenate=None,
    new_axes=None,
    output_blocks=None,
    dims=None,
    deserializing=False,
    func_future_args=None,
    return_key_deps=False,
    io_deps=None,
    **kwargs,
):
    """Tensor operation

    Applies a function, ``func``, across blocks from many different input
    collections.  We arrange the pattern with which those blocks interact with
    sets of matching indices.  E.g.::

        make_blockwise_graph(func, 'z', 'i', 'x', 'i', 'y', 'i')

    yield an embarrassingly parallel communication pattern and is read as

        $$ z_i = func(x_i, y_i) $$

    More complex patterns may emerge, including multiple indices::

        make_blockwise_graph(func, 'z', 'ij', 'x', 'ij', 'y', 'ji')

        $$ z_{ij} = func(x_{ij}, y_{ji}) $$

    Indices missing in the output but present in the inputs results in many
    inputs being sent to one function (see examples).

    Parameters
    ----------
    func
        Function placed at the head of every generated task.
    output
        Name used as the first element of every output key.
    out_indices
        Index of the output; one output block is generated for every
        combination of ``range(dims[i]) for i in out_indices`` unless
        ``output_blocks`` is given.
    *arrind_pairs
        Alternating ``arg, index`` values, consumed in pairs. An index of
        ``None`` marks ``arg`` as a literal that is passed through as-is.
    numblocks
        Required. Mapping from input name to its number of blocks along
        each axis (see examples).
    concatenate
        If truthy, lists of blocks produced by a contraction are wrapped
        as ``(concatenate, blocks, axes)``. The value ``True`` is replaced
        by ``dask.array.core.concatenate_axes``.
    new_axes
        Forwarded to ``_make_dims``; defaults to an empty dict.
    output_blocks
        Iterable of output coordinate tuples to construct. When falsy,
        all output blocks are constructed.
    dims
        Pre-computed mapping from index to number of blocks. When falsy
        it is computed with ``_make_dims``.
    deserializing
        If True, collection keys are passed through
        ``stringify_collection_keys``, task arguments are serialized with
        ``distributed.worker.warn_dumps`` and every task is emitted as a
        dict instead of a tuple.
    func_future_args
        Only read when ``deserializing`` is True: its items are added to
        the dependencies and appended to the arguments of every task.
    return_key_deps
        If True, also return a dict mapping every output key to the set
        of keys it depends on.
    io_deps
        Mapping from placeholder input name to either a mapping
        ``{block index: argument}`` or a tuple whose first element is the
        dotted path of a mapping class and whose remaining elements are
        used to construct it.
    **kwargs
        Keyword arguments for ``func``; collections found in them by
        ``unpack_collections`` are merged into the returned graph.

    Returns
    -------
    dsk : dict
        The generated task graph.
    key_deps : dict
        Only returned (as ``(dsk, key_deps)``) when ``return_key_deps``
        is True.

    Raises
    ------
    ValueError
        If ``numblocks`` is not given.

    Examples
    --------
    Simple embarrassing map operation

    >>> inc = lambda x: x + 1
    >>> make_blockwise_graph(inc, 'z', 'ij', 'x', 'ij', numblocks={'x': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (inc, ('x', 0, 0)),
     ('z', 0, 1): (inc, ('x', 0, 1)),
     ('z', 1, 0): (inc, ('x', 1, 0)),
     ('z', 1, 1): (inc, ('x', 1, 1))}

    Simple operation on two datasets

    >>> add = lambda x, y: x + y
    >>> make_blockwise_graph(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (2, 2),
    ...                                                      'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Operation that flips one of the datasets

    >>> addT = lambda x, y: x + y.T  # Transpose each chunk
    >>> #                                        z_ij ~ x_ij y_ji
    >>> #               ..         ..         .. notice swap
    >>> make_blockwise_graph(addT, 'z', 'ij', 'x', 'ij', 'y', 'ji', numblocks={'x': (2, 2),
    ...                                                       'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (addT, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (addT, ('x', 0, 1), ('y', 1, 0)),
     ('z', 1, 0): (addT, ('x', 1, 0), ('y', 0, 1)),
     ('z', 1, 1): (addT, ('x', 1, 1), ('y', 1, 1))}

    Dot product with contraction over ``j`` index.  Yields list arguments

    >>> make_blockwise_graph(dotmany, 'z', 'ik', 'x', 'ij', 'y', 'jk', numblocks={'x': (2, 2),
    ...                                                          'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (dotmany, [('x', 0, 0), ('x', 0, 1)],
                            [('y', 0, 0), ('y', 1, 0)]),
     ('z', 0, 1): (dotmany, [('x', 0, 0), ('x', 0, 1)],
                            [('y', 0, 1), ('y', 1, 1)]),
     ('z', 1, 0): (dotmany, [('x', 1, 0), ('x', 1, 1)],
                            [('y', 0, 0), ('y', 1, 0)]),
     ('z', 1, 1): (dotmany, [('x', 1, 0), ('x', 1, 1)],
                            [('y', 0, 1), ('y', 1, 1)])}

    Pass ``concatenate=True`` to concatenate arrays ahead of time

    >>> make_blockwise_graph(f, 'z', 'i', 'x', 'ij', 'y', 'ij', concatenate=True,
    ...     numblocks={'x': (2, 2), 'y': (2, 2,)})  # doctest: +SKIP
    {('z', 0): (f, (concatenate_axes, [('x', 0, 0), ('x', 0, 1)], (1,)),
                   (concatenate_axes, [('y', 0, 0), ('y', 0, 1)], (1,))),
     ('z', 1): (f, (concatenate_axes, [('x', 1, 0), ('x', 1, 1)], (1,)),
                   (concatenate_axes, [('y', 1, 0), ('y', 1, 1)], (1,)))}

    Supports Broadcasting rules

    >>> make_blockwise_graph(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (1, 2),
    ...                                                      'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 0, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 0, 1), ('y', 1, 1))}

    Support keyword arguments with apply

    >>> def f(a, b=0): return a + b
    >>> make_blockwise_graph(f, 'z', 'i', 'x', 'i', numblocks={'x': (2,)}, b=10)  # doctest: +SKIP
    {('z', 0): (apply, f, [('x', 0)], {'b': 10}),
     ('z', 1): (apply, f, [('x', 1)], {'b': 10})}

    Include literals by indexing with ``None``

    >>> make_blockwise_graph(add, 'z', 'i', 'x', 'i', 100, None, numblocks={'x': (2,)})  # doctest: +SKIP
    {('z', 0): (add, ('x', 0), 100),
     ('z', 1): (add, ('x', 1), 100)}

    See Also
    --------
    dask.array.blockwise
    dask.blockwise.blockwise
    """
    if numblocks is None:
        raise ValueError("Missing required numblocks argument.")
    new_axes = new_axes or {}
    io_deps = io_deps or {}
    # Group the flat ``arg, index, arg, index, ...`` sequence into pairs
    argpairs = list(toolz.partition(2, arrind_pairs))

    # ``key_deps`` only exists when the caller asked for it
    if return_key_deps:
        key_deps = {}

    # ``distributed`` is only imported when deserializing; otherwise the
    # plain ``importlib.import_module`` is used under the same local name
    if deserializing:
        from distributed.protocol.serialize import import_allowed_module
        from distributed.worker import dumps_function, warn_dumps
    else:
        from importlib import import_module as import_allowed_module

    # Check if there are tuple arguments in `io_deps`.
    # If so, we must use this tuple to construct the actual
    # IO-argument mapping.
    io_arg_mappings = {}
    for arg, val in io_deps.items():
        if isinstance(val, tuple):
            _args = io_deps[arg]
            # First element is the dotted path "<module>.<attribute>"
            module_name, attr_name = _args[0].rsplit(".", 1)
            io_dep_map = getattr(import_allowed_module(module_name), attr_name)
            if deserializing:
                _args = io_dep_map.__dask_distributed_unpack__(*_args)
            # The dotted path itself is not a constructor argument
            io_arg_mappings[arg] = io_dep_map(*_args[1:])

    # ``True`` selects the default concatenation function; any other
    # truthy value is used as given
    if concatenate is True:
        from dask.array.core import concatenate_axes as concatenate

    # Dictionary mapping {i: 3, j: 4, ...} for i, j, ... the dimensions
    dims = dims or _make_dims(argpairs, numblocks, new_axes)

    # Generate the abstract "plan" before constructing
    # the actual graph
    (coord_maps, concat_axes, dummies) = _get_coord_mapping(
        dims,
        output,
        out_indices,
        numblocks,
        argpairs,
        concatenate,
    )

    # Unpack delayed objects in kwargs
    # (``kwargs2`` is only bound, and only read, when ``kwargs`` is non-empty)
    dsk2 = {}
    if kwargs:
        task, dsk2 = unpack_collections(kwargs)
        if dsk2:
            kwargs2 = task
        else:
            kwargs2 = kwargs

    # Apply Culling.
    # Only need to construct the specified set of output blocks
    # NOTE(review): a falsy ``output_blocks`` (including an empty
    # collection) falls back to generating every output block
    output_blocks = output_blocks or itertools.product(
        *[range(dims[i]) for i in out_indices])

    dsk = {}
    # Create argument lists
    for out_coords in output_blocks:
        deps = set()
        # Extend the output coordinates with the ``dummies`` returned by
        # ``_get_coord_mapping`` so that ``cmap`` can index into them
        # (presumably slots for indices absent from the output - confirm
        # against that helper)
        coords = out_coords + dummies
        args = []
        for cmap, axes, (arg, ind) in zip(coord_maps, concat_axes, argpairs):
            if ind is None:
                # Literal argument: passed through, not a dependency
                if deserializing:
                    args.append(stringify_collection_keys(arg))
                else:
                    args.append(arg)
            else:
                arg_coords = tuple(coords[c] for c in cmap)
                if axes:
                    # Several input blocks feed this output block
                    tups = lol_product((arg, ), arg_coords)
                    if arg not in io_deps:
                        deps.update(flatten(tups))

                    if concatenate:
                        tups = (concatenate, tups, axes)
                else:
                    # Exactly one input block feeds this output block
                    tups = (arg, ) + arg_coords
                    if arg not in io_deps:
                        deps.add(tups)
                # Replace "place-holder" IO keys with "real" args
                if arg in io_deps:
                    # We don't want to stringify keys for args
                    # we are replacing here
                    idx = tups[1:]
                    if arg in io_arg_mappings:
                        args.append(io_arg_mappings[arg][idx])
                    else:
                        # The required inputs for the IO function
                        # are specified explicitly in `io_deps`
                        # (Or the index is the only required arg)
                        args.append(io_deps[arg].get(idx, idx))
                elif deserializing:
                    args.append(stringify_collection_keys(tups))
                else:
                    args.append(tups)
        out_key = (output, ) + out_coords

        if deserializing:
            # NOTE(review): ``func_future_args`` defaults to None, which
            # ``set.update`` and ``list`` reject; callers that set
            # ``deserializing=True`` must pass an iterable here
            deps.update(func_future_args)
            args += list(func_future_args)
            if kwargs:
                # NOTE(review): this branch serializes ``apply`` but never
                # references ``func`` - confirm that kwargs are never
                # combined with ``deserializing=True`` or that this is
                # intended
                val = {
                    "function": dumps_function(apply),
                    "args": warn_dumps(args),
                    "kwargs": warn_dumps(kwargs2),
                }
            else:
                # ``func`` is stored as given, without serializing it here
                val = {"function": func, "args": warn_dumps(args)}
        else:
            if kwargs:
                val = (apply, func, args, kwargs2)
            else:
                args.insert(0, func)
                val = tuple(args)
        dsk[out_key] = val
        if return_key_deps:
            key_deps[out_key] = deps

    # Merge in the graph of any collections found inside ``kwargs``
    if dsk2:
        dsk.update(ensure_dict(dsk2))

    if return_key_deps:
        return dsk, key_deps
    else:
        return dsk
def __dask_distributed_pack__(
    self, all_hlg_keys, known_key_dependencies, client, client_keys
):
    """Build the serializable state dictionary for this layer

    The layer's graph is fused around ``self.output``, literal inputs
    are embedded into it, and the result is wrapped in a
    ``SubgraphCallable`` that is serialized with ``dumps_function``.
    Futures found in the callable or in the index arguments are
    extracted with ``unpack_remotedata`` and validated against
    ``client``.

    Parameters
    ----------
    all_hlg_keys
        Not used by this implementation.
    known_key_dependencies
        Not used by this implementation.
    client
        Client that every extracted future must belong to; its
        ``futures`` collection is consulted to detect cancelled keys.
    client_keys
        Not used by this implementation.

    Returns
    -------
    state : dict
        Packed layer state, including the serialized function, the
        unpacked indices, the stringified keys of futures found in the
        indices (``"global_dependencies"``) and the packed ``io_deps``.

    Raises
    ------
    ValueError
        If an extracted future was created by a different client.
    CancelledError
        If an extracted future's key is not in ``client.futures``.
    """
    from distributed.protocol.serialize import import_allowed_module
    from distributed.utils import CancelledError
    from distributed.utils_comm import unpack_remotedata
    from distributed.worker import dumps_function

    tokens = tuple(blockwise_token(i) for i in range(len(self.indices)))
    fused_graph, _ = fuse(self.dsk, [self.output])

    # Literals are embedded in the fused graph; everything else stays
    # an argument of the callable
    argument_tokens = []
    argument_pairs = []
    for token, (value, index) in zip(tokens, self.indices):
        if index is None:  # Literal
            fused_graph[token] = value
            continue
        argument_tokens.append(token)
        argument_pairs.append((value, index))

    wrapped = (SubgraphCallable(fused_graph, self.output, tuple(argument_tokens)),)
    wrapped, callable_futures = unpack_remotedata(wrapped, byte_keys=True)

    func = dumps_function(wrapped[0])
    func_future_args = wrapped[1:]

    flat_indices = list(toolz.concat(argument_pairs))
    indices, index_futures = unpack_remotedata(flat_indices, byte_keys=True)

    # Every extracted future must be live and owned by `client`
    for future in itertools.chain(callable_futures, index_futures):
        if future.client is not client:
            raise ValueError(
                "Inputs contain futures that were created by another client."
            )
        future_key = stringify(future.key)
        if future_key not in client.futures:
            raise CancelledError(future_key)

    # All blockwise tasks will depend on the futures in `indices`
    global_dependencies = {stringify(fut.key) for fut in index_futures}

    # Handle `io_deps` serialization.
    # A plain dict is passed through untouched and must already be
    # msgpack serializable. A tuple names a `BlockwiseIODeps` mapping
    # subclass (dotted path first) whose own
    # `__dask_distributed_pack__` is used instead.
    packed_io_deps = {}
    for name, input_map in self.io_deps.items():
        if not isinstance(input_map, tuple):
            packed_io_deps[name] = input_map
            continue
        module_name, attr_name = input_map[0].rsplit(".", 1)
        mapping_cls = getattr(import_allowed_module(module_name), attr_name)
        packed_io_deps[name] = mapping_cls.__dask_distributed_pack__(*input_map)

    return {
        "output": self.output,
        "output_indices": self.output_indices,
        "func": func,
        "func_future_args": func_future_args,
        "global_dependencies": global_dependencies,
        "indices": indices,
        "is_list": [isinstance(item, list) for item in indices],
        "numblocks": self.numblocks,
        "concatenate": self.concatenate,
        "new_axes": self.new_axes,
        "output_blocks": self.output_blocks,
        "dims": self.dims,
        "io_deps": packed_io_deps,
    }