Example #1
    def __dask_distributed_unpack__(packed_hlg, annotations: Mapping[str, Any]) -> Dict:
        """Unpack the high level graph for Scheduler -> Worker communication

        The approach is to delegate the unpacking to each layer in the high level graph
        by calling ..._unpack__() and ..._annotations_unpack__()
        on each layer.

        Parameters
        ----------
        packed_hlg : list of header and payload
            Packed high level graph serialized by dumps_msgpack
        annotations : dict
            A top-level annotations object which may be partially populated,
            and which may be further filled by annotations from the layers
            of the packed_hlg.

        Returns
        -------
        unpacked-graph: dict
            dsk: Dict[str, Any]
                Materialized (stringified) graph of all nodes in the high level graph
            deps: Dict[str, set]
                Dependencies of each key in `dsk`
            annotations: Dict[str, Any]
                Annotations for `dsk`
        """
        from distributed.protocol.core import loads_msgpack
        from distributed.protocol.serialize import import_allowed_module

        hlg = loads_msgpack(*packed_hlg)
        dsk = {}
        deps = {}
        anno = {}

        # Unpack each layer (in topological order)
        for layer in hlg["layers"]:
            # Find the unpack functions
            if layer["__module__"] is None:  # Default implementation
                unpack_state = Layer.__dask_distributed_unpack__
                unpack_anno = Layer.__dask_distributed_annotations_unpack__
            else:
                mod = import_allowed_module(layer["__module__"])
                cls = getattr(mod, layer["__name__"])
                unpack_state = cls.__dask_distributed_unpack__
                unpack_anno = cls.__dask_distributed_annotations_unpack__

            # Unpack state into a graph and key dependencies
            unpacked_layer = unpack_state(layer["state"], dsk, deps)
            dsk.update(unpacked_layer["dsk"])
            for k, v in unpacked_layer["deps"].items():
                deps[k] = deps.get(k, set()) | v

            # Unpack the annotations
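            # Note: in this merge, top-level annotations take precedence over
            # matching per-layer annotations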
            if annotations and layer["annotations"]:
                layer_annotations = {**layer["annotations"], **annotations}
            else:
                layer_annotations = annotations or layer["annotations"] or None
            unpack_anno(anno, layer_annotations, unpacked_layer["dsk"].keys())

        return {"dsk": dsk, "deps": deps, "annotations": anno}
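A minimal sketch of a layer type that satisfies the two hooks the loop above calls. The calling conventions are taken from how `unpack_state` and `unpack_anno` are invoked here; the class itself, its state layout, and the annotation expansion are illustrative assumptions, not an actual dask layer:

class ToyLayer:  # hypothetical; real layers subclass dask.highlevelgraph.Layer
    @classmethod
    def __dask_distributed_unpack__(cls, state, dsk, dependencies):
        # `state` is whatever the matching __dask_distributed_pack__ produced;
        # return the materialized tasks and their key dependencies
        return {
            "dsk": dict(state["tasks"]),
            "deps": {k: set(v) for k, v in state["deps"].items()},
        }

    @classmethod
    def __dask_distributed_annotations_unpack__(cls, annotations, new_annotations, keys):
        # Expand layer-level annotations to one entry per output key,
        # mutating the top-level `annotations` mapping in place
        if new_annotations:
            for name, value in new_annotations.items():
                annotations.setdefault(name, {}).update(dict.fromkeys(keys, value))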
Example #2
    def __dask_distributed_unpack__(hlg: dict) -> dict:
        """Unpack the high level graph for Scheduler -> Worker communication

        The approach is to delegate the unpacking to each layer in the high level graph
        by calling ..._unpack__() and ..._annotations_unpack__()
        on each layer.

        Parameters
        ----------
        hlg: dict
            Packed high level graph layers

        Returns
        -------
        unpacked-graph: dict
            dsk: Dict[str, Any]
                Materialized (stringified) graph of all nodes in the high level graph
            deps: Dict[str, set]
                Dependencies of each key in `dsk`
            annotations: Dict[str, Any]
                Annotations for `dsk`
        """
        from distributed.protocol.serialize import import_allowed_module

        dsk = {}
        deps = {}
        anno = {}

        # Unpack each layer (in topological order)
        for layer in hlg["layers"]:
            # Find the unpack functions
            if layer["__module__"] is None:  # Default implementation
                unpack_state = Layer.__dask_distributed_unpack__
                unpack_anno = Layer.__dask_distributed_annotations_unpack__
            else:
                mod = import_allowed_module(layer["__module__"])
                cls = getattr(mod, layer["__name__"])
                unpack_state = cls.__dask_distributed_unpack__
                unpack_anno = cls.__dask_distributed_annotations_unpack__

            # Unpack state into a graph and key dependencies
            unpacked_layer = unpack_state(layer["state"], dsk, deps)
            dsk.update(unpacked_layer["dsk"])
            for k, v in unpacked_layer["deps"].items():
                deps[k] = deps.get(k, set()) | v

            # Unpack the annotations
            unpack_anno(anno, layer["annotations"],
                        unpacked_layer["dsk"].keys())

        return {"dsk": dsk, "deps": deps, "annotations": anno}
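For reference, the `hlg` dict consumed by this variant has roughly the shape sketched below. This is inferred from the loop above (each layer record carries `__module__`, `__name__`, `state`, and `annotations`); the concrete `state` payload is layer-specific and is left as a placeholder:

hlg = {
    "layers": [
        {
            "__module__": None,        # None -> fall back to the base Layer hooks
            "__name__": None,
            "state": ...,              # opaque, produced by the layer's pack method
            "annotations": {"priority": 1},
        },
    ],
}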
Example #3
def make_blockwise_graph(
    func,
    output,
    out_indices,
    *arrind_pairs,
    numblocks=None,
    concatenate=None,
    new_axes=None,
    output_blocks=None,
    dims=None,
    deserializing=False,
    func_future_args=None,
    return_key_deps=False,
    io_deps=None,
    **kwargs,
):
    """Tensor operation

    Applies a function, ``func``, across blocks from many different input
    collections.  We arrange the pattern with which those blocks interact with
    sets of matching indices.  E.g.::

        make_blockwise_graph(func, 'z', 'i', 'x', 'i', 'y', 'i')

    yields an embarrassingly parallel communication pattern and is read as

        $$ z_i = func(x_i, y_i) $$

    More complex patterns may emerge, including multiple indices::

        make_blockwise_graph(func, 'z', 'ij', 'x', 'ij', 'y', 'ji')

        $$ z_{ij} = func(x_{ij}, y_{ji}) $$

    Indices missing in the output but present in the inputs result in many
    inputs being sent to one function (see examples).

    Examples
    --------

    Simple embarrassingly parallel map operation

    >>> inc = lambda x: x + 1
    >>> make_blockwise_graph(inc, 'z', 'ij', 'x', 'ij', numblocks={'x': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (inc, ('x', 0, 0)),
     ('z', 0, 1): (inc, ('x', 0, 1)),
     ('z', 1, 0): (inc, ('x', 1, 0)),
     ('z', 1, 1): (inc, ('x', 1, 1))}

    Simple operation on two datasets

    >>> add = lambda x, y: x + y
    >>> make_blockwise_graph(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (2, 2),
    ...                                                      'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Operation that flips one of the datasets

    >>> addT = lambda x, y: x + y.T  # Transpose each chunk
    >>> #                                        z_ij ~ x_ij y_ji
    >>> #               ..         ..         .. notice swap
    >>> make_blockwise_graph(addT, 'z', 'ij', 'x', 'ij', 'y', 'ji', numblocks={'x': (2, 2),
    ...                                                       'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (addT, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (addT, ('x', 0, 1), ('y', 1, 0)),
     ('z', 1, 0): (addT, ('x', 1, 0), ('y', 0, 1)),
     ('z', 1, 1): (addT, ('x', 1, 1), ('y', 1, 1))}

    Dot product with contraction over ``j`` index.  Yields list arguments

    >>> make_blockwise_graph(dotmany, 'z', 'ik', 'x', 'ij', 'y', 'jk', numblocks={'x': (2, 2),
    ...                                                          'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (dotmany, [('x', 0, 0), ('x', 0, 1)],
                            [('y', 0, 0), ('y', 1, 0)]),
     ('z', 0, 1): (dotmany, [('x', 0, 0), ('x', 0, 1)],
                            [('y', 0, 1), ('y', 1, 1)]),
     ('z', 1, 0): (dotmany, [('x', 1, 0), ('x', 1, 1)],
                            [('y', 0, 0), ('y', 1, 0)]),
     ('z', 1, 1): (dotmany, [('x', 1, 0), ('x', 1, 1)],
                            [('y', 0, 1), ('y', 1, 1)])}

    Pass ``concatenate=True`` to concatenate arrays ahead of time

    >>> make_blockwise_graph(f, 'z', 'i', 'x', 'ij', 'y', 'ij', concatenate=True,
    ...     numblocks={'x': (2, 2), 'y': (2, 2,)})  # doctest: +SKIP
    {('z', 0): (f, (concatenate_axes, [('x', 0, 0), ('x', 0, 1)], (1,)),
                   (concatenate_axes, [('y', 0, 0), ('y', 0, 1)], (1,)))
     ('z', 1): (f, (concatenate_axes, [('x', 1, 0), ('x', 1, 1)], (1,)),
                   (concatenate_axes, [('y', 1, 0), ('y', 1, 1)], (1,)))}

    Supports broadcasting rules

    >>> make_blockwise_graph(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (1, 2),
    ...                                                      'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 0, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 0, 1), ('y', 1, 1))}

    Supports keyword arguments with apply

    >>> def f(a, b=0): return a + b
    >>> make_blockwise_graph(f, 'z', 'i', 'x', 'i', numblocks={'x': (2,)}, b=10)  # doctest: +SKIP
    {('z', 0): (apply, f, [('x', 0)], {'b': 10}),
     ('z', 1): (apply, f, [('x', 1)], {'b': 10})}

    Include literals by indexing with ``None``

    >>> make_blockwise_graph(add, 'z', 'i', 'x', 'i', 100, None, numblocks={'x': (2,)})  # doctest: +SKIP
    {('z', 0): (add, ('x', 0), 100),
     ('z', 1): (add, ('x', 1), 100)}


    See Also
    --------
    dask.array.blockwise
    dask.blockwise.blockwise
    """

    if numblocks is None:
        raise ValueError("Missing required numblocks argument.")
    new_axes = new_axes or {}
    io_deps = io_deps or {}
    argpairs = list(toolz.partition(2, arrind_pairs))

    if return_key_deps:
        key_deps = {}

    if deserializing:
        from distributed.protocol.serialize import import_allowed_module
        from distributed.worker import dumps_function, warn_dumps
    else:
        from importlib import import_module as import_allowed_module

    # Check if there are tuple arguments in `io_deps`.
    # If so, we must use this tuple to construct the actual
    # IO-argument mapping.
    io_arg_mappings = {}
    for arg, val in io_deps.items():
        if isinstance(val, tuple):
            _args = io_deps[arg]
            module_name, attr_name = _args[0].rsplit(".", 1)
            io_dep_map = getattr(import_allowed_module(module_name), attr_name)
            if deserializing:
                _args = io_dep_map.__dask_distributed_unpack__(*_args)
            io_arg_mappings[arg] = io_dep_map(*_args[1:])

    if concatenate is True:
        from dask.array.core import concatenate_axes as concatenate

    # Dictionary mapping {i: 3, j: 4, ...} for i, j, ... the dimensions
    dims = dims or _make_dims(argpairs, numblocks, new_axes)

    # Generate the abstract "plan" before constructing
    # the actual graph
    (coord_maps, concat_axes, dummies) = _get_coord_mapping(
        dims,
        output,
        out_indices,
        numblocks,
        argpairs,
        concatenate,
    )

    # Unpack delayed objects in kwargs
    dsk2 = {}
    if kwargs:
        task, dsk2 = unpack_collections(kwargs)
        if dsk2:
            kwargs2 = task
        else:
            kwargs2 = kwargs

    # Apply Culling.
    # Only need to construct the specified set of output blocks
    output_blocks = output_blocks or itertools.product(
        *[range(dims[i]) for i in out_indices])

    dsk = {}
    # Create argument lists
    for out_coords in output_blocks:
        deps = set()
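        # `dummies` extends the output coordinates with placeholder ranges for
        # indices that appear in the inputs but not in the output (contractions)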
        coords = out_coords + dummies
        args = []
        for cmap, axes, (arg, ind) in zip(coord_maps, concat_axes, argpairs):
            if ind is None:
                if deserializing:
                    args.append(stringify_collection_keys(arg))
                else:
                    args.append(arg)
            else:
                arg_coords = tuple(coords[c] for c in cmap)
                if axes:
                    tups = lol_product((arg, ), arg_coords)
                    if arg not in io_deps:
                        deps.update(flatten(tups))

                    if concatenate:
                        tups = (concatenate, tups, axes)
                else:
                    tups = (arg, ) + arg_coords
                    if arg not in io_deps:
                        deps.add(tups)
                # Replace "place-holder" IO keys with "real" args
                if arg in io_deps:
                    # We don't want to stringify keys for args
                    # we are replacing here
                    idx = tups[1:]
                    if arg in io_arg_mappings:
                        args.append(io_arg_mappings[arg][idx])
                    else:
                        # The required inputs for the IO function
                        # are specified explicitly in `io_deps`
                        # (Or the index is the only required arg)
                        args.append(io_deps[arg].get(idx, idx))
                elif deserializing:
                    args.append(stringify_collection_keys(tups))
                else:
                    args.append(tups)
        out_key = (output, ) + out_coords

        if deserializing:
            deps.update(func_future_args)
            args += list(func_future_args)
            if kwargs:
                val = {
                    "function": dumps_function(apply),
                    "args": warn_dumps(args),
                    "kwargs": warn_dumps(kwargs2),
                }
            else:
                val = {"function": func, "args": warn_dumps(args)}
        else:
            if kwargs:
                val = (apply, func, args, kwargs2)
            else:
                args.insert(0, func)
                val = tuple(args)
        dsk[out_key] = val
        if return_key_deps:
            key_deps[out_key] = deps

    if dsk2:
        dsk.update(ensure_dict(dsk2))

    if return_key_deps:
        return dsk, key_deps
    else:
        return dsk
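Since the doctests above are skipped, here is a small runnable illustration of the same index notation through the public wrapper ``dask.array.blockwise`` (listed under See Also). It reproduces the $$ z_{ij} = func(x_{ij}, y_{ji}) $$ pattern from the ``addT`` example; the toy arrays and lambda are illustrative only:

import dask.array as da

x = da.ones((4, 4), chunks=(2, 2))
y = da.ones((4, 4), chunks=(2, 2))

# z_ij = func(x_ij, y_ji): each output block combines x's (i, j) block
# with y's block from the transposed position (j, i)
z = da.blockwise(lambda a, b: a + b.T, "ij", x, "ij", y, "ji", dtype=x.dtype)
assert (z.compute() == 2).all()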
Example #4
    def __dask_distributed_pack__(self, all_hlg_keys, known_key_dependencies,
                                  client, client_keys):
        from distributed.protocol.serialize import import_allowed_module
        from distributed.utils import CancelledError
        from distributed.utils_comm import unpack_remotedata
        from distributed.worker import dumps_function

        keys = tuple(map(blockwise_token, range(len(self.indices))))
        dsk, _ = fuse(self.dsk, [self.output])

        # Embed literals in `dsk`
        keys2 = []
        indices2 = []
        for key, (val, index) in zip(keys, self.indices):
            if index is None:  # Literal
                dsk[key] = val
            else:
                keys2.append(key)
                indices2.append((val, index))

        dsk = (SubgraphCallable(dsk, self.output, tuple(keys2)), )
        dsk, dsk_unpacked_futures = unpack_remotedata(dsk, byte_keys=True)

        func = dumps_function(dsk[0])
        func_future_args = dsk[1:]

        indices = list(toolz.concat(indices2))
        indices, indices_unpacked_futures = unpack_remotedata(indices,
                                                              byte_keys=True)

        # Check the legality of the unpacked futures
        for future in itertools.chain(dsk_unpacked_futures,
                                      indices_unpacked_futures):
            if future.client is not client:
                raise ValueError(
                    "Inputs contain futures that were created by another client."
                )
            if stringify(future.key) not in client.futures:
                raise CancelledError(stringify(future.key))

        # All blockwise tasks will depend on the futures in `indices`
        global_dependencies = {
            stringify(f.key)
            for f in indices_unpacked_futures
        }

        # Handle `io_deps` serialization.
        # If `io_deps[<collection_key>]` is just a dict, we rely
        # entirely on msgpack.  It is up to the `Blockwise` layer to
        # ensure that all arguments are msgpack serializable. To enable
        # more control over serialization, a `BlockwiseIODeps` mapping
        # subclass can be defined with the necessary
        # `__dask_distributed_{pack,unpack}__` methods.
        packed_io_deps = {}
        for name, input_map in self.io_deps.items():
            if isinstance(input_map, tuple):
                # Use the `__dask_distributed_pack__` definition for the
                # specified `BlockwiseIODeps` subclass
                module_name, attr_name = input_map[0].rsplit(".", 1)
                io_dep_map = getattr(import_allowed_module(module_name),
                                     attr_name)
                packed_io_deps[name] = io_dep_map.__dask_distributed_pack__(
                    *input_map)
            else:
                packed_io_deps[name] = input_map

        return {
            "output": self.output,
            "output_indices": self.output_indices,
            "func": func,
            "func_future_args": func_future_args,
            "global_dependencies": global_dependencies,
            "indices": indices,
            "is_list": [isinstance(x, list) for x in indices],
            "numblocks": self.numblocks,
            "concatenate": self.concatenate,
            "new_axes": self.new_axes,
            "output_blocks": self.output_blocks,
            "dims": self.dims,
            "io_deps": packed_io_deps,
        }
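As the `io_deps` comment above notes, custom serialization hooks are only consulted when the mapping is packed as a tuple whose first element is a ``"module.ClassName"`` path. A hypothetical mapping class honoring the calling convention used here and in `make_blockwise_graph` (both hooks receive the full tuple; the remainder of the tuple becomes constructor arguments) might look like the sketch below. The name `RangeIODeps`, the `npartitions` parameter, and the `"read-partition"` token are all illustrative, not a real dask API:

class RangeIODeps:  # hypothetical BlockwiseIODeps-style mapping
    """Map a block index tuple to the arguments of an IO function."""

    def __init__(self, npartitions):
        self.npartitions = npartitions

    def __getitem__(self, idx):
        # Replace the placeholder IO key with the "real" IO-function arguments
        return ("read-partition", idx[0], self.npartitions)

    @classmethod
    def __dask_distributed_pack__(cls, class_path, *args):
        # Everything after the class path must survive a msgpack round-trip
        return (class_path, *args)

    @classmethod
    def __dask_distributed_unpack__(cls, class_path, *args):
        return (class_path, *args)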