Example #1
def test_ensure_dict():
    d = {'x': 1}
    assert ensure_dict(d) is d
    hlg = HighLevelGraph.from_collections('x', d)
    assert type(ensure_dict(hlg)) is dict
    assert ensure_dict(hlg) == d

    class mydict(dict):
        pass

    md = mydict()
    md['x'] = 1
    assert type(ensure_dict(md)) is dict
    assert ensure_dict(md) == d
Example #2
def test_ensure_dict():
    d = {"x": 1}
    assert ensure_dict(d) is d

    class mydict(dict):
        pass

    d2 = ensure_dict(d, copy=True)
    d3 = ensure_dict(HighLevelGraph.from_collections("x", d))
    d4 = ensure_dict(mydict(d))

    for di in (d2, d3, d4):
        assert type(di) is dict
        assert di is not d
        assert di == d
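
Taken together, Examples #1 and #2 pin down the ensure_dict contract: a plain dict passes through by identity unless a copy is requested, while HighLevelGraph objects, dict subclasses, and other mappings are flattened into a plain dict. A minimal sketch of that contract (a hypothetical helper, not dask's actual implementation, which also merges HighLevelGraph layers directly):

from collections.abc import Mapping

def ensure_dict_sketch(d: Mapping, *, copy: bool = False) -> dict:
    """Sketch of the behavior the tests above assert."""
    if type(d) is dict:
        # Plain dicts are returned as-is unless an explicit copy is requested.
        return dict(d) if copy else d
    # HighLevelGraph, dict subclasses, and other mappings become plain dicts.
    return dict(d)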
Example #3
def test_ensure_dict():
    d = {'x': 1}
    assert ensure_dict(d) is d
    sd = ShareDict()
    sd.update(d)
    assert type(ensure_dict(sd)) is dict
    assert ensure_dict(sd) == d

    class mydict(dict):
        pass

    md = mydict()
    md['x'] = 1
    assert type(ensure_dict(md)) is dict
    assert ensure_dict(md) == d
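
ShareDict in this variant appears to be the graph container from older dask releases (dask.sharedict); later releases replaced it with HighLevelGraph, which is why this test otherwise mirrors Example #1.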
Example #4
    def to_dict(self) -> dict:
        """Efficiently convert to plain dict. This method is faster than dict(self)."""
        try:
            return self._to_dict
        except AttributeError:
            out = self._to_dict = ensure_dict(self)
            return out
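
The caching pattern above is easy to exercise in isolation. The class below is a hypothetical stand-in (a bare dict subclass rather than a real graph class), using dict(self) in place of ensure_dict:

class CachingGraph(dict):
    """Hypothetical mapping that memoizes its plain-dict conversion."""

    def to_dict(self) -> dict:
        try:
            return self._to_dict
        except AttributeError:
            out = self._to_dict = dict(self)  # stand-in for ensure_dict(self)
            return out

g = CachingGraph(x=1)
assert g.to_dict() is g.to_dict()  # the second call returns the cached dict

Note that the cache is never invalidated, so the pattern assumes the mapping no longer changes once converted.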
Example #5
def custom_delay_optimize(
    dsk: dict, keys: list, fast_functions=[], inline_patterns=[], **kwargs
) -> dict:
    """
    Custom optimization function for delayed tasks.

    By default, only task fusion is carried out.

    Parameters
    ----------
    dsk : dict
        Input dask task graph.
    keys : list
        Output task keys.
    fast_functions : list, optional
        List of fast functions to be inlined. By default `[]`.
    inline_patterns : list, optional
        List of patterns of task keys to be inlined. By default `[]`.

    Returns
    -------
    dsk : dict
        Optimized dask graph.
    """
    dsk, _ = fuse(ensure_dict(dsk), rename_keys=custom_fused_keys_renamer)
    if inline_patterns:
        dsk = inline_pattern(dsk, inline_patterns, inline_constants=False)
    if fast_functions:
        dsk = inline_functions(
            dsk,
            [],
            fast_functions=fast_functions,
        )
    return dsk
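
A function with this signature can replace dask's default delayed optimizer through the configuration system; the snippet below follows the pattern from dask's "Customizing Optimization" documentation (assuming the delayed_optimize config key is still honored; the computation is illustrative):

import dask

with dask.config.set(delayed_optimize=custom_delay_optimize):
    total = dask.delayed(sum)([1, 2, 3]).compute()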
Example #6
def optimize(dsk, keys, **kwargs):
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(core.flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    else:
        # Perform Blockwise optimizations for HLG input
        dsk = optimize_dataframe_getitem(dsk, keys=keys)
        dsk = optimize_blockwise(dsk, keys=keys)
        dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Do not perform low-level fusion unless the user has
    # specified True explicitly. The configuration will
    # be None by default.
    if not config.get("optimization.fuse.active"):
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    fuse_subgraphs = config.get("optimization.fuse.subgraphs")
    if fuse_subgraphs is None:
        fuse_subgraphs = True
    dsk, _ = fuse(
        dsk,
        keys,
        dependencies=dependencies,
        fuse_subgraphs=fuse_subgraphs,
    )
    dsk, _ = cull(dsk, keys)
    return dsk
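
As the comment above notes, the low-level fusion pass only runs when the user opts in explicitly; the configuration key defaults to None, so the early return is taken otherwise. For example:

import dask

with dask.config.set({"optimization.fuse.active": True}):
    ...  # computations in this block go through fuse() and cull()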
Example #7
    def _from_collection(cls, name, layer, collection):
        """`from_collections` optimized for a single collection"""
        if not is_dask_collection(collection):
            raise TypeError(type(collection))

        graph = collection.__dask_graph__()
        if isinstance(graph, HighLevelGraph):
            layers = ensure_dict(graph.layers, copy=True)
            layers[name] = layer
            deps = ensure_dict(graph.dependencies, copy=True)
            deps[name] = set(collection.__dask_layers__())
        else:
            key = _get_some_layer_name(collection)
            layers = {name: layer, key: graph}
            deps = {name: {key}, key: set()}

        return cls(layers, deps)
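
For context, this private helper is the single-collection fast path behind the public from_collections classmethod. A typical call layers a new task on top of an existing collection, along these lines (names are illustrative):

import dask.array as da
from dask.highlevelgraph import HighLevelGraph

x = da.ones(10, chunks=5)
name = "sum-of-x"  # hypothetical output key
layer = {name: (sum, x.__dask_keys__())}
hlg = HighLevelGraph.from_collections(name, layer, dependencies=[x])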
Example #8
def optimize(
    dsk,
    keys,
    fuse_keys=None,
    fast_functions=None,
    inline_functions_fast_functions=(getter_inline,),
    rename_fused_keys=True,
    **kwargs,
):
    """Optimize dask for array computation

    1.  Cull tasks not necessary to evaluate keys
    2.  Remove full slicing, e.g. x[:]
    3.  Inline fast functions like getitem and np.transpose
    """
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())

    dsk = optimize_blockwise(dsk, keys=keys)
    dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Perform low-level fusion unless the user has
    # specified False explicitly.
    if config.get("optimization.fuse.active") is False:
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    # Low level task optimizations
    if fast_functions is not None:
        inline_functions_fast_functions = fast_functions

    hold = hold_keys(dsk, dependencies)

    dsk, dependencies = fuse(
        dsk,
        hold + keys + (fuse_keys or []),
        dependencies,
        rename_keys=rename_fused_keys,
    )
    if inline_functions_fast_functions:
        dsk = inline_functions(
            dsk,
            keys,
            dependencies=dependencies,
            fast_functions=inline_functions_fast_functions,
        )

    return optimize_slices(dsk)
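
The three documented steps can be observed on a small array by building a graph with a redundant full slice and comparing graph sizes before and after optimization (a sketch; exact task counts vary across dask versions):

import dask
import dask.array as da

x = da.ones((4, 4), chunks=(2, 2))
y = (x[:] + 1).sum()  # x[:] is a full slice, one of the things step 2 removes

before = len(dict(y.__dask_graph__()))
(optimized,) = dask.optimize(y)
after = len(dict(optimized.__dask_graph__()))
assert after <= before  # culling, slice removal, and inlining shrink the graph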
Example #9
    def copy(self):
        return HighLevelGraph(
            ensure_dict(self.layers, copy=True),
            ensure_dict(self.dependencies, copy=True),
            self.key_dependencies.copy(),
        )
Example #10
    def __dask_distributed_pack__(
        self,
        all_hlg_keys: Iterable[Hashable],
        known_key_dependencies: Mapping[Hashable, Set],
        client,
        client_keys: Iterable[Hashable],
    ) -> Any:
        """Pack the layer for scheduler communication in Distributed

        This method should pack its current state and is called by the Client when
        communicating with the Scheduler.
        The Scheduler will then use .__dask_distributed_unpack__(data, ...) to unpack
        the state, materialize the layer, and merge it into the global task graph.

        The returned state must be compatible with Distributed's scheduler, which
        means it must obey the following:
          - Serializable by msgpack (note: msgpack converts lists to tuples)
          - All remote data must be unpacked (see unpack_remotedata())
          - All keys must be converted to strings now or when unpacking
          - All tasks must be serialized (see dumps_task())

        The default implementation materializes the layer, so layers such as
        Blockwise and ShuffleLayer should implement specialized pack and unpack
        functions in order to avoid materialization.

        Parameters
        ----------
        all_hlg_keys: Iterable[Hashable]
            All keys in the high level graph
        known_key_dependencies: Mapping[Hashable, Set]
            Already known dependencies
        client: distributed.Client
            The client calling this function.
        client_keys : Iterable[Hashable]
            List of keys requested by the client.

        Returns
        -------
        state: Object serializable by msgpack
            Scheduler compatible state of the layer
        """
        from distributed.client import Future
        from distributed.utils import CancelledError
        from distributed.utils_comm import subs_multiple, unpack_remotedata
        from distributed.worker import dumps_task

        dsk = dict(self)

        # Find aliases not in `client_keys` and substitute all matching keys
        # with their Futures
        future_aliases = {
            k: v
            for k, v in dsk.items()
            if isinstance(v, Future) and k not in client_keys
        }
        if future_aliases:
            dsk = subs_multiple(dsk, future_aliases)

        # Remove `Future` objects from graph and note any future dependencies
        dsk2 = {}
        fut_deps = {}
        for k, v in dsk.items():
            dsk2[k], futs = unpack_remotedata(v, byte_keys=True)
            if futs:
                fut_deps[k] = futs
        dsk = dsk2

        # Check that any collected futures are valid
        unpacked_futures = set.union(*fut_deps.values()) if fut_deps else set()
        for future in unpacked_futures:
            if future.client is not client:
                raise ValueError(
                    "Inputs contain futures that were created by another client."
                )
            if stringify(future.key) not in client.futures:
                raise CancelledError(stringify(future.key))

        # Calculate dependencies without re-calculating already known dependencies
        # - Start with known dependencies
        dependencies = ensure_dict(known_key_dependencies, copy=True)
        # - Remove aliases for any tasks that depend on both an alias and a future.
        #   These can only be found in the known_key_dependencies cache, since
        #   any dependencies computed in this method would have already had the
        #   aliases removed.
        if future_aliases:
            alias_keys = set(future_aliases)
            dependencies = {k: v - alias_keys for k, v in dependencies.items()}
        # - Add in deps for any missing keys
        missing_keys = dsk.keys() - dependencies.keys()

        dependencies.update(
            (k, keys_in_tasks(all_hlg_keys, [dsk[k]], as_list=False))
            for k in missing_keys
        )
        # - Add in deps for any tasks that depend on futures
        for k, futures in fut_deps.items():
            if futures:
                d = ensure_set(dependencies[k], copy=True)
                d.update(f.key for f in futures)
                dependencies[k] = d

        # The scheduler expects all keys to be strings
        dependencies = {
            stringify(k): {stringify(dep) for dep in deps}
            for k, deps in dependencies.items()
        }

        merged_hlg_keys = all_hlg_keys | dsk.keys()
        dsk = {
            stringify(k): stringify(v, exclusive=merged_hlg_keys)
            for k, v in dsk.items()
        }
        dsk = toolz.valmap(dumps_task, dsk)
        return {"dsk": dsk, "dependencies": dependencies}
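
One of the msgpack constraints in the docstring ("msgpack converts lists to tuples") is a property of how distributed deserializes rather than of msgpack itself: the protocol historically unpacks with use_list=False, so lists that survive into the packed state come back as tuples. A standalone illustration, assuming the msgpack Python bindings:

import msgpack

packed = msgpack.packb({"dependencies": ["a", "b"]})
assert msgpack.unpackb(packed, use_list=False) == {"dependencies": ("a", "b")}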
Example #11
def get(
    dsk: Mapping,
    keys: Sequence[Hashable] | Hashable,
    num_workers=None,
    func_loads=None,
    func_dumps=None,
    optimize_graph=True,
    pool=None,
    initializer=None,
    chunksize=None,
    **kwargs,
):
    """Multiprocessed get function appropriate for Bags

    Parameters
    ----------
    dsk : dict
        dask graph
    keys : object or list
        Desired results from graph
    num_workers : int
        Number of worker processes (defaults to number of cores)
    func_dumps : function
        Function to use for function serialization (defaults to cloudpickle.dumps)
    func_loads : function
        Function to use for function deserialization (defaults to cloudpickle.loads)
    optimize_graph : bool
        If True [default], `fuse` is applied to the graph before computation.
    pool : Executor or Pool
        Some sort of `Executor` or `Pool` to use
    initializer: function
        Ignored if ``pool`` has been set.
        Function to initialize a worker process before running any tasks in it.
    chunksize: int, optional
        Size of chunks to use when dispatching work.
        Defaults to 6, as some batching is helpful.
        If -1, will be computed to evenly divide ready work across workers.
    """
    chunksize = chunksize or config.get("chunksize", 6)
    pool = pool or config.get("pool", None)
    initializer = initializer or config.get("multiprocessing.initializer", None)
    num_workers = num_workers or config.get("num_workers", None) or CPU_COUNT
    if pool is None:
        # In order to get consistent hashing in subprocesses, we need to set a
        # consistent seed for the Python hash algorithm. Unfortunately, there
        # is no way to specify environment variables only for the Pool
        # processes, so we have to rely on environment variables being
        # inherited.
        if os.environ.get("PYTHONHASHSEED") in (None, "0"):
            # This number is arbitrary; it was chosen to commemorate
            # https://github.com/dask/dask/issues/6640.
            os.environ["PYTHONHASHSEED"] = "6640"
        context = get_context()
        initializer = partial(initialize_worker_process, user_initializer=initializer)
        pool = ProcessPoolExecutor(
            num_workers, mp_context=context, initializer=initializer
        )
        cleanup = True
    else:
        if initializer is not None:
            warn(
                "The ``initializer`` argument is ignored when ``pool`` is provided. "
                "The user should configure ``pool`` with the needed ``initializer`` "
                "on creation."
            )
        if isinstance(pool, multiprocessing.pool.Pool):
            pool = MultiprocessingPoolExecutor(pool)
        cleanup = False

    # Optimize Dask
    dsk = ensure_dict(dsk)
    dsk2, dependencies = cull(dsk, keys)
    if optimize_graph:
        dsk3, dependencies = fuse(dsk2, keys, dependencies)
    else:
        dsk3 = dsk2

    # We specify marshalling functions in order to catch serialization
    # errors and report them to the user.
    loads = func_loads or config.get("func_loads", None) or _loads
    dumps = func_dumps or config.get("func_dumps", None) or _dumps

    # Note former versions used a multiprocessing Manager to share
    # a Queue between parent and workers, but this is fragile on Windows
    # (issue #1652).
    try:
        # Run
        result = get_async(
            pool.submit,
            pool._max_workers,
            dsk3,
            keys,
            get_id=_process_get_id,
            dumps=dumps,
            loads=loads,
            pack_exception=pack_exception,
            raise_exception=reraise,
            chunksize=chunksize,
            **kwargs,
        )
    finally:
        if cleanup:
            pool.shutdown()
    return result
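
In normal use this function is reached via .compute(scheduler="processes"), but it can also be called directly on a hand-built graph; a minimal illustration:

from operator import add

dsk = {"x": 1, "y": 2, "z": (add, "x", "y")}
assert get(dsk, "z") == 3  # evaluated in a worker process pool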