Code Example #1
File: dot.py Project: daviddeng/dask
import networkx as nx
from dask.core import istask, get_dependencies
# `make_hashable` and `name` are helpers defined alongside this function in
# dask's dot.py.


def to_networkx(d, data_attributes=None, function_attributes=None):
    if data_attributes is None:
        data_attributes = dict()
    if function_attributes is None:
        function_attributes = dict()

    g = nx.DiGraph()

    for k, v in sorted(d.items(), key=lambda x: x[0]):
        g.add_node(k, shape='box', **data_attributes.get(k, dict()))
        if istask(v):
            func, args = v[0], v[1:]
            func_node = make_hashable((v, 'function'))
            g.add_node(func_node,
                       shape='circle',
                       label=name(func),
                       **function_attributes.get(k, dict()))
            g.add_edge(func_node, k)
            for dep in sorted(get_dependencies(d, k)):
                arg2 = make_hashable(dep)
                g.add_node(arg2,
                           label=str(dep),
                           shape='box',
                           **data_attributes.get(dep, dict()))
                g.add_edge(arg2, func_node)
        else:
            v_hash = make_hashable(v)
            if v_hash not in d:
                g.add_node(k, label='%s=%s' % (k, v), **data_attributes.get(k, dict()))
            else:  # alias situation
                g.add_edge(v_hash, k)

    return g
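
A minimal driver for the example above (a hedged sketch: it assumes dask and networkx are installed, and that to_networkx and its dask.core helpers are importable as in dask's dot.py):

def inc(x):
    return x + 1

# One constant plus one task: to_networkx emits a box node per key and a
# circle node per function call.
dsk = {'x': 1, 'y': (inc, 'x')}
g = to_networkx(dsk)
print(g.number_of_nodes(), g.number_of_edges())  # 3 nodes, 2 edges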
Code Example #2
def to_networkx(d, data_attributes=None, function_attributes=None):
    if data_attributes is None:
        data_attributes = dict()
    if function_attributes is None:
        function_attributes = dict()

    g = nx.DiGraph()

    for k, v in sorted(d.items(), key=lambda x: x[0]):
        g.add_node(k, shape='box', **data_attributes.get(k, dict()))
        if istask(v):
            func, args = v[0], v[1:]
            func_node = make_hashable((v, 'function'))
            g.add_node(func_node,
                       shape='circle',
                       label=name(func),
                       **function_attributes.get(k, dict()))
            g.add_edge(func_node, k)
            for dep in sorted(get_dependencies(d, k)):
                arg2 = make_hashable(dep)
                g.add_node(arg2,
                           label=str(dep),
                           shape='box',
                           **data_attributes.get(dep, dict()))
                g.add_edge(arg2, func_node)
        else:
            if v not in d:
                g.add_node(k,
                           label='%s=%s' % (k, v),
                           **data_attributes.get(k, dict()))
            else:  # alias situation
                g.add_edge(v, k)

    return g
Code Example #3
            def fire_task():
                """ Fire off a task to the thread pool """
                # Choose a good task to compute
                key = state["ready"].pop()
                state["running"].add(key)
                for f in pretask_cbs:
                    f(key, dsk, state)

                # Prep data to send
                data = {
                    dep: state["cache"][dep]
                    for dep in get_dependencies(dsk, key)
                }
                # Submit
                apply_async(
                    execute_task,
                    args=(
                        key,
                        dumps((dsk[key], data)),
                        dumps,
                        loads,
                        get_id,
                        pack_exception,
                    ),
                    callback=queue.put,
                )
Code Example #4
def inline_functions(dsk,
                     output,
                     fast_functions=None,
                     inline_constants=False,
                     dependencies=None):
    """Inline cheap functions into larger operations

    Examples
    --------
    >>> inc = lambda x: x + 1
    >>> add = lambda x, y: x + y
    >>> double = lambda x: x * 2
    >>> dsk = {'out': (add, 'i', 'd'),  # doctest: +SKIP
    ...        'i': (inc, 'x'),
    ...        'd': (double, 'y'),
    ...        'x': 1, 'y': 1}
    >>> inline_functions(dsk, [], [inc])  # doctest: +SKIP
    {'out': (add, (inc, 'x'), 'd'),
     'd': (double, 'y'),
     'x': 1, 'y': 1}

    Protect output keys.  In the example below ``i`` is not inlined because it
    is marked as an output key.

    >>> inline_functions(dsk, ['i', 'out'], [inc, double])  # doctest: +SKIP
    {'out': (add, 'i', (double, 'y')),
     'i': (inc, 'x'),
     'x': 1, 'y': 1}
    """
    if not fast_functions:
        return dsk

    output = set(output)

    fast_functions = set(fast_functions)

    if dependencies is None:
        dependencies = {k: get_dependencies(dsk, k) for k in dsk}
    dependents = reverse_dict(dependencies)

    def inlinable(v):
        try:
            return functions_of(v).issubset(fast_functions)
        except TypeError:
            return False

    keys = [
        k for k, v in dsk.items()
        if istask(v) and dependents[k] and k not in output and inlinable(v)
    ]

    if keys:
        dsk = inline(dsk,
                     keys,
                     inline_constants=inline_constants,
                     dependencies=dependencies)
        for k in keys:
            del dsk[k]
    return dsk
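
The doctests above are skipped; a runnable version of the first one, assuming inline_functions is imported from dask.optimization:

from dask.optimization import inline_functions

inc = lambda x: x + 1
add = lambda x, y: x + y
double = lambda x: x * 2
dsk = {'out': (add, 'i', 'd'),
       'i': (inc, 'x'),
       'd': (double, 'y'),
       'x': 1, 'y': 1}
# 'i' calls only the "fast" function inc, so it is inlined into 'out';
# 'd' uses double, which is not marked fast, so it stays.
result = inline_functions(dsk, [], [inc])
assert result['out'] == (add, (inc, 'x'), 'd')
assert 'i' not in result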
Code Example #5
File: test_core.py Project: floriango/dask
def test_get_dependencies_many():
    dsk = {'a': [1, 2, 3],
           'b': 'a',
           'c': [1, (inc, 1)],
           'd': [(sum, 'c')],
           'e': ['a', 'b', 'zzz'],
           'f': [['a', 'b'], 2, 3]}

    tasks = [dsk[k] for k in ('d', 'f')]
    s = get_dependencies(dsk, task=tasks)
    assert s == {'a', 'b', 'c'}
    s = get_dependencies(dsk, task=tasks, as_list=True)
    assert sorted(s) == ['a', 'b', 'c']

    s = get_dependencies(dsk, task=[])
    assert s == set()
    s = get_dependencies(dsk, task=[], as_list=True)
    assert s == []
Code Example #6
def _add_logging(dsk, ignore=None):
    """
    Add logging to a Dask graph.

    @param dsk: The Dask graph.

    @return: New Dask graph.
    """
    ctx = current_action()
    result = {}

    # Use topological sort to ensure Eliot actions are in logical order of
    # execution in Dask:
    keys = toposort(dsk)

    # 1. Give each key a string name. Some keys are just aliases for other
    # keys, so make sure the underlying key's name is available. Later on we
    # might want to shorten them as well.
    def simplify(k):
        if isinstance(k, str):
            return k
        return "-".join(str(o) for o in k)

    key_names = {}
    for key in keys:
        value = dsk[key]
        if not callable(value) and value in keys:
            # It's an alias for another key:
            key_names[key] = key_names[value]
        else:
            key_names[key] = simplify(key)

    # 2. Create Eliot child Actions for each key, in topological order:
    key_to_action_id = {
        key: str(ctx.serialize_task_id(), "utf-8")
        for key in keys
    }

    # 3. Replace function with wrapper that logs appropriate Action:
    for key in keys:
        func = dsk[key][0]
        args = dsk[key][1:]
        if not callable(func):
            # This key is just an alias for another key, no need to add
            # logging:
            result[key] = dsk[key]
            continue
        wrapped_func = _RunWithEliotContext(
            task_id=key_to_action_id[key],
            func=func,
            key=key_names[key],
            dependencies=[key_names[k] for k in get_dependencies(dsk, key)],
        )
        result[key] = (wrapped_func, ) + tuple(args)

    assert result.keys() == dsk.keys()
    return result
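
The three-step wrapping pattern above, reduced to its core with the Eliot-specific wrapper replaced by a hypothetical print-based one (a sketch, not eliot's API):

from dask.core import get_dependencies

def log_wrap(func, key, deps):
    # Hypothetical stand-in for _RunWithEliotContext: log around the call.
    def wrapper(*args):
        print('start', key, 'deps:', deps)
        try:
            return func(*args)
        finally:
            print('done ', key)
    return wrapper

def add_print_logging(dsk):
    result = {}
    for key, value in dsk.items():
        if isinstance(value, tuple) and callable(value[0]):
            deps = sorted(get_dependencies(dsk, key))
            result[key] = (log_wrap(value[0], key, deps),) + value[1:]
        else:
            result[key] = value  # constants and aliases pass through as-is
    return result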
Code Example #7
def test_get_dependencies_many():
    dsk = {
        'a': [1, 2, 3],
        'b': 'a',
        'c': [1, (inc, 1)],
        'd': [(sum, 'c')],
        'e': ['a', 'b', 'zzz'],
        'f': [['a', 'b'], 2, 3]
    }

    tasks = [dsk[k] for k in ('d', 'f')]
    s = get_dependencies(dsk, task=tasks)
    assert s == {'a', 'b', 'c'}
    s = get_dependencies(dsk, task=tasks, as_list=True)
    assert sorted(s) == ['a', 'b', 'c']

    s = get_dependencies(dsk, task=[])
    assert s == set()
    s = get_dependencies(dsk, task=[], as_list=True)
    assert s == []
Code Example #8
File: dag.py Project: jakirkham/persist
def dask_to_digraph(dsk):
    from networkx import DiGraph
    from dask.core import get_dependencies
    g = DiGraph()
    for key, value in dsk.items():
        # Store each task (or constant) as a 'func' node attribute; the
        # keyword form also works on networkx 2.x, where the positional
        # attr_dict argument was removed.
        g.add_node(key, func=value)
    for key in dsk:
        for dep in get_dependencies(dsk, key):
            g.add_edge(dep, key)
    return g
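
A quick sanity check of the conversion, assuming networkx 2.x is installed:

def inc(x):
    return x + 1

dsk = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'y')}
g = dask_to_digraph(dsk)
print(sorted(g.edges()))  # [('x', 'y'), ('y', 'z')]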
Code Example #9
def test_get_dependencies_many():
    dsk = {
        "a": [1, 2, 3],
        "b": "a",
        "c": [1, (inc, 1)],
        "d": [(sum, "c")],
        "e": ["a", "b", "zzz"],
        "f": [["a", "b"], 2, 3],
    }

    tasks = [dsk[k] for k in ("d", "f")]
    s = get_dependencies(dsk, task=tasks)
    assert s == {"a", "b", "c"}
    s = get_dependencies(dsk, task=tasks, as_list=True)
    assert sorted(s) == ["a", "b", "c"]

    s = get_dependencies(dsk, task=[])
    assert s == set()
    s = get_dependencies(dsk, task=[], as_list=True)
    assert s == []
Code Example #10
File: dask.py Project: ClusterHQ/eliot
def _add_logging(dsk, ignore=None):
    """
    Add logging to a Dask graph.

    @param dsk: The Dask graph.

    @return: New Dask graph.
    """
    ctx = current_action()
    result = {}

    # Use topological sort to ensure Eliot actions are in logical order of
    # execution in Dask:
    keys = toposort(dsk)

    # 1. Give each key a string name. Some keys are just aliases for other
    # keys, so make sure the underlying key's name is available. Later on we
    # might want to shorten them as well.
    def simplify(k):
        if isinstance(k, str):
            return k
        return "-".join(str(o) for o in k)

    key_names = {}
    for key in keys:
        value = dsk[key]
        if not callable(value) and value in keys:
            # It's an alias for another key:
            key_names[key] = key_names[value]
        else:
            key_names[key] = simplify(key)

    # 2. Create Eliot child Actions for each key, in topological order:
    key_to_action_id = {key: str(ctx.serialize_task_id(), "utf-8") for key in keys}

    # 3. Replace function with wrapper that logs appropriate Action:
    for key in keys:
        func = dsk[key][0]
        args = dsk[key][1:]
        if not callable(func):
            # This key is just an alias for another key, no need to add
            # logging:
            result[key] = dsk[key]
            continue
        wrapped_func = _RunWithEliotContext(
            task_id=key_to_action_id[key],
            func=func,
            key=key_names[key],
            dependencies=[key_names[k] for k in get_dependencies(dsk, key)],
        )
        result[key] = (wrapped_func,) + tuple(args)

    assert result.keys() == dsk.keys()
    return result
Code Example #11
File: vis.py Project: Jxt1/arlo
def simple_vis(x, filename='simple', format=None):
    if hasattr(x, 'dask'):
        dsk = x._optimize(x.dask, x._keys())
    else:
        dsk = x

    deps = {k: get_dependencies(dsk, k) for k in dsk}

    g = graphviz.Digraph(graph_attr={'rankdir': 'LR'})

    nodes = set()
    edges = set()
    for k in dsk:
        key = node_key(k)
        if key not in nodes:
            g.node(key, label=key_split(k), shape='rectangle')
            nodes.add(key)
        for dep in deps[k]:
            dep_key = node_key(dep)
            if dep_key not in nodes:
                g.node(dep_key, label=key_split(dep), shape='rectangle')
                nodes.add(dep_key)
            # Avoid circular references
            if dep_key != key and (dep_key, key) not in edges:
                g.edge(dep_key, key)
                edges.add((dep_key, key))

    fmts = ['.png', '.pdf', '.dot', '.svg', '.jpeg', '.jpg']
    if format is None and any(filename.lower().endswith(fmt) for fmt in fmts):
        filename, format = os.path.splitext(filename)
        format = format[1:].lower()

    if format is None:
        format = 'png'

    data = g.pipe(format=format)
    if not data:
        raise RuntimeError("Graphviz failed to properly produce an image. "
                           "This probably means your installation of graphviz "
                           "is missing png support. See: "
                           "https://github.com/ContinuumIO/anaconda-issues/"
                           "issues/485 for more information.")

    display_cls = _get_display_cls(format)

    if not filename:
        return display_cls(data=data)

    full_filename = '.'.join([filename, format])
    with open(full_filename, 'wb') as f:
        f.write(data)

    return display_cls(filename=full_filename)
Code Example #12
def cull(dsk, keys):
    """Return new dask with only the tasks required to calculate keys.

    In other words, remove unnecessary tasks from dask.
    ``keys`` may be a single key or list of keys.

    Examples
    --------
    >>> def inc(x):
    ...     return x + 1

    >>> def add(x, y):
    ...     return x + y

    >>> d = {'x': 1, 'y': (inc, 'x'), 'out': (add, 'x', 10)}
    >>> dsk, dependencies = cull(d, 'out')
    >>> dsk                                                     # doctest: +ELLIPSIS
    {'out': (<function add at ...>, 'x', 10), 'x': 1}
    >>> dependencies                                            # doctest: +ELLIPSIS
    {'out': ['x'], 'x': []}

    Returns
    -------
    dsk: culled dask graph
    dependencies: Dict mapping {key: [deps]}.  Useful side effect to accelerate
        other optimizations, notably fuse.
    """
    if not isinstance(keys, (list, set)):
        keys = [keys]

    seen = set()
    dependencies = dict()
    out = {}
    work = list(set(flatten(keys)))

    while work:
        new_work = []
        for k in work:
            dependencies_k = get_dependencies(dsk, k,
                                              as_list=True)  # fuse needs lists
            out[k] = dsk[k]
            dependencies[k] = dependencies_k
            for d in dependencies_k:
                if d not in seen:
                    seen.add(d)
                    new_work.append(d)

        work = new_work

    return out, dependencies
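
The doctest runs as written; one more example (hedged: assumes cull is imported from dask.optimization) showing that a branch not needed for the requested key is dropped:

from dask.optimization import cull

def inc(x):
    return x + 1

d = {'x': 1, 'unused': (inc, 'x'), 'out': (inc, 'x')}
dsk2, dependencies = cull(d, 'out')
assert 'unused' not in dsk2
assert dependencies == {'out': ['x'], 'x': []}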
Code Example #13
            def fire_tasks(chunksize):
                """Fire off a task to the thread pool"""
                # Determine chunksize and/or number of tasks to submit
                nready = len(state["ready"])
                if chunksize == -1:
                    ntasks = nready
                    chunksize = -(ntasks // -num_workers)
                else:
                    used_workers = -(len(state["running"]) // -chunksize)
                    avail_workers = max(num_workers - used_workers, 0)
                    ntasks = min(nready, chunksize * avail_workers)

                # Prep all ready tasks for submission
                args = []
                for _ in range(ntasks):
                    # Get the next task to compute (most recently added)
                    key = state["ready"].pop()
                    # Notify task is running
                    state["running"].add(key)
                    for f in pretask_cbs:
                        f(key, dsk, state)

                    # Prep args to send
                    data = {
                        dep: state["cache"][dep]
                        for dep in get_dependencies(dsk, key)
                    }
                    args.append((
                        key,
                        dumps((dsk[key], data)),
                        dumps,
                        loads,
                        get_id,
                        pack_exception,
                    ))

                # Batch submit
                for i in range(-(len(args) // -chunksize)):
                    each_args = args[i * chunksize:(i + 1) * chunksize]
                    if not each_args:
                        break
                    fut = submit(batch_execute_tasks, each_args)
                    fut.add_done_callback(queue.put)
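
The -(a // -b) expressions above are "upside-down" floor division, a float-free way to take a ceiling (here, to spread tasks over workers and chunks):

# -(a // -b) == ceil(a / b) for positive integers, with no float rounding.
assert -(7 // -3) == 3  # ceil(7 / 3)
assert -(6 // -3) == 2  # exact division is unchanged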
Code Example #14
 def maybe_wrap(key, value):
     if isinstance(value, list):
         return [maybe_wrap(key, v) for v in value]
     elif isinstance(value, tuple):
         func = value[0]
         args = value[1:]
         if not callable(func):
             # Not a callable, so nothing to wrap.
             return value
         wrapped_func = _RunWithEliotContext(
             task_id=str(ctx.serialize_task_id(), "utf-8"),
             func=func,
             key=key_names[key],
             dependencies=[
                 key_names[k] for k in get_dependencies(dsk, key)
             ],
         )
         return (wrapped_func, ) + args
     else:
         return value
Code Example #15
    def visualize(self, x, filename="simple_computation_graph", format=None):

        if hasattr(x, "dask"):
            dsk = x.__dask_optimize__(x.dask, x.__dask_keys__())
        else:
            dsk = x

        deps = {k: get_dependencies(dsk, k) for k in dsk}

        g = graphviz.Digraph(graph_attr={"rankdir": "LR"})

        nodes = set()
        edges = set()
        for k in dsk:
            key = self._node_key(k)
            if key not in nodes:
                g.node(key, label=key_split(k), shape="rectangle")
                nodes.add(key)
            for dep in deps[k]:
                dep_key = self._node_key(dep)
                if dep_key not in nodes:
                    g.node(dep_key, label=key_split(dep), shape="rectangle")
                    nodes.add(dep_key)
                # Avoid circular references
                if dep_key != key and (dep_key, key) not in edges:
                    g.edge(dep_key, key)
                    edges.add((dep_key, key))

        data = g.pipe(format=self.format)
        display_cls = _get_display_cls(self.format)

        if self.filename is None:
            return display_cls(data=data)

        full_filename = ".".join([filename, self.format])
        with open(full_filename, "wb") as f:
            f.write(data)

        return display_cls(filename=full_filename)
Code Example #16
File: dataflowenvironment.py Project: eserie/dafpy
def dask_to_dataflow(dsk):
    from dask.core import istask, get_dependencies
    g = DataflowEnvironment()
    for k, v in dsk.items():
        k_name = k
        g.add_node(k_name, v[0])
    for k, v in dsk.items():
        k_name = k
        if callable(v[0]):
            deps = list(get_dependencies(dsk, k))
            print(k_name, v[0], deps)
            g.add_node(
                k_name,
                v[0],
                args=['args_{}'.format(i) for i, dep in enumerate(deps)])
            for i, dep in enumerate(deps):
                g.add_edge(dep, k_name, dfpd.OBJ_CALL_RETS,
                           'args_{}'.format(i))
    if 'sum_alldata' in g:
        g.add_edge_call_rets('sum_alldata')
    g.start()
    return g
Code Example #17
 def setup(self):
     cols = 1000
     rows = 10
     width = 3
     height = 2
     dsk = {}
     inline_keys = set()
     for r in range(rows):
         if r == 0:
             update = {("x", r, c): 0 for c in range(cols)}
             x = "x"
         elif not (r - 1) % (height + 1):
             # Start of a diamond
             update = {
                 ("add", r, c, w): (add, (x, r - 1, c), w)
                 for w in range(width) for c in range(cols)
             }
             x = "add"
         elif r % (height + 1):
             # In a diamond
             update = {
                 ("inc", r, c, w): (inc, (x, r - 1, c, w))
                 for w in range(width) for c in range(cols)
             }
             inline_keys.update(update)
             x = "inc"
         else:
             # End of a diamond
             update = {
                 ("sum", r, c): (sum, [(x, r - 1, c, w) for w in range(width)])
                 for c in range(cols)
             }
             x = "sum"
         dsk.update(update)
     keys = list(update)
     self.dsk = dsk
     self.keys = keys
     self.inline_keys = inline_keys
     self.deps = {k: get_dependencies(dsk, k, as_list=False) for k in dsk}
Code Example #18
File: test_core.py Project: pwolfram/dask
def test_get_dependencies_empty():
    dsk = {'x': (inc, )}
    assert get_dependencies(dsk, 'x') == set()
Code Example #19
File: test_core.py Project: floriango/dask
def test_get_dependencies_list():
    dsk = {'x': 1, 'y': 2, 'z': ['x', [(inc, 'y')]]}
    assert get_dependencies(dsk, 'z') == set(['x', 'y'])
    assert sorted(get_dependencies(dsk, 'z', as_list=True)) == ['x', 'y']
Code Example #20
File: test_core.py Project: floriango/dask
def test_get_dependencies_empty():
    dsk = {'x': (inc,)}
    assert get_dependencies(dsk, 'x') == set()
    assert get_dependencies(dsk, 'x', as_list=True) == []
Code Example #21
File: test_core.py Project: floriango/dask
def test_get_dependencies_nested():
    dsk = {'x': 1, 'y': 2,
           'z': (add, (inc, [['x']]), 'y')}

    assert get_dependencies(dsk, 'z') == set(['x', 'y'])
    assert sorted(get_dependencies(dsk, 'z', as_list=True)) == ['x', 'y']
Code Example #22
def test_get_dependencies_task():
    dsk = {"x": 1, "y": 2, "z": ["x", [(inc, "y")]]}
    assert get_dependencies(dsk, task=(inc, "x")) == set(["x"])
    assert get_dependencies(dsk, task=(inc, "x"), as_list=True) == ["x"]
Code Example #23
def get_async(
    submit,
    num_workers,
    dsk,
    result,
    cache=None,
    get_id=default_get_id,
    rerun_exceptions_locally=None,
    pack_exception=default_pack_exception,
    raise_exception=reraise,
    callbacks=None,
    dumps=identity,
    loads=identity,
    chunksize=None,
    **kwargs,
):
    """Asynchronous get function

    This is a general version of various asynchronous schedulers for dask.  It
    takes a ``concurrent.futures.Executor.submit`` function to form a more
    specific ``get`` method that walks through the dask graph with parallel
    workers, avoiding repeat computation and minimizing memory use.

    Parameters
    ----------
    submit : function
        A ``concurrent.futures.Executor.submit`` function
    num_workers : int
        The number of workers that task submissions can be spread over
    dsk : dict
        A dask dictionary specifying a workflow
    result : key or list of keys
        Keys corresponding to desired data
    cache : dict-like, optional
        Temporary storage of results
    get_id : callable, optional
        Function to return the worker id, takes no arguments. Examples are
        `threading.current_thread` and `multiprocessing.current_process`.
    rerun_exceptions_locally : bool, optional
        Whether to rerun failing tasks in local process to enable debugging
        (False by default)
    pack_exception : callable, optional
        Function to take an exception and ``dumps`` method, and return a
        serialized tuple of ``(exception, traceback)`` to send back to the
        scheduler. Default is to just raise the exception.
    raise_exception : callable, optional
        Function that takes an exception and a traceback, and raises an error.
    callbacks : tuple or list of tuples, optional
        Callbacks are passed in as tuples of length 5. Multiple sets of
        callbacks may be passed in as a list of tuples. For more information,
        see the dask.diagnostics documentation.
    dumps: callable, optional
        Function to serialize task data and results to communicate between
        worker and parent.  Defaults to identity.
    loads: callable, optional
        Inverse function of `dumps`.  Defaults to identity.
    chunksize: int, optional
        Size of chunks to use when dispatching work. Defaults to 1.
        If -1, will be computed to evenly divide ready work across workers.

    See Also
    --------
    threaded.get
    """
    chunksize = chunksize or config.get("chunksize", 1)

    queue = Queue()

    if isinstance(result, list):
        result_flat = set(flatten(result))
    else:
        result_flat = {result}
    results = set(result_flat)

    dsk = dict(dsk)
    with local_callbacks(callbacks) as callbacks:
        _, _, pretask_cbs, posttask_cbs, _ = unpack_callbacks(callbacks)
        started_cbs = []
        succeeded = False
        # if start_state_from_dask fails, we will have something
        # to pass to the final block.
        state = {}
        try:
            for cb in callbacks:
                if cb[0]:
                    cb[0](dsk)
                started_cbs.append(cb)

            keyorder = order(dsk)

            state = start_state_from_dask(dsk,
                                          cache=cache,
                                          sortkey=keyorder.get)

            for _, start_state, _, _, _ in callbacks:
                if start_state:
                    start_state(dsk, state)

            if rerun_exceptions_locally is None:
                rerun_exceptions_locally = config.get(
                    "rerun_exceptions_locally", False)

            if state["waiting"] and not state["ready"]:
                raise ValueError("Found no accessible jobs in dask")

            def fire_tasks(chunksize):
                """Fire off a task to the thread pool"""
                # Determine chunksize and/or number of tasks to submit
                nready = len(state["ready"])
                if chunksize == -1:
                    ntasks = nready
                    chunksize = -(ntasks // -num_workers)
                else:
                    used_workers = -(len(state["running"]) // -chunksize)
                    avail_workers = max(num_workers - used_workers, 0)
                    ntasks = min(nready, chunksize * avail_workers)

                # Prep all ready tasks for submission
                args = []
                for _ in range(ntasks):
                    # Get the next task to compute (most recently added)
                    key = state["ready"].pop()
                    # Notify task is running
                    state["running"].add(key)
                    for f in pretask_cbs:
                        f(key, dsk, state)

                    # Prep args to send
                    data = {
                        dep: state["cache"][dep]
                        for dep in get_dependencies(dsk, key)
                    }
                    args.append((
                        key,
                        dumps((dsk[key], data)),
                        dumps,
                        loads,
                        get_id,
                        pack_exception,
                    ))

                # Batch submit
                for i in range(-(len(args) // -chunksize)):
                    each_args = args[i * chunksize:(i + 1) * chunksize]
                    if not each_args:
                        break
                    fut = submit(batch_execute_tasks, each_args)
                    fut.add_done_callback(queue.put)

            # Main loop, wait on tasks to finish, insert new ones
            while state["waiting"] or state["ready"] or state["running"]:
                fire_tasks(chunksize)
                for key, res_info, failed in queue_get(queue).result():
                    if failed:
                        exc, tb = loads(res_info)
                        if rerun_exceptions_locally:
                            data = {
                                dep: state["cache"][dep]
                                for dep in get_dependencies(dsk, key)
                            }
                            task = dsk[key]
                            _execute_task(task, data)  # Re-execute locally
                        else:
                            raise_exception(exc, tb)
                    res, worker_id = loads(res_info)
                    state["cache"][key] = res
                    finish_task(dsk, key, state, results, keyorder.get)
                    for f in posttask_cbs:
                        f(key, res, dsk, state, worker_id)

            succeeded = True

        finally:
            for _, _, _, _, finish in started_cbs:
                if finish:
                    finish(dsk, state, not succeeded)

    return nested_get(result, state["cache"])
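
A hedged driver for the scheduler above, assuming it is dask.local.get_async, whose first argument is a concurrent.futures-style submit:

from concurrent.futures import ThreadPoolExecutor
from dask.local import get_async

def inc(x):
    return x + 1

dsk = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'y')}
with ThreadPoolExecutor(max_workers=2) as pool:
    print(get_async(pool.submit, 2, dsk, 'z'))  # 3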
Code Example #24
def fuse_linear(dsk, keys=None, dependencies=None, rename_keys=True):
    """Return new dask graph with linear sequence of tasks fused together.

    If specified, the keys in the ``keys`` keyword argument are *not* fused.
    Supply ``dependencies`` from output of ``cull`` if available to avoid
    recomputing dependencies.

    **This function is mostly superseded by ``fuse``**

    Parameters
    ----------
    dsk: dict
    keys: list
    dependencies: dict, optional
        {key: [list-of-keys]}.  Must be a list to provide count of each key
        This optional input often comes from ``cull``
    rename_keys: bool or func, optional
        Whether to rename fused keys with ``default_fused_linear_keys_renamer``
        or not.  Renaming fused keys can keep the graph more understandable
        and comprehensible, but it comes at the cost of additional processing.
        If False, then the top-most key will be used.  For advanced usage, a
        func is also accepted, ``new_key = rename_keys(fused_key_list)``.

    Examples
    --------
    >>> def inc(x):
    ...     return x + 1

    >>> def add(x, y):
    ...     return x + y

    >>> d = {'a': 1, 'b': (inc, 'a'), 'c': (inc, 'b')}
    >>> dsk, dependencies = fuse_linear(d)
    >>> dsk # doctest: +SKIP
    {'a-b-c': (inc, (inc, 1)), 'c': 'a-b-c'}
    >>> dsk, dependencies = fuse_linear(d, rename_keys=False)
    >>> dsk # doctest: +ELLIPSIS
    {'c': (<function inc at ...>, (<function inc at ...>, 1))}
    >>> dsk, dependencies = fuse_linear(d, keys=['b'], rename_keys=False)
    >>> dsk  # doctest: +ELLIPSIS
    {'b': (<function inc at ...>, 1), 'c': (<function inc at ...>, 'b')}

    Returns
    -------
    dsk: output graph with keys fused
    dependencies: dict mapping dependencies after fusion.  Useful side effect
        to accelerate other downstream optimizations.
    """
    if keys is not None and not isinstance(keys, set):
        if not isinstance(keys, list):
            keys = [keys]
        keys = set(flatten(keys))

    if dependencies is None:
        dependencies = {k: get_dependencies(dsk, k, as_list=True) for k in dsk}

    # locate all members of linear chains
    child2parent = {}
    unfusible = set()
    for parent in dsk:
        deps = dependencies[parent]
        has_many_children = len(deps) > 1
        for child in deps:
            if keys is not None and child in keys:
                unfusible.add(child)
            elif child in child2parent:
                del child2parent[child]
                unfusible.add(child)
            elif has_many_children:
                unfusible.add(child)
            elif child not in unfusible:
                child2parent[child] = parent

    # construct the chains from ancestor to descendant
    chains = []
    parent2child = dict(map(reversed, child2parent.items()))
    while child2parent:
        child, parent = child2parent.popitem()
        chain = [child, parent]
        while parent in child2parent:
            parent = child2parent.pop(parent)
            del parent2child[parent]
            chain.append(parent)
        chain.reverse()
        while child in parent2child:
            child = parent2child.pop(child)
            del child2parent[child]
            chain.append(child)
        chains.append(chain)

    dependencies = {k: set(v) for k, v in dependencies.items()}

    if rename_keys is True:
        key_renamer = default_fused_linear_keys_renamer
    elif rename_keys is False:
        key_renamer = None
    else:
        key_renamer = rename_keys

    # create a new dask with fused chains
    rv = {}
    fused = set()
    aliases = set()
    is_renamed = False
    for chain in chains:
        if key_renamer is not None:
            new_key = key_renamer(chain)
            is_renamed = (
                new_key is not None and new_key not in dsk and new_key not in rv
            )
        child = chain.pop()
        val = dsk[child]
        while chain:
            parent = chain.pop()
            dependencies[parent].update(dependencies.pop(child))
            dependencies[parent].remove(child)
            val = subs(dsk[parent], child, val)
            fused.add(child)
            child = parent
        fused.add(child)
        if is_renamed:
            rv[new_key] = val
            rv[child] = new_key
            dependencies[new_key] = dependencies[child]
            dependencies[child] = {new_key}
            aliases.add(child)
        else:
            rv[child] = val
    for key, val in dsk.items():
        if key not in fused:
            rv[key] = val
    if aliases:
        for key, deps in dependencies.items():
            for old_key in deps & aliases:
                new_key = rv[old_key]
                deps.remove(old_key)
                deps.add(new_key)
                rv[key] = subs(rv[key], old_key, new_key)
        if keys is not None:
            for key in aliases - keys:
                del rv[key]
                del dependencies[key]
    return rv, dependencies
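
A runnable counterpart to the doctests, assuming fuse_linear is imported from dask.optimization:

from dask.optimization import fuse_linear

def inc(x):
    return x + 1

d = {'a': 1, 'b': (inc, 'a'), 'c': (inc, 'b')}
dsk2, dependencies = fuse_linear(d, rename_keys=False)
assert dsk2 == {'c': (inc, (inc, 1))}  # the whole chain collapses into 'c'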
Code Example #25
def inline(dsk, keys=None, inline_constants=True, dependencies=None):
    """Return new dask with the given keys inlined with their values.

    Inlines all constants if ``inline_constants`` keyword is True. Note that
    the constant keys will remain in the graph, to remove them follow
    ``inline`` with ``cull``.

    Examples
    --------
    >>> def inc(x):
    ...     return x + 1

    >>> def add(x, y):
    ...     return x + y

    >>> d = {'x': 1, 'y': (inc, 'x'), 'z': (add, 'x', 'y')}
    >>> inline(d)       # doctest: +ELLIPSIS
    {'x': 1, 'y': (<function inc at ...>, 1), 'z': (<function add at ...>, 1, 'y')}

    >>> inline(d, keys='y') # doctest: +ELLIPSIS
    {'x': 1, 'y': (<function inc at ...>, 1), 'z': (<function add at ...>, 1, (<function inc at ...>, 1))}

    >>> inline(d, keys='y', inline_constants=False) # doctest: +ELLIPSIS
    {'x': 1, 'y': (<function inc at ...>, 'x'), 'z': (<function add at ...>, 'x', (<function inc at ...>, 'x'))}
    """
    if dependencies and isinstance(next(iter(dependencies.values())), list):
        dependencies = {k: set(v) for k, v in dependencies.items()}

    keys = _flat_set(keys)

    if dependencies is None:
        dependencies = {k: get_dependencies(dsk, k) for k in dsk}

    if inline_constants:
        keys.update(
            k
            for k, v in dsk.items()
            if (ishashable(v) and v in dsk) or (not dependencies[k] and not istask(v))
        )

    # Keys may depend on other keys, so determine replace order with toposort.
    # The values stored in `keysubs` do not include other keys.
    replaceorder = toposort(
        {k: dsk[k] for k in keys if k in dsk}, dependencies=dependencies
    )
    keysubs = {}
    for key in replaceorder:
        val = dsk[key]
        for dep in keys & dependencies[key]:
            if dep in keysubs:
                replace = keysubs[dep]
            else:
                replace = dsk[dep]
            val = subs(val, dep, replace)
        keysubs[key] = val

    # Make new dask with substitutions
    dsk2 = keysubs.copy()
    for key, val in dsk.items():
        if key not in dsk2:
            for item in keys & dependencies[key]:
                val = subs(val, item, keysubs[item])
            dsk2[key] = val
    return dsk2
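
A runnable version of the second doctest, assuming inline is imported from dask.optimization:

from dask.optimization import inline

def inc(x):
    return x + 1

def add(x, y):
    return x + y

d = {'x': 1, 'y': (inc, 'x'), 'z': (add, 'x', 'y')}
out = inline(d, keys='y')
assert out['z'] == (add, 1, (inc, 1))  # 'y' and the constant 'x' both inlined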
Code Example #26
def get_async(apply_async,
              num_workers,
              dsk,
              result,
              cache=None,
              get_id=default_get_id,
              rerun_exceptions_locally=None,
              pack_exception=default_pack_exception,
              raise_exception=reraise,
              callbacks=None,
              dumps=identity,
              loads=identity,
              **kwargs):
    """Asynchronous get function
    This is a general version of various asynchronous schedulers for dask.  It
    takes a an apply_async function as found on Pool objects to form a more
    specific ``get`` method that walks through the dask array with parallel
    workers, avoiding repeat computation and minimizing memory use.
    Parameters
    ----------
    apply_async : function
        Asynchronous apply function as found on Pool or ThreadPool
    num_workers : int
        The number of active tasks we should have at any one time
    dsk : dict
        A dask dictionary specifying a workflow
    result : key or list of keys
        Keys corresponding to desired data
    cache : dict-like, optional
        Temporary storage of results
    get_id : callable, optional
        Function to return the worker id, takes no arguments. Examples are
        `threading.current_thread` and `multiprocessing.current_process`.
    rerun_exceptions_locally : bool, optional
        Whether to rerun failing tasks in local process to enable debugging
        (False by default)
    pack_exception : callable, optional
        Function to take an exception and ``dumps`` method, and return a
        serialized tuple of ``(exception, traceback)`` to send back to the
        scheduler. Default is to just raise the exception.
    raise_exception : callable, optional
        Function that takes an exception and a traceback, and raises an error.
    dumps: callable, optional
        Function to serialize task data and results to communicate between
        worker and parent.  Defaults to identity.
    loads: callable, optional
        Inverse function of `dumps`.  Defaults to identity.
    callbacks : tuple or list of tuples, optional
        Callbacks are passed in as tuples of length 5. Multiple sets of
        callbacks may be passed in as a list of tuples. For more information,
        see the dask.diagnostics documentation.

    See Also
    --------
    threaded.get
    """
    queue = Queue()

    if isinstance(result, list):
        result_flat = set(flatten(result))
    else:
        result_flat = {result}
    results = set(result_flat)

    dsk = dict(dsk)
    with local_callbacks(callbacks) as callbacks:
        _, _, pretask_cbs, posttask_cbs, _ = unpack_callbacks(callbacks)
        started_cbs = []
        succeeded = False
        # if start_state_from_dask fails, we will have something
        # to pass to the final block.
        state = {}
        try:
            for cb in callbacks:
                if cb[0]:
                    cb[0](dsk)
                started_cbs.append(cb)

            keyorder = order(dsk)

            state = start_state_from_dask(dsk,
                                          cache=cache,
                                          sortkey=keyorder.get)

            for _, start_state, _, _, _ in callbacks:
                if start_state:
                    start_state(dsk, state)

            if rerun_exceptions_locally is None:
                rerun_exceptions_locally = config.get(
                    "rerun_exceptions_locally", False)

            if state["waiting"] and not state["ready"]:
                raise ValueError("Found no accessible jobs in dask")

            def fire_task():
                """ Fire off a task to the thread pool """
                # Choose a good task to compute
                key = state["ready"].pop()
                state["running"].add(key)
                for f in pretask_cbs:
                    f(key, dsk, state)

                # Prep data to send
                data = {
                    dep: state["cache"][dep]
                    for dep in get_dependencies(dsk, key)
                }
                # Submit
                apply_async(
                    execute_task,
                    args=(
                        key,
                        dumps((dsk[key], data)),
                        dumps,
                        loads,
                        get_id,
                        pack_exception,
                    ),
                    callback=queue.put,
                )

            # Seed initial tasks into the thread pool
            while state["ready"] and len(state["running"]) < num_workers:
                fire_task()

            # Main loop, wait on tasks to finish, insert new ones
            while state["waiting"] or state["ready"] or state["running"]:
                key, res_info, failed = queue_get(queue)
                if failed:
                    exc, tb = loads(res_info)
                    if rerun_exceptions_locally:
                        data = {
                            dep: state["cache"][dep]
                            for dep in get_dependencies(dsk, key)
                        }
                        task = dsk[key]
                        _execute_task(task, data)  # Re-execute locally
                    else:
                        raise_exception(exc, tb)
                res, worker_id = loads(res_info)
                state["cache"][key] = res
                finish_task(dsk, key, state, results, keyorder.get)
                for f in posttask_cbs:
                    f(key, res, dsk, state, worker_id)

                while state["ready"] and len(state["running"]) < num_workers:
                    fire_task()

            succeeded = True

        finally:
            for _, _, _, _, finish in started_cbs:
                if finish:
                    finish(dsk, state, not succeeded)

    return nested_get(result, state["cache"])
Code Example #27
File: test_core.py Project: benlewis-tes/dask
def test_get_dependencies_nested():
    dsk = {'x': 1, 'y': 2,
           'z': (add, (inc, [['x']]), 'y')}

    assert get_dependencies(dsk, 'z') == set(['x', 'y'])
Code Example #28
def test_get_dependencies_empty():
    dsk = {"x": (inc, )}
    assert get_dependencies(dsk, "x") == set()
    assert get_dependencies(dsk, "x", as_list=True) == []
Code Example #29
def test_get_dependencies_nested():
    dsk = {"x": 1, "y": 2, "z": (add, (inc, [["x"]]), "y")}

    assert get_dependencies(dsk, "z") == set(["x", "y"])
    assert sorted(get_dependencies(dsk, "z", as_list=True)) == ["x", "y"]
Code Example #30
def test_get_dependencies_task_none():
    # Regression test for https://github.com/dask/distributed/issues/2756
    dsk = {"foo": None}
    assert get_dependencies(dsk, task=dsk["foo"]) == set()
Code Example #31
def test_get_dependencies_nothing():
    with pytest.raises(ValueError):
        get_dependencies({})
Code Example #32
File: test_core.py Project: pwolfram/dask
def test_get_dependencies_list():
    dsk = {'x': 1, 'y': 2, 'z': ['x', [(inc, 'y')]]}
    assert get_dependencies(dsk, 'z') == set(['x', 'y'])
Code Example #33
File: test_core.py Project: floriango/dask
def test_get_dependencies_task():
    dsk = {'x': 1, 'y': 2, 'z': ['x', [(inc, 'y')]]}
    assert get_dependencies(dsk, task=(inc, 'x')) == set(['x'])
    assert get_dependencies(dsk, task=(inc, 'x'), as_list=True) == ['x']
Code Example #34
def fuse(
    dsk,
    keys=None,
    dependencies=None,
    ave_width=_default,
    max_width=_default,
    max_height=_default,
    max_depth_new_edges=_default,
    rename_keys=_default,
    fuse_subgraphs=_default,
):
    """Fuse tasks that form reductions; more advanced than ``fuse_linear``

    This trades parallelism opportunities for faster scheduling by making tasks
    less granular.  It can replace ``fuse_linear`` in optimization passes.

    This optimization applies to all reductions--tasks that have at most one
    dependent--so it may be viewed as fusing "multiple input, single output"
    groups of tasks into a single task.  There are many parameters to fine
    tune the behavior, which are described below.  ``ave_width`` is the
    natural parameter with which to compare parallelism to granularity, so
    it should always be specified.  Reasonable values for other parameters
    will be determined using ``ave_width`` if necessary.

    Parameters
    ----------
    dsk: dict
        dask graph
    keys: list or set, optional
        Keys that must remain in the returned dask graph
    dependencies: dict, optional
        {key: [list-of-keys]}.  Must be a list to provide count of each key
        This optional input often comes from ``cull``
    ave_width: float (default 1)
        Upper limit for ``width = num_nodes / height``, a good measure of
        parallelizability.
        dask.config key: ``optimization.fuse.ave-width``
    max_width: int (default infinite)
        Don't fuse if total width is greater than this.
        dask.config key: ``optimization.fuse.max-width``
    max_height: int or None (default None)
        Don't fuse more than this many levels. Set to None to dynamically
        adjust to ``1.5 + ave_width * log(ave_width + 1)``.
        dask.config key: ``optimization.fuse.max-height``
    max_depth_new_edges: int or None (default None)
        Don't fuse if new dependencies are added after this many levels.
        Set to None to dynamically adjust to ave_width * 1.5.
        dask.config key: ``optimization.fuse.max-depth-new-edges``
    rename_keys: bool or func, optional (default True)
        Whether to rename the fused keys with ``default_fused_keys_renamer``
        or not.  Renaming fused keys can keep the graph more understandable
        and comprehensible, but it comes at the cost of additional processing.
        If False, then the top-most key will be used.  For advanced usage, a
        function to create the new name is also accepted.
        dask.config key: ``optimization.fuse.rename-keys``
    fuse_subgraphs : bool or None, optional (default None)
        Whether to fuse multiple tasks into ``SubgraphCallable`` objects.
        Set to None to let the default optimizer of individual dask collections decide.
        If no collection-specific default exists, None defaults to False.
        dask.config key: ``optimization.fuse.subgraphs``

    Returns
    -------
    dsk
        output graph with keys fused
    dependencies
        dict mapping dependencies after fusion.  Useful side effect to accelerate other
        downstream optimizations.
    """

    # Perform low-level fusion unless the user has
    # specified False explicitly.
    if config.get("optimization.fuse.active") is False:
        return dsk, dependencies

    if keys is not None and not isinstance(keys, set):
        if not isinstance(keys, list):
            keys = [keys]
        keys = set(flatten(keys))

    # Read defaults from dask.yaml and/or user-defined config file
    if ave_width is _default:
        ave_width = config.get("optimization.fuse.ave-width")
        assert ave_width is not _default
    if max_height is _default:
        max_height = config.get("optimization.fuse.max-height")
        assert max_height is not _default
    if max_depth_new_edges is _default:
        max_depth_new_edges = config.get("optimization.fuse.max-depth-new-edges")
        assert max_depth_new_edges is not _default
    if max_depth_new_edges is None:
        max_depth_new_edges = ave_width * 1.5
    if max_width is _default:
        max_width = config.get("optimization.fuse.max-width")
        assert max_width is not _default
    if max_width is None:
        max_width = 1.5 + ave_width * math.log(ave_width + 1)
    if fuse_subgraphs is _default:
        fuse_subgraphs = config.get("optimization.fuse.subgraphs")
        assert fuse_subgraphs is not _default
    if fuse_subgraphs is None:
        fuse_subgraphs = False

    if not ave_width or not max_height:
        return dsk, dependencies

    if rename_keys is _default:
        rename_keys = config.get("optimization.fuse.rename-keys")
        assert rename_keys is not _default
    if rename_keys is True:
        key_renamer = default_fused_keys_renamer
    elif rename_keys is False:
        key_renamer = None
    elif not callable(rename_keys):
        raise TypeError("rename_keys must be a boolean or callable")
    else:
        key_renamer = rename_keys
    rename_keys = key_renamer is not None

    if dependencies is None:
        deps = {k: get_dependencies(dsk, k, as_list=True) for k in dsk}
    else:
        deps = dict(dependencies)

    rdeps = {}
    for k, vals in deps.items():
        for v in vals:
            if v not in rdeps:
                rdeps[v] = [k]
            else:
                rdeps[v].append(k)
        deps[k] = set(vals)

    reducible = {k for k, vals in rdeps.items() if len(vals) == 1}
    if keys:
        reducible -= keys

    for k, v in dsk.items():
        if type(v) is not tuple and not isinstance(v, (numbers.Number, str)):
            reducible.discard(k)

    if not reducible and (
        not fuse_subgraphs or all(len(set(v)) != 1 for v in rdeps.values())
    ):
        # Quick return if there's nothing to do. Only progress if there are
        # tasks fusible by the main `fuse`, or by `fuse_subgraphs` if enabled.
        return dsk, deps

    rv = dsk.copy()
    fused_trees = {}
    # These are the stacks we use to store data as we traverse the graph
    info_stack = []
    children_stack = []
    # For speed
    deps_pop = deps.pop
    reducible_add = reducible.add
    reducible_pop = reducible.pop
    reducible_remove = reducible.remove
    fused_trees_pop = fused_trees.pop
    info_stack_append = info_stack.append
    info_stack_pop = info_stack.pop
    children_stack_append = children_stack.append
    children_stack_extend = children_stack.extend
    children_stack_pop = children_stack.pop
    while reducible:
        parent = reducible_pop()
        reducible_add(parent)
        while parent in reducible:
            # Go to the top
            parent = rdeps[parent][0]
        children_stack_append(parent)
        children_stack_extend(reducible & deps[parent])
        while True:
            child = children_stack[-1]
            if child != parent:
                children = reducible & deps[child]
                while children:
                    # Depth-first search
                    children_stack_extend(children)
                    parent = child
                    child = children_stack[-1]
                    children = reducible & deps[child]
                children_stack_pop()
                # This is a leaf node in the reduction region
                # key, task, fused_keys, height, width, number of nodes, fudge, set of edges
                info_stack_append(
                    (
                        child,
                        rv[child],
                        [child] if rename_keys else None,
                        1,
                        1,
                        1,
                        0,
                        deps[child] - reducible,
                    )
                )
            else:
                children_stack_pop()
                # Calculate metrics and fuse as appropriate
                deps_parent = deps[parent]
                edges = deps_parent - reducible
                children = deps_parent - edges
                num_children = len(children)

                if num_children == 1:
                    (
                        child_key,
                        child_task,
                        child_keys,
                        height,
                        width,
                        num_nodes,
                        fudge,
                        children_edges,
                    ) = info_stack_pop()
                    num_children_edges = len(children_edges)

                    if fudge > num_children_edges - 1 >= 0:
                        fudge = num_children_edges - 1
                    edges |= children_edges
                    no_new_edges = len(edges) == num_children_edges
                    if not no_new_edges:
                        fudge += 1
                    if (
                        (num_nodes + fudge) / height <= ave_width
                        and
                        # Sanity check; don't go too deep if new levels introduce new edge dependencies
                        (no_new_edges or height < max_depth_new_edges)
                    ):
                        # Perform substitutions as we go
                        val = subs(dsk[parent], child_key, child_task)
                        deps_parent.remove(child_key)
                        deps_parent |= deps_pop(child_key)
                        del rv[child_key]
                        reducible_remove(child_key)
                        if rename_keys:
                            child_keys.append(parent)
                            fused_trees[parent] = child_keys
                            fused_trees_pop(child_key, None)

                        if children_stack:
                            if no_new_edges:
                                # Linear fuse
                                info_stack_append(
                                    (
                                        parent,
                                        val,
                                        child_keys,
                                        height,
                                        width,
                                        num_nodes,
                                        fudge,
                                        edges,
                                    )
                                )
                            else:
                                info_stack_append(
                                    (
                                        parent,
                                        val,
                                        child_keys,
                                        height + 1,
                                        width,
                                        num_nodes + 1,
                                        fudge,
                                        edges,
                                    )
                                )
                        else:
                            rv[parent] = val
                            break
                    else:
                        rv[child_key] = child_task
                        reducible_remove(child_key)
                        if children_stack:
                            # Allow the parent to be fused, but only under strict circumstances.
                            # Ensure that linear chains may still be fused.
                            if fudge > int(ave_width - 1):
                                fudge = int(ave_width - 1)
                            # This task *implicitly* depends on `edges`
                            info_stack_append(
                                (
                                    parent,
                                    rv[parent],
                                    [parent] if rename_keys else None,
                                    1,
                                    width,
                                    1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            break
                else:
                    child_keys = []
                    height = 1
                    width = 0
                    num_single_nodes = 0
                    num_nodes = 0
                    fudge = 0
                    children_edges = set()
                    max_num_edges = 0
                    children_info = info_stack[-num_children:]
                    del info_stack[-num_children:]
                    for (
                        cur_key,
                        cur_task,
                        cur_keys,
                        cur_height,
                        cur_width,
                        cur_num_nodes,
                        cur_fudge,
                        cur_edges,
                    ) in children_info:
                        if cur_height == 1:
                            num_single_nodes += 1
                        elif cur_height > height:
                            height = cur_height
                        width += cur_width
                        num_nodes += cur_num_nodes
                        fudge += cur_fudge
                        if len(cur_edges) > max_num_edges:
                            max_num_edges = len(cur_edges)
                        children_edges |= cur_edges
                    # Fudge factor to account for possible parallelism with the boundaries
                    num_children_edges = len(children_edges)
                    fudge += min(
                        num_children - 1, max(0, num_children_edges - max_num_edges)
                    )

                    if fudge > num_children_edges - 1 >= 0:
                        fudge = num_children_edges - 1
                    edges |= children_edges
                    no_new_edges = len(edges) == num_children_edges
                    if not no_new_edges:
                        fudge += 1
                    if (
                        (num_nodes + fudge) / height <= ave_width
                        and num_single_nodes <= ave_width
                        and width <= max_width
                        and height <= max_height
                        and
                        # Sanity check; don't go too deep if new levels introduce new edge dependencies
                        (no_new_edges or height < max_depth_new_edges)
                    ):
                        # Perform substitutions as we go
                        val = dsk[parent]
                        children_deps = set()
                        for child_info in children_info:
                            cur_child = child_info[0]
                            val = subs(val, cur_child, child_info[1])
                            del rv[cur_child]
                            children_deps |= deps_pop(cur_child)
                            reducible_remove(cur_child)
                            if rename_keys:
                                fused_trees_pop(cur_child, None)
                                child_keys.extend(child_info[2])
                        deps_parent -= children
                        deps_parent |= children_deps

                        if rename_keys:
                            child_keys.append(parent)
                            fused_trees[parent] = child_keys

                        if children_stack:
                            info_stack_append(
                                (
                                    parent,
                                    val,
                                    child_keys,
                                    height + 1,
                                    width,
                                    num_nodes + 1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            rv[parent] = val
                            break
                    else:
                        for child_info in children_info:
                            rv[child_info[0]] = child_info[1]
                            reducible_remove(child_info[0])
                        if children_stack:
                            # Allow the parent to be fused, but only under strict circumstances.
                            # Ensure that linear chains may still be fused.
                            if width > max_width:
                                width = max_width
                            if fudge > int(ave_width - 1):
                                fudge = int(ave_width - 1)
                            # key, task, height, width, number of nodes, fudge, set of edges
                            # This task *implicitly* depends on `edges`
                            info_stack_append(
                                (
                                    parent,
                                    rv[parent],
                                    [parent] if rename_keys else None,
                                    1,
                                    width,
                                    1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            break
                # Traverse upwards
                parent = rdeps[parent][0]

    if fuse_subgraphs:
        _inplace_fuse_subgraphs(rv, keys, deps, fused_trees, rename_keys)

    if key_renamer:
        for root_key, fused_keys in fused_trees.items():
            alias = key_renamer(fused_keys)
            if alias is not None and alias not in rv:
                rv[alias] = rv[root_key]
                rv[root_key] = alias
                deps[alias] = deps[root_key]
                deps[root_key] = {alias}

    return rv, deps
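The fragment above is the tail of a graph-fusion pass in the style of dask.optimization.fuse: it folds reducible linear chains (and bounded-width subtrees) into single tasks, then optionally renames and aliases the fused keys. A minimal usage sketch, assuming the fragment is indeed dask's fuse:

from dask.optimization import fuse  # assumed import; the fragment matches dask's fuse

def inc(x):
    return x + 1

dsk = {'a': 1, 'b': (inc, 'a'), 'c': (inc, 'b')}

# Fuse the linear chain a -> b -> c while keeping 'c' as an output key.
# fuse returns the rewritten graph together with its dependency mapping.
fused_dsk, deps = fuse(dsk, keys=['c'])
# With key renaming enabled (the default), the chain collapses into one
# task stored under a combined key, and 'c' becomes an alias for it.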
Code example #35
File: test_optimization.py Project: rubenvdg/dask
def with_deps(dsk):
    return dsk, {k: get_dependencies(dsk, k) for k in dsk}
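with_deps is a small test helper that pairs a graph with its computed dependency mapping, the (dsk, dependencies) shape several optimization routines accept. A quick illustration (inc is a stand-in increment function):

from dask.core import get_dependencies

def inc(x):
    return x + 1

dsk = {'x': 1, 'y': (inc, 'x')}
graph, deps = with_deps(dsk)
assert deps == {'x': set(), 'y': {'x'}}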
Code example #36
def start_state_from_dask(dsk, cache=None, sortkey=None):
    """Start state from a dask

    Examples
    --------
    >>> inc = lambda x: x + 1
    >>> add = lambda x, y: x + y
    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}  # doctest: +SKIP
    >>> from pprint import pprint  # doctest: +SKIP
    >>> pprint(start_state_from_dask(dsk))  # doctest: +SKIP
    {'cache': {'x': 1, 'y': 2},
     'dependencies': {'w': {'z', 'y'}, 'x': set(), 'y': set(), 'z': {'x'}},
     'dependents': defaultdict(None, {'w': set(), 'x': {'z'}, 'y': {'w'}, 'z': {'w'}}),
     'finished': set(),
     'ready': ['z'],
     'released': set(),
     'running': set(),
     'waiting': {'w': {'z'}},
     'waiting_data': {'x': {'z'}, 'y': {'w'}, 'z': {'w'}}}
    """
    if sortkey is None:
        sortkey = order(dsk).get
    if cache is None:
        cache = config.get("cache", None)
    if cache is None:
        cache = dict()
    data_keys = set()
    for k, v in dsk.items():
        if not has_tasks(dsk, v):
            cache[k] = v
            data_keys.add(k)

    dsk2 = dsk.copy()
    dsk2.update(cache)

    dependencies = {k: get_dependencies(dsk2, k) for k in dsk}
    waiting = {
        k: v.copy()
        for k, v in dependencies.items() if k not in data_keys
    }

    dependents = reverse_dict(dependencies)
    for a in cache:
        for b in dependents.get(a, ()):
            waiting[b].remove(a)
    waiting_data = {k: v.copy() for k, v in dependents.items() if v}

    ready_set = {k for k, v in waiting.items() if not v}
    ready = sorted(ready_set, key=sortkey, reverse=True)
    waiting = {k: v for k, v in waiting.items() if v}

    state = {
        "dependencies": dependencies,
        "dependents": dependents,
        "waiting": waiting,
        "waiting_data": waiting_data,
        "cache": cache,
        "ready": ready,
        "running": set(),
        "finished": set(),
        "released": set(),
    }

    return state
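A short driving example, assuming this helper lives where dask's single-machine scheduler keeps it (dask.local in current releases): the returned dict is the mutable state that a scheduling loop drains by popping keys from state['ready'].

from dask.local import start_state_from_dask  # assumed location of this helper

inc = lambda x: x + 1
add = lambda x, y: x + y
dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}

state = start_state_from_dask(dsk)
assert state['ready'] == ['z']           # only 'z' has all inputs cached
assert state['waiting'] == {'w': {'z'}}  # 'w' still waits on 'z'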
Code example #38
File: test_core.py Project: floriango/dask
def test_get_dependencies_nothing():
    with pytest.raises(ValueError):
        get_dependencies({})
Code example #39
File: scheduler.py Project: aterrel/distributed
def update_state(dsk, dependencies, dependents, held_data,
                 who_has, in_play,
                 waiting, waiting_data, new_dsk, new_keys):
    """ Update state given new dask graph and output keys

    This should operate in linear time relative to the size of edges of the
    added graph.  It assumes that the current runtime state is valid.
    """
    dsk.update(new_dsk)
    if not isinstance(new_keys, set):
        new_keys = set(new_keys)

    for key in new_dsk:  # add dependencies/dependents
        if key in dependencies:
            continue

        deps = get_dependencies(dsk, key)
        dependencies[key] = deps

        for dep in deps:
            if dep not in dependents:
                dependents[dep] = set()
            dependents[dep].add(key)

        if key not in dependents:
            dependents[key] = set()

    for key, value in new_dsk.items():  # add in remotedata
        vv, s = unpack_remotedata(value)
        if s:
            # TODO: check against in-memory, maybe add to in_play
            dsk[key] = vv
            dependencies[key] |= s
            for dep in s:
                if dep not in dependencies:
                    held_data.add(dep)
                    dependencies[dep] = set()
                if dep not in dependents:
                    dependents[dep] = set()
                dependents[dep].add(key)

    exterior = keys_outside_frontier(dsk, dependencies, new_keys, in_play)
    in_play |= exterior
    for key in exterior:
        deps = dependencies[key]
        waiting[key] = {d for d in deps if not (d in who_has and who_has[d])}
        for dep in deps:
            if dep not in waiting_data:
                waiting_data[dep] = set()
            waiting_data[dep].add(key)

        if key not in waiting_data:
            waiting_data[key] = set()

    held_data |= new_keys

    return {'dsk': dsk,
            'dependencies': dependencies,
            'dependents': dependents,
            'held_data': held_data,
            'waiting': waiting,
            'waiting_data': waiting_data}
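update_state relies on distributed-internal helpers (unpack_remotedata, keys_outside_frontier), so it is hard to run standalone. The core bookkeeping of its first loop can be distilled into a self-contained sketch; add_to_dependency_maps is a hypothetical name introduced here for illustration:

from dask.core import get_dependencies

def add_to_dependency_maps(dsk, dependencies, dependents, new_dsk):
    # Register each newly added task's dependency edges in both
    # directions, mirroring the first loop of update_state above.
    dsk.update(new_dsk)
    for key in new_dsk:
        if key in dependencies:
            continue
        deps = get_dependencies(dsk, key)
        dependencies[key] = deps
        for dep in deps:
            dependents.setdefault(dep, set()).add(key)
        dependents.setdefault(key, set())
    return dsk, dependencies, dependents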
Code example #40
def test_get_dependencies_list():
    dsk = {"x": 1, "y": 2, "z": ["x", [(inc, "y")]]}
    assert get_dependencies(dsk, "z") == set(["x", "y"])
    assert sorted(get_dependencies(dsk, "z", as_list=True)) == ["x", "y"]
Code example #41
File: test_core.py Project: ankravch/dask
def test_get_dependencies_list():
    dsk = {'x': 1, 'y': 2, 'z': ['x', [(inc, 'y')]]}
    assert get_dependencies(dsk, 'z') == {'x', 'y'}
Code example #42
File: test_core.py Project: benlewis-tes/dask
def test_get_dependencies_empty():
    dsk = {'x': (inc,)}
    assert get_dependencies(dsk, 'x') == set()
Code example #43
File: test_core.py Project: pwolfram/dask
def test_get_dependencies_nested():
    dsk = {'x': 1, 'y': 2, 'z': (add, (inc, [['x']]), 'y')}

    assert get_dependencies(dsk, 'z') == set(['x', 'y'])