def to_networkx(d, data_attributes=None, function_attributes=None):
    if data_attributes is None:
        data_attributes = dict()
    if function_attributes is None:
        function_attributes = dict()

    g = nx.DiGraph()
    for k, v in sorted(d.items(), key=lambda x: x[0]):
        g.add_node(k, shape='box', **data_attributes.get(k, dict()))
        if istask(v):
            func, args = v[0], v[1:]
            func_node = make_hashable((v, 'function'))
            g.add_node(func_node,
                       shape='circle',
                       label=name(func),
                       **function_attributes.get(k, dict()))
            g.add_edge(func_node, k)
            for dep in sorted(get_dependencies(d, k)):
                arg2 = make_hashable(dep)
                g.add_node(arg2,
                           label=str(dep),
                           shape='box',
                           **data_attributes.get(dep, dict()))
                g.add_edge(arg2, func_node)
        else:
            v_hash = make_hashable(v)
            if v_hash not in d:
                g.add_node(k, label='%s=%s' % (k, v),
                           **data_attributes.get(k, dict()))
            else:  # alias situation
                g.add_edge(v_hash, k)
    return g
def to_networkx(d, data_attributes=None, function_attributes=None):
    if data_attributes is None:
        data_attributes = dict()
    if function_attributes is None:
        function_attributes = dict()

    g = nx.DiGraph()
    for k, v in sorted(d.items(), key=lambda x: x[0]):
        g.add_node(k, shape='box', **data_attributes.get(k, dict()))
        if istask(v):
            func, args = v[0], v[1:]
            func_node = make_hashable((v, 'function'))
            g.add_node(func_node,
                       shape='circle',
                       label=name(func),
                       **function_attributes.get(k, dict()))
            g.add_edge(func_node, k)
            for dep in sorted(get_dependencies(d, k)):
                arg2 = make_hashable(dep)
                g.add_node(arg2,
                           label=str(dep),
                           shape='box',
                           **data_attributes.get(dep, dict()))
                g.add_edge(arg2, func_node)
        else:
            if v not in d:
                g.add_node(k, label='%s=%s' % (k, v),
                           **data_attributes.get(k, dict()))
            else:  # alias situation
                g.add_edge(v, k)
    return g
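# A minimal, hedged sketch of exercising the first to_networkx variant above.
# It assumes the module-level helpers the function relies on (nx, istask,
# get_dependencies, make_hashable, name) are in scope; `inc` is a placeholder.
inc = lambda x: x + 1

dsk = {'x': 1, 'y': (inc, 'x')}
g = to_networkx(dsk)

# Data keys become 'box' nodes; each task contributes a 'circle' node for its
# function, wired dependency -> function -> output key.
print(g.number_of_nodes(), g.number_of_edges())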
def inline_functions(dsk, output, fast_functions=None, inline_constants=False,
                     dependencies=None):
    """Inline cheap functions into larger operations

    Examples
    --------
    >>> inc = lambda x: x + 1
    >>> add = lambda x, y: x + y
    >>> double = lambda x: x * 2
    >>> dsk = {'out': (add, 'i', 'd'),  # doctest: +SKIP
    ...        'i': (inc, 'x'),
    ...        'd': (double, 'y'),
    ...        'x': 1, 'y': 1}
    >>> inline_functions(dsk, [], [inc])  # doctest: +SKIP
    {'out': (add, (inc, 'x'), 'd'),
     'd': (double, 'y'),
     'x': 1, 'y': 1}

    Protect output keys.  In the example below ``i`` is not inlined because
    it is marked as an output key.

    >>> inline_functions(dsk, ['i', 'out'], [inc, double])  # doctest: +SKIP
    {'out': (add, 'i', (double, 'y')),
     'i': (inc, 'x'),
     'x': 1, 'y': 1}
    """
    if not fast_functions:
        return dsk

    output = set(output)

    fast_functions = set(fast_functions)

    if dependencies is None:
        dependencies = {k: get_dependencies(dsk, k) for k in dsk}
    dependents = reverse_dict(dependencies)

    def inlinable(v):
        try:
            return functions_of(v).issubset(fast_functions)
        except TypeError:
            return False

    keys = [
        k
        for k, v in dsk.items()
        if istask(v) and dependents[k] and k not in output and inlinable(v)
    ]

    if keys:
        dsk = inline(dsk, keys, inline_constants=inline_constants,
                     dependencies=dependencies)
        for k in keys:
            del dsk[k]

    return dsk
def test_get_dependencies_many():
    dsk = {'a': [1, 2, 3],
           'b': 'a',
           'c': [1, (inc, 1)],
           'd': [(sum, 'c')],
           'e': ['a', 'b', 'zzz'],
           'f': [['a', 'b'], 2, 3]}

    tasks = [dsk[k] for k in ('d', 'f')]
    s = get_dependencies(dsk, task=tasks)
    assert s == {'a', 'b', 'c'}
    s = get_dependencies(dsk, task=tasks, as_list=True)
    assert sorted(s) == ['a', 'b', 'c']

    s = get_dependencies(dsk, task=[])
    assert s == set()
    s = get_dependencies(dsk, task=[], as_list=True)
    assert s == []
def _add_logging(dsk, ignore=None):
    """
    Add logging to a Dask graph.

    @param dsk: The Dask graph.

    @return: New Dask graph.
    """
    ctx = current_action()
    result = {}

    # Use topological sort to ensure Eliot actions are in logical order of
    # execution in Dask:
    keys = toposort(dsk)

    # Give each key a string name. Some keys are just aliases to other
    # keys, so make sure we have underlying key available. Later on might
    # want to shorten them as well.
    def simplify(k):
        if isinstance(k, str):
            return k
        return "-".join(str(o) for o in k)

    key_names = {}
    for key in keys:
        value = dsk[key]
        if not callable(value) and value in keys:
            # It's an alias for another key:
            key_names[key] = key_names[value]
        else:
            key_names[key] = simplify(key)

    # 2. Create Eliot child Actions for each key, in topological order:
    key_to_action_id = {key: str(ctx.serialize_task_id(), "utf-8") for key in keys}

    # 3. Replace function with wrapper that logs appropriate Action:
    for key in keys:
        func = dsk[key][0]
        args = dsk[key][1:]
        if not callable(func):
            # This key is just an alias for another key, no need to add
            # logging:
            result[key] = dsk[key]
            continue
        wrapped_func = _RunWithEliotContext(
            task_id=key_to_action_id[key],
            func=func,
            key=key_names[key],
            dependencies=[key_names[k] for k in get_dependencies(dsk, key)],
        )
        result[key] = (wrapped_func,) + tuple(args)

    assert result.keys() == dsk.keys()
    return result
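# A hedged usage sketch for the Eliot integration above.  _add_logging must
# run inside an active Eliot action, since current_action() and
# serialize_task_id() both require one.  The scheduler choice (get_sync) and
# the placeholder graph are assumptions, not from the source.
from eliot import start_action
from dask.local import get_sync

inc = lambda x: x + 1
dsk = {'x': (lambda: 1,), 'y': (inc, 'x')}  # every value is a task tuple

with start_action(action_type="dask:compute"):
    logged = _add_logging(dsk)
    result = get_sync(logged, 'y')  # runs the wrapped, logging tasks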
def dask_to_digraph(dsk):
    from networkx import DiGraph
    from dask.core import get_dependencies

    g = DiGraph()
    for key, value in dsk.items():
        # networkx 2.x takes node attributes as keyword arguments, not as a
        # positional dict; the original also ran an identical add_node pass
        # twice, so one loop suffices.
        g.add_node(key, func=value)
        for dep in get_dependencies(dsk, key):
            g.add_edge(dep, key)
    return g
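# A quick check of the converter on a toy graph; `inc` and `add` are
# hypothetical one-liners.
import networkx as nx

inc = lambda x: x + 1
add = lambda x, y: x + y

dsk = {'x': 1, 'y': (inc, 'x'), 'z': (add, 'y', 10)}
g = dask_to_digraph(dsk)

# Edges point from dependency to dependent, so a topological sort yields a
# valid execution order, and each node carries its task under 'func'.
print(list(nx.topological_sort(g)))  # ['x', 'y', 'z']
print(g.nodes['z']['func'])          # (add, 'y', 10)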
def simple_vis(x, filename='simple', format=None):
    if hasattr(x, 'dask'):
        dsk = x._optimize(x.dask, x._keys())
    else:
        dsk = x

    deps = {k: get_dependencies(dsk, k) for k in dsk}

    g = graphviz.Digraph(graph_attr={'rankdir': 'LR'})

    nodes = set()
    edges = set()
    for k in dsk:
        key = node_key(k)
        if key not in nodes:
            g.node(key, label=key_split(k), shape='rectangle')
            nodes.add(key)
        for dep in deps[k]:
            dep_key = node_key(dep)
            if dep_key not in nodes:
                g.node(dep_key, label=key_split(dep), shape='rectangle')
                nodes.add(dep_key)
            # Avoid circular references
            if dep_key != key and (dep_key, key) not in edges:
                g.edge(dep_key, key)
                edges.add((dep_key, key))

    fmts = ['.png', '.pdf', '.dot', '.svg', '.jpeg', '.jpg']
    if format is None and any(filename.lower().endswith(fmt) for fmt in fmts):
        filename, format = os.path.splitext(filename)
        format = format[1:].lower()

    if format is None:
        format = 'png'

    data = g.pipe(format=format)
    if not data:
        raise RuntimeError("Graphviz failed to properly produce an image. "
                           "This probably means your installation of graphviz "
                           "is missing png support. See: "
                           "https://github.com/ContinuumIO/anaconda-issues/"
                           "issues/485 for more information.")

    display_cls = _get_display_cls(format)
    if not filename:
        return display_cls(data=data)

    full_filename = '.'.join([filename, format])
    with open(full_filename, 'wb') as f:
        f.write(data)

    return display_cls(filename=full_filename)
def cull(dsk, keys):
    """Return new dask with only the tasks required to calculate keys.

    In other words, remove unnecessary tasks from dask.
    ``keys`` may be a single key or list of keys.

    Examples
    --------
    >>> def inc(x):
    ...     return x + 1

    >>> def add(x, y):
    ...     return x + y

    >>> d = {'x': 1, 'y': (inc, 'x'), 'out': (add, 'x', 10)}
    >>> dsk, dependencies = cull(d, 'out')
    >>> dsk  # doctest: +ELLIPSIS
    {'out': (<function add at ...>, 'x', 10), 'x': 1}
    >>> dependencies  # doctest: +ELLIPSIS
    {'out': ['x'], 'x': []}

    Returns
    -------
    dsk: culled dask graph
    dependencies: Dict mapping {key: [deps]}.  Useful side effect to
        accelerate other optimizations, notably fuse.
    """
    if not isinstance(keys, (list, set)):
        keys = [keys]

    seen = set()
    dependencies = dict()
    out = {}
    work = list(set(flatten(keys)))

    while work:
        new_work = []
        for k in work:
            dependencies_k = get_dependencies(dsk, k, as_list=True)  # fuse needs lists
            out[k] = dsk[k]
            dependencies[k] = dependencies_k
            for d in dependencies_k:
                if d not in seen:
                    seen.add(d)
                    new_work.append(d)

        work = new_work

    return out, dependencies
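# The dependencies return value exists so downstream passes can skip
# recomputing them.  A hedged sketch of the usual cull -> fuse pairing,
# borrowing dask's bundled versions; the graph is made up.
from dask.optimization import cull, fuse

inc = lambda x: x + 1
add = lambda x, y: x + y

d = {'x': 1, 'y': (inc, 'x'), 'out': (add, 'y', 10), 'junk': (inc, 1)}

culled, deps = cull(d, 'out')  # drops 'junk', which 'out' never needs
fused, deps = fuse(culled, keys=['out'], dependencies=deps, ave_width=1)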
def maybe_wrap(key, value):
    if isinstance(value, list):
        return [maybe_wrap(key, v) for v in value]
    elif isinstance(value, tuple):
        func = value[0]
        args = value[1:]
        if not callable(func):
            # Not a callable, so nothing to wrap.
            return value
        wrapped_func = _RunWithEliotContext(
            task_id=str(ctx.serialize_task_id(), "utf-8"),
            func=func,
            key=key_names[key],
            dependencies=[key_names[k] for k in get_dependencies(dsk, key)],
        )
        return (wrapped_func,) + args
    else:
        return value
def visualize(self, x, filename="simple_computation_graph", format=None):
    if hasattr(x, "dask"):
        dsk = x.__dask_optimize__(x.dask, x.__dask_keys__())
    else:
        dsk = x

    deps = {k: get_dependencies(dsk, k) for k in dsk}

    g = graphviz.Digraph(graph_attr={"rankdir": "LR"})

    nodes = set()
    edges = set()
    for k in dsk:
        key = self._node_key(k)
        if key not in nodes:
            g.node(key, label=key_split(k), shape="rectangle")
            nodes.add(key)
        for dep in deps[k]:
            dep_key = self._node_key(dep)
            if dep_key not in nodes:
                g.node(dep_key, label=key_split(dep), shape="rectangle")
                nodes.add(dep_key)
            # Avoid circular references
            if dep_key != key and (dep_key, key) not in edges:
                g.edge(dep_key, key)
                edges.add((dep_key, key))

    data = g.pipe(format=self.format)
    display_cls = _get_display_cls(self.format)
    if self.filename is None:
        return display_cls(data=data)

    full_filename = ".".join([filename, self.format])
    with open(full_filename, "wb") as f:
        f.write(data)

    return display_cls(filename=full_filename)
def dask_to_dataflow(dsk):
    from dask.core import istask, get_dependencies

    g = DataflowEnvironment()
    for k, v in dsk.items():
        k_name = k
        g.add_node(k_name, v[0])
    for k, v in dsk.items():
        k_name = k
        if callable(v[0]):
            deps = list(get_dependencies(dsk, k))
            print(k_name, v[0], deps)
            g.add_node(k_name, v[0],
                       args=['args_{}'.format(i) for i, dep in enumerate(deps)])
            for i, dep in enumerate(deps):
                g.add_edge(dep, k_name, dfpd.OBJ_CALL_RETS, 'args_{}'.format(i))
    if 'sum_alldata' in g:
        g.add_edge_call_rets('sum_alldata')
    g.start()
    return g
def setup(self):
    cols = 1000
    rows = 10
    width = 3
    height = 2
    dsk = {}
    inline_keys = set()
    for r in range(rows):
        if r == 0:
            update = {("x", r, c): 0 for c in range(cols)}
            x = "x"
        elif not (r - 1) % (height + 1):
            # Start of a diamond
            update = {
                ("add", r, c, w): (add, (x, r - 1, c), w)
                for w in range(width)
                for c in range(cols)
            }
            x = "add"
        elif r % (height + 1):
            # In a diamond
            update = {
                ("inc", r, c, w): (inc, (x, r - 1, c, w))
                for w in range(width)
                for c in range(cols)
            }
            inline_keys.update(update)
            x = "inc"
        else:
            # End of a diamond
            update = {
                ("sum", r, c): (sum, [(x, r - 1, c, w) for w in range(width)])
                for c in range(cols)
            }
            x = "sum"
        dsk.update(update)
        keys = list(update)
    self.dsk = dsk
    self.keys = keys
    self.inline_keys = inline_keys
    self.deps = {k: get_dependencies(dsk, k, as_list=False) for k in dsk}
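# This setup looks like an asv-style benchmark fixture.  A hypothetical
# timing method such a class might pair with (the method name and the use of
# dask.optimization.inline are assumptions, not from the source):
def time_inline(self):
    # Reusing the precomputed dependency dict keeps the measurement focused
    # on inlining itself rather than on dependency discovery.
    inline(self.dsk, keys=self.inline_keys, dependencies=self.deps)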
def test_get_dependencies_list():
    dsk = {'x': 1, 'y': 2, 'z': ['x', [(inc, 'y')]]}
    assert get_dependencies(dsk, 'z') == set(['x', 'y'])
    assert sorted(get_dependencies(dsk, 'z', as_list=True)) == ['x', 'y']
def test_get_dependencies_empty():
    dsk = {'x': (inc,)}
    assert get_dependencies(dsk, 'x') == set()
    assert get_dependencies(dsk, 'x', as_list=True) == []
def test_get_dependencies_nested():
    dsk = {'x': 1, 'y': 2,
           'z': (add, (inc, [['x']]), 'y')}
    assert get_dependencies(dsk, 'z') == set(['x', 'y'])
    assert sorted(get_dependencies(dsk, 'z', as_list=True)) == ['x', 'y']
def test_get_dependencies_task():
    dsk = {"x": 1, "y": 2, "z": ["x", [(inc, "y")]]}
    assert get_dependencies(dsk, task=(inc, "x")) == set(["x"])
    assert get_dependencies(dsk, task=(inc, "x"), as_list=True) == ["x"]
def get_async(
    submit,
    num_workers,
    dsk,
    result,
    cache=None,
    get_id=default_get_id,
    rerun_exceptions_locally=None,
    pack_exception=default_pack_exception,
    raise_exception=reraise,
    callbacks=None,
    dumps=identity,
    loads=identity,
    chunksize=None,
    **kwargs,
):
    """Asynchronous get function

    This is a general version of various asynchronous schedulers for dask.  It
    takes a ``concurrent.futures.Executor.submit`` function to form a more
    specific ``get`` method that walks through the dask array with parallel
    workers, avoiding repeat computation and minimizing memory use.

    Parameters
    ----------
    submit : function
        A ``concurrent.futures.Executor.submit`` function
    num_workers : int
        The number of workers that task submissions can be spread over
    dsk : dict
        A dask dictionary specifying a workflow
    result : key or list of keys
        Keys corresponding to desired data
    cache : dict-like, optional
        Temporary storage of results
    get_id : callable, optional
        Function to return the worker id, takes no arguments. Examples are
        `threading.current_thread` and `multiprocessing.current_process`.
    rerun_exceptions_locally : bool, optional
        Whether to rerun failing tasks in local process to enable debugging
        (False by default)
    pack_exception : callable, optional
        Function to take an exception and ``dumps`` method, and return a
        serialized tuple of ``(exception, traceback)`` to send back to the
        scheduler. Default is to just raise the exception.
    raise_exception : callable, optional
        Function that takes an exception and a traceback, and raises an error.
    callbacks : tuple or list of tuples, optional
        Callbacks are passed in as tuples of length 5. Multiple sets of
        callbacks may be passed in as a list of tuples. For more information,
        see the dask.diagnostics documentation.
    dumps: callable, optional
        Function to serialize task data and results to communicate between
        worker and parent.  Defaults to identity.
    loads: callable, optional
        Inverse function of `dumps`.  Defaults to identity.
    chunksize: int, optional
        Size of chunks to use when dispatching work. Defaults to 1.
        If -1, will be computed to evenly divide ready work across workers.

    See Also
    --------
    threaded.get
    """
    chunksize = chunksize or config.get("chunksize", 1)

    queue = Queue()

    if isinstance(result, list):
        result_flat = set(flatten(result))
    else:
        result_flat = {result}
    results = set(result_flat)

    dsk = dict(dsk)
    with local_callbacks(callbacks) as callbacks:
        _, _, pretask_cbs, posttask_cbs, _ = unpack_callbacks(callbacks)
        started_cbs = []
        succeeded = False
        # if start_state_from_dask fails, we will have something
        # to pass to the final block.
        state = {}
        try:
            for cb in callbacks:
                if cb[0]:
                    cb[0](dsk)
                started_cbs.append(cb)

            keyorder = order(dsk)

            state = start_state_from_dask(dsk, cache=cache, sortkey=keyorder.get)

            for _, start_state, _, _, _ in callbacks:
                if start_state:
                    start_state(dsk, state)

            if rerun_exceptions_locally is None:
                rerun_exceptions_locally = config.get(
                    "rerun_exceptions_locally", False)

            if state["waiting"] and not state["ready"]:
                raise ValueError("Found no accessible jobs in dask")

            def fire_tasks(chunksize):
                """Fire off a task to the thread pool"""
                # Determine chunksize and/or number of tasks to submit
                nready = len(state["ready"])
                if chunksize == -1:
                    ntasks = nready
                    chunksize = -(ntasks // -num_workers)
                else:
                    used_workers = -(len(state["running"]) // -chunksize)
                    avail_workers = max(num_workers - used_workers, 0)
                    ntasks = min(nready, chunksize * avail_workers)

                # Prep all ready tasks for submission
                args = []
                for _ in range(ntasks):
                    # Get the next task to compute (most recently added)
                    key = state["ready"].pop()
                    # Notify task is running
                    state["running"].add(key)
                    for f in pretask_cbs:
                        f(key, dsk, state)

                    # Prep args to send
                    data = {
                        dep: state["cache"][dep]
                        for dep in get_dependencies(dsk, key)
                    }

                    args.append((
                        key,
                        dumps((dsk[key], data)),
                        dumps,
                        loads,
                        get_id,
                        pack_exception,
                    ))

                # Batch submit
                for i in range(-(len(args) // -chunksize)):
                    each_args = args[i * chunksize:(i + 1) * chunksize]
                    if not each_args:
                        break
                    fut = submit(batch_execute_tasks, each_args)
                    fut.add_done_callback(queue.put)

            # Main loop, wait on tasks to finish, insert new ones
            while state["waiting"] or state["ready"] or state["running"]:
                fire_tasks(chunksize)
                for key, res_info, failed in queue_get(queue).result():
                    if failed:
                        exc, tb = loads(res_info)
                        if rerun_exceptions_locally:
                            data = {
                                dep: state["cache"][dep]
                                for dep in get_dependencies(dsk, key)
                            }
                            task = dsk[key]
                            _execute_task(task, data)  # Re-execute locally
                        else:
                            raise_exception(exc, tb)
                    res, worker_id = loads(res_info)
                    state["cache"][key] = res
                    finish_task(dsk, key, state, results, keyorder.get)
                    for f in posttask_cbs:
                        f(key, res, dsk, state, worker_id)

            succeeded = True

        finally:
            for _, _, _, _, finish in started_cbs:
                if finish:
                    finish(dsk, state, not succeeded)

    return nested_get(result, state["cache"])
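# A hedged sketch of driving this get_async with a standard executor,
# mirroring how dask.threaded.get wires it up.  The import assumes dask's
# bundled copy; `inc` and `add` are placeholder one-liners.
from concurrent.futures import ThreadPoolExecutor

from dask.local import get_async

inc = lambda x: x + 1
add = lambda x, y: x + y

dsk = {"x": 1, "y": (inc, "x"), "z": (add, "y", 10)}

with ThreadPoolExecutor(max_workers=4) as pool:
    # submit is the executor's submit; num_workers bounds in-flight tasks.
    result = get_async(pool.submit, 4, dsk, "z")

print(result)  # 12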
def fuse_linear(dsk, keys=None, dependencies=None, rename_keys=True):
    """Return new dask graph with linear sequence of tasks fused together.

    If specified, the keys in ``keys`` keyword argument are *not* fused.
    Supply ``dependencies`` from output of ``cull`` if available to avoid
    recomputing dependencies.

    **This function is mostly superseded by ``fuse``**

    Parameters
    ----------
    dsk: dict
    keys: list
    dependencies: dict, optional
        {key: [list-of-keys]}.  Must be a list to provide count of each key
        This optional input often comes from ``cull``
    rename_keys: bool or func, optional
        Whether to rename fused keys with ``default_fused_linear_keys_renamer``
        or not.  Renaming fused keys can keep the graph more understandable and
        comprehensive, but it comes at the cost of additional processing. If
        False, then the top-most key will be used.  For advanced usage, a func
        is also accepted, ``new_key = rename_keys(fused_key_list)``.

    Examples
    --------
    >>> def inc(x):
    ...     return x + 1

    >>> def add(x, y):
    ...     return x + y

    >>> d = {'a': 1, 'b': (inc, 'a'), 'c': (inc, 'b')}
    >>> dsk, dependencies = fuse_linear(d)
    >>> dsk  # doctest: +SKIP
    {'a-b-c': (inc, (inc, 1)), 'c': 'a-b-c'}
    >>> dsk, dependencies = fuse_linear(d, rename_keys=False)
    >>> dsk  # doctest: +ELLIPSIS
    {'c': (<function inc at ...>, (<function inc at ...>, 1))}
    >>> dsk, dependencies = fuse_linear(d, keys=['b'], rename_keys=False)
    >>> dsk  # doctest: +ELLIPSIS
    {'b': (<function inc at ...>, 1), 'c': (<function inc at ...>, 'b')}

    Returns
    -------
    dsk: output graph with keys fused
    dependencies: dict mapping dependencies after fusion.  Useful side effect
        to accelerate other downstream optimizations.
    """
    if keys is not None and not isinstance(keys, set):
        if not isinstance(keys, list):
            keys = [keys]
        keys = set(flatten(keys))

    if dependencies is None:
        dependencies = {k: get_dependencies(dsk, k, as_list=True) for k in dsk}

    # locate all members of linear chains
    child2parent = {}
    unfusible = set()
    for parent in dsk:
        deps = dependencies[parent]
        has_many_children = len(deps) > 1
        for child in deps:
            if keys is not None and child in keys:
                unfusible.add(child)
            elif child in child2parent:
                del child2parent[child]
                unfusible.add(child)
            elif has_many_children:
                unfusible.add(child)
            elif child not in unfusible:
                child2parent[child] = parent

    # construct the chains from ancestor to descendant
    chains = []
    parent2child = dict(map(reversed, child2parent.items()))
    while child2parent:
        child, parent = child2parent.popitem()
        chain = [child, parent]
        while parent in child2parent:
            parent = child2parent.pop(parent)
            del parent2child[parent]
            chain.append(parent)
        chain.reverse()
        while child in parent2child:
            child = parent2child.pop(child)
            del child2parent[child]
            chain.append(child)
        chains.append(chain)

    dependencies = {k: set(v) for k, v in dependencies.items()}

    if rename_keys is True:
        key_renamer = default_fused_linear_keys_renamer
    elif rename_keys is False:
        key_renamer = None
    else:
        key_renamer = rename_keys

    # create a new dask with fused chains
    rv = {}
    fused = set()
    aliases = set()
    is_renamed = False
    for chain in chains:
        if key_renamer is not None:
            new_key = key_renamer(chain)
            is_renamed = (
                new_key is not None and new_key not in dsk and new_key not in rv
            )
        child = chain.pop()
        val = dsk[child]
        while chain:
            parent = chain.pop()
            dependencies[parent].update(dependencies.pop(child))
            dependencies[parent].remove(child)
            val = subs(dsk[parent], child, val)
            fused.add(child)
            child = parent
        fused.add(child)
        if is_renamed:
            rv[new_key] = val
            rv[child] = new_key
            dependencies[new_key] = dependencies[child]
            dependencies[child] = {new_key}
            aliases.add(child)
        else:
            rv[child] = val
    for key, val in dsk.items():
        if key not in fused:
            rv[key] = val
    if aliases:
        for key, deps in dependencies.items():
            for old_key in deps & aliases:
                new_key = rv[old_key]
                deps.remove(old_key)
                deps.add(new_key)
                rv[key] = subs(rv[key], old_key, new_key)
        if keys is not None:
            for key in aliases - keys:
                del rv[key]
                del dependencies[key]
    return rv, dependencies
def inline(dsk, keys=None, inline_constants=True, dependencies=None):
    """Return new dask with the given keys inlined with their values.

    Inlines all constants if ``inline_constants`` keyword is True. Note that
    the constant keys will remain in the graph, to remove them follow
    ``inline`` with ``cull``.

    Examples
    --------
    >>> def inc(x):
    ...     return x + 1

    >>> def add(x, y):
    ...     return x + y

    >>> d = {'x': 1, 'y': (inc, 'x'), 'z': (add, 'x', 'y')}
    >>> inline(d)  # doctest: +ELLIPSIS
    {'x': 1, 'y': (<function inc at ...>, 1), 'z': (<function add at ...>, 1, 'y')}

    >>> inline(d, keys='y')  # doctest: +ELLIPSIS
    {'x': 1, 'y': (<function inc at ...>, 1), 'z': (<function add at ...>, 1, (<function inc at ...>, 1))}

    >>> inline(d, keys='y', inline_constants=False)  # doctest: +ELLIPSIS
    {'x': 1, 'y': (<function inc at ...>, 'x'), 'z': (<function add at ...>, 'x', (<function inc at ...>, 'x'))}
    """
    if dependencies and isinstance(next(iter(dependencies.values())), list):
        dependencies = {k: set(v) for k, v in dependencies.items()}

    keys = _flat_set(keys)

    if dependencies is None:
        dependencies = {k: get_dependencies(dsk, k) for k in dsk}

    if inline_constants:
        keys.update(
            k
            for k, v in dsk.items()
            if (ishashable(v) and v in dsk) or (not dependencies[k] and not istask(v))
        )

    # Keys may depend on other keys, so determine replace order with toposort.
    # The values stored in `keysubs` do not include other keys.
    replaceorder = toposort(
        {k: dsk[k] for k in keys if k in dsk}, dependencies=dependencies
    )
    keysubs = {}
    for key in replaceorder:
        val = dsk[key]
        for dep in keys & dependencies[key]:
            if dep in keysubs:
                replace = keysubs[dep]
            else:
                replace = dsk[dep]
            val = subs(val, dep, replace)
        keysubs[key] = val

    # Make new dask with substitutions
    dsk2 = keysubs.copy()
    for key, val in dsk.items():
        if key not in dsk2:
            for item in keys & dependencies[key]:
                val = subs(val, item, keysubs[item])
            dsk2[key] = val
    return dsk2
def get_async(apply_async, num_workers, dsk, result, cache=None,
              get_id=default_get_id, rerun_exceptions_locally=None,
              pack_exception=default_pack_exception, raise_exception=reraise,
              callbacks=None, dumps=identity, loads=identity, **kwargs):
    """Asynchronous get function

    This is a general version of various asynchronous schedulers for dask.  It
    takes an ``apply_async`` function as found on Pool objects to form a more
    specific ``get`` method that walks through the dask array with parallel
    workers, avoiding repeat computation and minimizing memory use.

    Parameters
    ----------
    apply_async : function
        Asynchronous apply function as found on Pool or ThreadPool
    num_workers : int
        The number of active tasks we should have at any one time
    dsk : dict
        A dask dictionary specifying a workflow
    result : key or list of keys
        Keys corresponding to desired data
    cache : dict-like, optional
        Temporary storage of results
    get_id : callable, optional
        Function to return the worker id, takes no arguments. Examples are
        `threading.current_thread` and `multiprocessing.current_process`.
    rerun_exceptions_locally : bool, optional
        Whether to rerun failing tasks in local process to enable debugging
        (False by default)
    pack_exception : callable, optional
        Function to take an exception and ``dumps`` method, and return a
        serialized tuple of ``(exception, traceback)`` to send back to the
        scheduler. Default is to just raise the exception.
    raise_exception : callable, optional
        Function that takes an exception and a traceback, and raises an error.
    dumps: callable, optional
        Function to serialize task data and results to communicate between
        worker and parent.  Defaults to identity.
    loads: callable, optional
        Inverse function of `dumps`.  Defaults to identity.
    callbacks : tuple or list of tuples, optional
        Callbacks are passed in as tuples of length 5. Multiple sets of
        callbacks may be passed in as a list of tuples. For more information,
        see the dask.diagnostics documentation.

    See Also
    --------
    threaded.get
    """
    queue = Queue()

    if isinstance(result, list):
        result_flat = set(flatten(result))
    else:
        result_flat = {result}
    results = set(result_flat)

    dsk = dict(dsk)
    with local_callbacks(callbacks) as callbacks:
        _, _, pretask_cbs, posttask_cbs, _ = unpack_callbacks(callbacks)
        started_cbs = []
        succeeded = False
        # if start_state_from_dask fails, we will have something
        # to pass to the final block.
        state = {}
        try:
            for cb in callbacks:
                if cb[0]:
                    cb[0](dsk)
                started_cbs.append(cb)

            keyorder = order(dsk)

            state = start_state_from_dask(dsk, cache=cache, sortkey=keyorder.get)

            for _, start_state, _, _, _ in callbacks:
                if start_state:
                    start_state(dsk, state)

            if rerun_exceptions_locally is None:
                rerun_exceptions_locally = config.get(
                    "rerun_exceptions_locally", False)

            if state["waiting"] and not state["ready"]:
                raise ValueError("Found no accessible jobs in dask")

            def fire_task():
                """ Fire off a task to the thread pool """
                # Choose a good task to compute
                key = state["ready"].pop()
                state["running"].add(key)
                for f in pretask_cbs:
                    f(key, dsk, state)

                # Prep data to send
                data = {
                    dep: state["cache"][dep]
                    for dep in get_dependencies(dsk, key)
                }
                # Submit
                apply_async(
                    execute_task,
                    args=(
                        key,
                        dumps((dsk[key], data)),
                        dumps,
                        loads,
                        get_id,
                        pack_exception,
                    ),
                    callback=queue.put,
                )

            # Seed initial tasks into the thread pool
            while state["ready"] and len(state["running"]) < num_workers:
                fire_task()

            # Main loop, wait on tasks to finish, insert new ones
            while state["waiting"] or state["ready"] or state["running"]:
                key, res_info, failed = queue_get(queue)
                if failed:
                    exc, tb = loads(res_info)
                    if rerun_exceptions_locally:
                        data = {
                            dep: state["cache"][dep]
                            for dep in get_dependencies(dsk, key)
                        }
                        task = dsk[key]
                        _execute_task(task, data)  # Re-execute locally
                    else:
                        raise_exception(exc, tb)
                res, worker_id = loads(res_info)
                state["cache"][key] = res
                finish_task(dsk, key, state, results, keyorder.get)
                for f in posttask_cbs:
                    f(key, res, dsk, state, worker_id)
                while state["ready"] and len(state["running"]) < num_workers:
                    fire_task()

            succeeded = True

        finally:
            for _, _, _, _, finish in started_cbs:
                if finish:
                    finish(dsk, state, not succeeded)

    return nested_get(result, state["cache"])
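# For this older apply_async-based variant, multiprocessing's ThreadPool
# provides exactly the Pool interface it expects:
# apply_async(func, args=..., callback=...).  A hedged sketch with a
# placeholder `inc`, calling the get_async defined just above.
from multiprocessing.pool import ThreadPool

inc = lambda x: x + 1
dsk = {"x": 1, "y": (inc, "x")}

pool = ThreadPool(4)
try:
    result = get_async(pool.apply_async, 4, dsk, "y")
finally:
    pool.close()
    pool.join()

print(result)  # 2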
def test_get_dependencies_task_none():
    # Regression test for https://github.com/dask/distributed/issues/2756
    dsk = {"foo": None}
    assert get_dependencies(dsk, task=dsk["foo"]) == set()
def test_get_dependencies_nothing():
    with pytest.raises(ValueError):
        get_dependencies({})
def fuse(
    dsk,
    keys=None,
    dependencies=None,
    ave_width=_default,
    max_width=_default,
    max_height=_default,
    max_depth_new_edges=_default,
    rename_keys=_default,
    fuse_subgraphs=_default,
):
    """Fuse tasks that form reductions; more advanced than ``fuse_linear``

    This trades parallelism opportunities for faster scheduling by making
    tasks less granular.  It can replace ``fuse_linear`` in optimization
    passes.

    This optimization applies to all reductions--tasks that have at most one
    dependent--so it may be viewed as fusing "multiple input, single output"
    groups of tasks into a single task.  There are many parameters to fine
    tune the behavior, which are described below.  ``ave_width`` is the
    natural parameter with which to compare parallelism to granularity, so it
    should always be specified.  Reasonable values for other parameters will
    be determined using ``ave_width`` if necessary.

    Parameters
    ----------
    dsk: dict
        dask graph
    keys: list or set, optional
        Keys that must remain in the returned dask graph
    dependencies: dict, optional
        {key: [list-of-keys]}.  Must be a list to provide count of each key
        This optional input often comes from ``cull``
    ave_width: float (default 1)
        Upper limit for ``width = num_nodes / height``, a good measure of
        parallelizability.
        dask.config key: ``optimization.fuse.ave-width``
    max_width: int (default infinite)
        Don't fuse if total width is greater than this.
        dask.config key: ``optimization.fuse.max-width``
    max_height: int or None (default None)
        Don't fuse more than this many levels. Set to None to dynamically
        adjust to ``1.5 + ave_width * log(ave_width + 1)``.
        dask.config key: ``optimization.fuse.max-height``
    max_depth_new_edges: int or None (default None)
        Don't fuse if new dependencies are added after this many levels.
        Set to None to dynamically adjust to ave_width * 1.5.
        dask.config key: ``optimization.fuse.max-depth-new-edges``
    rename_keys: bool or func, optional (default True)
        Whether to rename the fused keys with ``default_fused_keys_renamer``
        or not.  Renaming fused keys can keep the graph more understandable
        and comprehensive, but it comes at the cost of additional processing.
        If False, then the top-most key will be used.  For advanced usage, a
        function to create the new name is also accepted.
        dask.config key: ``optimization.fuse.rename-keys``
    fuse_subgraphs : bool or None, optional (default None)
        Whether to fuse multiple tasks into ``SubgraphCallable`` objects.
        Set to None to let the default optimizer of individual dask
        collections decide.  If no collection-specific default exists, None
        defaults to False.
        dask.config key: ``optimization.fuse.subgraphs``

    Returns
    -------
    dsk
        output graph with keys fused
    dependencies
        dict mapping dependencies after fusion.  Useful side effect to
        accelerate other downstream optimizations.
    """
    # Perform low-level fusion unless the user has
    # specified False explicitly.
    if config.get("optimization.fuse.active") is False:
        return dsk, dependencies

    if keys is not None and not isinstance(keys, set):
        if not isinstance(keys, list):
            keys = [keys]
        keys = set(flatten(keys))

    # Read defaults from dask.yaml and/or user-defined config file
    if ave_width is _default:
        ave_width = config.get("optimization.fuse.ave-width")
        assert ave_width is not _default
    if max_height is _default:
        max_height = config.get("optimization.fuse.max-height")
        assert max_height is not _default
    if max_depth_new_edges is _default:
        max_depth_new_edges = config.get("optimization.fuse.max-depth-new-edges")
        assert max_depth_new_edges is not _default
    if max_depth_new_edges is None:
        max_depth_new_edges = ave_width * 1.5
    if max_width is _default:
        max_width = config.get("optimization.fuse.max-width")
        assert max_width is not _default
    if max_width is None:
        max_width = 1.5 + ave_width * math.log(ave_width + 1)
    if fuse_subgraphs is _default:
        fuse_subgraphs = config.get("optimization.fuse.subgraphs")
        assert fuse_subgraphs is not _default
    if fuse_subgraphs is None:
        fuse_subgraphs = False

    if not ave_width or not max_height:
        return dsk, dependencies

    if rename_keys is _default:
        rename_keys = config.get("optimization.fuse.rename-keys")
        assert rename_keys is not _default
    if rename_keys is True:
        key_renamer = default_fused_keys_renamer
    elif rename_keys is False:
        key_renamer = None
    elif not callable(rename_keys):
        raise TypeError("rename_keys must be a boolean or callable")
    else:
        key_renamer = rename_keys
    rename_keys = key_renamer is not None

    if dependencies is None:
        deps = {k: get_dependencies(dsk, k, as_list=True) for k in dsk}
    else:
        deps = dict(dependencies)

    rdeps = {}
    for k, vals in deps.items():
        for v in vals:
            if v not in rdeps:
                rdeps[v] = [k]
            else:
                rdeps[v].append(k)
        deps[k] = set(vals)

    reducible = {k for k, vals in rdeps.items() if len(vals) == 1}
    if keys:
        reducible -= keys

    for k, v in dsk.items():
        if type(v) is not tuple and not isinstance(v, (numbers.Number, str)):
            reducible.discard(k)

    if not reducible and (
        not fuse_subgraphs or all(len(set(v)) != 1 for v in rdeps.values())
    ):
        # Quick return if there's nothing to do. Only progress if there's tasks
        # fusible by the main `fuse`, or by `fuse_subgraphs` if enabled.
        return dsk, deps

    rv = dsk.copy()
    fused_trees = {}
    # These are the stacks we use to store data as we traverse the graph
    info_stack = []
    children_stack = []
    # For speed
    deps_pop = deps.pop
    reducible_add = reducible.add
    reducible_pop = reducible.pop
    reducible_remove = reducible.remove
    fused_trees_pop = fused_trees.pop
    info_stack_append = info_stack.append
    info_stack_pop = info_stack.pop
    children_stack_append = children_stack.append
    children_stack_extend = children_stack.extend
    children_stack_pop = children_stack.pop
    while reducible:
        parent = reducible_pop()
        reducible_add(parent)
        while parent in reducible:
            # Go to the top
            parent = rdeps[parent][0]
        children_stack_append(parent)
        children_stack_extend(reducible & deps[parent])
        while True:
            child = children_stack[-1]
            if child != parent:
                children = reducible & deps[child]
                while children:
                    # Depth-first search
                    children_stack_extend(children)
                    parent = child
                    child = children_stack[-1]
                    children = reducible & deps[child]
                children_stack_pop()
                # This is a leaf node in the reduction region
                # key, task, fused_keys, height, width, number of nodes,
                # fudge, set of edges
                info_stack_append(
                    (
                        child,
                        rv[child],
                        [child] if rename_keys else None,
                        1,
                        1,
                        1,
                        0,
                        deps[child] - reducible,
                    )
                )
            else:
                children_stack_pop()
                # Calculate metrics and fuse as appropriate
                deps_parent = deps[parent]
                edges = deps_parent - reducible
                children = deps_parent - edges
                num_children = len(children)

                if num_children == 1:
                    (
                        child_key,
                        child_task,
                        child_keys,
                        height,
                        width,
                        num_nodes,
                        fudge,
                        children_edges,
                    ) = info_stack_pop()
                    num_children_edges = len(children_edges)

                    if fudge > num_children_edges - 1 >= 0:
                        fudge = num_children_edges - 1
                    edges |= children_edges
                    no_new_edges = len(edges) == num_children_edges
                    if not no_new_edges:
                        fudge += 1
                    if (
                        (num_nodes + fudge) / height <= ave_width
                        and
                        # Sanity check; don't go too deep if new levels
                        # introduce new edge dependencies
                        (no_new_edges or height < max_depth_new_edges)
                    ):
                        # Perform substitutions as we go
                        val = subs(dsk[parent], child_key, child_task)
                        deps_parent.remove(child_key)
                        deps_parent |= deps_pop(child_key)
                        del rv[child_key]
                        reducible_remove(child_key)
                        if rename_keys:
                            child_keys.append(parent)
                            fused_trees[parent] = child_keys
                            fused_trees_pop(child_key, None)

                        if children_stack:
                            if no_new_edges:
                                # Linear fuse
                                info_stack_append(
                                    (
                                        parent,
                                        val,
                                        child_keys,
                                        height,
                                        width,
                                        num_nodes,
                                        fudge,
                                        edges,
                                    )
                                )
                            else:
                                info_stack_append(
                                    (
                                        parent,
                                        val,
                                        child_keys,
                                        height + 1,
                                        width,
                                        num_nodes + 1,
                                        fudge,
                                        edges,
                                    )
                                )
                        else:
                            rv[parent] = val
                            break
                    else:
                        rv[child_key] = child_task
                        reducible_remove(child_key)
                        if children_stack:
                            # Allow the parent to be fused, but only under
                            # strict circumstances.  Ensure that linear chains
                            # may still be fused.
                            if fudge > int(ave_width - 1):
                                fudge = int(ave_width - 1)
                            # This task *implicitly* depends on `edges`
                            info_stack_append(
                                (
                                    parent,
                                    rv[parent],
                                    [parent] if rename_keys else None,
                                    1,
                                    width,
                                    1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            break
                else:
                    child_keys = []
                    height = 1
                    width = 0
                    num_single_nodes = 0
                    num_nodes = 0
                    fudge = 0
                    children_edges = set()
                    max_num_edges = 0
                    children_info = info_stack[-num_children:]
                    del info_stack[-num_children:]
                    for (
                        cur_key,
                        cur_task,
                        cur_keys,
                        cur_height,
                        cur_width,
                        cur_num_nodes,
                        cur_fudge,
                        cur_edges,
                    ) in children_info:
                        if cur_height == 1:
                            num_single_nodes += 1
                        elif cur_height > height:
                            height = cur_height
                        width += cur_width
                        num_nodes += cur_num_nodes
                        fudge += cur_fudge
                        if len(cur_edges) > max_num_edges:
                            max_num_edges = len(cur_edges)
                        children_edges |= cur_edges

                    # Fudge factor to account for possible parallelism with
                    # the boundaries
                    num_children_edges = len(children_edges)
                    fudge += min(
                        num_children - 1, max(0, num_children_edges - max_num_edges)
                    )

                    if fudge > num_children_edges - 1 >= 0:
                        fudge = num_children_edges - 1
                    edges |= children_edges
                    no_new_edges = len(edges) == num_children_edges
                    if not no_new_edges:
                        fudge += 1
                    if (
                        (num_nodes + fudge) / height <= ave_width
                        and num_single_nodes <= ave_width
                        and width <= max_width
                        and height <= max_height
                        and
                        # Sanity check; don't go too deep if new levels
                        # introduce new edge dependencies
                        (no_new_edges or height < max_depth_new_edges)
                    ):
                        # Perform substitutions as we go
                        val = dsk[parent]
                        children_deps = set()
                        for child_info in children_info:
                            cur_child = child_info[0]
                            val = subs(val, cur_child, child_info[1])
                            del rv[cur_child]
                            children_deps |= deps_pop(cur_child)
                            reducible_remove(cur_child)
                            if rename_keys:
                                fused_trees_pop(cur_child, None)
                                child_keys.extend(child_info[2])
                        deps_parent -= children
                        deps_parent |= children_deps

                        if rename_keys:
                            child_keys.append(parent)
                            fused_trees[parent] = child_keys

                        if children_stack:
                            info_stack_append(
                                (
                                    parent,
                                    val,
                                    child_keys,
                                    height + 1,
                                    width,
                                    num_nodes + 1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            rv[parent] = val
                            break
                    else:
                        for child_info in children_info:
                            rv[child_info[0]] = child_info[1]
                            reducible_remove(child_info[0])
                        if children_stack:
                            # Allow the parent to be fused, but only under
                            # strict circumstances.  Ensure that linear chains
                            # may still be fused.
                            if width > max_width:
                                width = max_width
                            if fudge > int(ave_width - 1):
                                fudge = int(ave_width - 1)
                            # key, task, height, width, number of nodes,
                            # fudge, set of edges
                            # This task *implicitly* depends on `edges`
                            info_stack_append(
                                (
                                    parent,
                                    rv[parent],
                                    [parent] if rename_keys else None,
                                    1,
                                    width,
                                    1,
                                    fudge,
                                    edges,
                                )
                            )
                        else:
                            break
                # Traverse upwards
                parent = rdeps[parent][0]

    if fuse_subgraphs:
        _inplace_fuse_subgraphs(rv, keys, deps, fused_trees, rename_keys)

    if key_renamer:
        for root_key, fused_keys in fused_trees.items():
            alias = key_renamer(fused_keys)
            if alias is not None and alias not in rv:
                rv[alias] = rv[root_key]
                rv[root_key] = alias
                deps[alias] = deps[root_key]
                deps[root_key] = {alias}

    return rv, deps
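# A minimal, hedged exercise of fuse on a small chain, borrowing dask's
# bundled version (it reads the dask config defaults described above).
from dask.optimization import fuse

inc = lambda x: x + 1
dsk = {'a': 1, 'b': (inc, 'a'), 'c': (inc, 'b')}

# 'a' and 'b' each have a single dependent, so the chain collapses into one
# task; with rename_keys on, 'c' becomes an alias for the fused key.
fused, deps = fuse(dsk, keys=['c'], ave_width=1)
print(fused)  # e.g. {'a-b-c': (inc, (inc, 1)), 'c': 'a-b-c'}; exact names vary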
def with_deps(dsk):
    return dsk, {k: get_dependencies(dsk, k) for k in dsk}
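# This helper pairs a graph with its dependency dict, matching the
# (dsk, dependencies) shape that cull and fuse return.  For example, with a
# placeholder `inc`:
inc = lambda x: x + 1
dsk = {'x': 1, 'y': (inc, 'x')}
graph, deps = with_deps(dsk)
assert deps == {'x': set(), 'y': {'x'}}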
def start_state_from_dask(dsk, cache=None, sortkey=None):
    """Start state from a dask

    Examples
    --------
    >>> inc = lambda x: x + 1
    >>> add = lambda x, y: x + y
    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}  # doctest: +SKIP
    >>> from pprint import pprint  # doctest: +SKIP
    >>> pprint(start_state_from_dask(dsk))  # doctest: +SKIP
    {'cache': {'x': 1, 'y': 2},
     'dependencies': {'w': {'z', 'y'},
                      'x': set(),
                      'y': set(),
                      'z': {'x'}},
     'dependents': defaultdict(None, {'w': set(), 'x': {'z'}, 'y': {'w'}, 'z': {'w'}}),
     'finished': set(),
     'ready': ['z'],
     'released': set(),
     'running': set(),
     'waiting': {'w': {'z'}},
     'waiting_data': {'x': {'z'}, 'y': {'w'}, 'z': {'w'}}}
    """
    if sortkey is None:
        sortkey = order(dsk).get
    if cache is None:
        cache = config.get("cache", None)
    if cache is None:
        cache = dict()
    data_keys = set()
    for k, v in dsk.items():
        if not has_tasks(dsk, v):
            cache[k] = v
            data_keys.add(k)

    dsk2 = dsk.copy()
    dsk2.update(cache)

    dependencies = {k: get_dependencies(dsk2, k) for k in dsk}
    waiting = {
        k: v.copy() for k, v in dependencies.items() if k not in data_keys
    }

    dependents = reverse_dict(dependencies)
    for a in cache:
        for b in dependents.get(a, ()):
            waiting[b].remove(a)
    waiting_data = {k: v.copy() for k, v in dependents.items() if v}

    ready_set = {k for k, v in waiting.items() if not v}
    ready = sorted(ready_set, key=sortkey, reverse=True)
    waiting = {k: v for k, v in waiting.items() if v}

    state = {
        "dependencies": dependencies,
        "dependents": dependents,
        "waiting": waiting,
        "waiting_data": waiting_data,
        "cache": cache,
        "ready": ready,
        "running": set(),
        "finished": set(),
        "released": set(),
    }

    return state
def update_state(dsk, dependencies, dependents, held_data,
                 who_has, in_play,
                 waiting, waiting_data, new_dsk, new_keys):
    """ Update state given new dask graph and output keys

    This should operate in linear time relative to the size of edges of the
    added graph.  It assumes that the current runtime state is valid.
    """
    dsk.update(new_dsk)
    if not isinstance(new_keys, set):
        new_keys = set(new_keys)

    for key in new_dsk:  # add dependencies/dependents
        if key in dependencies:
            continue

        deps = get_dependencies(dsk, key)
        dependencies[key] = deps

        for dep in deps:
            if dep not in dependents:
                dependents[dep] = set()
            dependents[dep].add(key)

        if key not in dependents:
            dependents[key] = set()

    for key, value in new_dsk.items():  # add in remotedata
        vv, s = unpack_remotedata(value)
        if s:
            # TODO: check against in-memory, maybe add to in_play
            dsk[key] = vv
            dependencies[key] |= s
            for dep in s:
                if dep not in dependencies:
                    held_data.add(dep)
                    dependencies[dep] = set()
                if dep not in dependents:
                    dependents[dep] = set()
                dependents[dep].add(key)

    exterior = keys_outside_frontier(dsk, dependencies, new_keys, in_play)
    in_play |= exterior
    for key in exterior:
        deps = dependencies[key]
        waiting[key] = {d for d in deps if not (d in who_has and who_has[d])}
        for dep in deps:
            if dep not in waiting_data:
                waiting_data[dep] = set()
            waiting_data[dep].add(key)

        if key not in waiting_data:
            waiting_data[key] = set()

    held_data |= new_keys

    return {'dsk': dsk,
            'dependencies': dependencies,
            'dependents': dependents,
            'held_data': held_data,
            'waiting': waiting,
            'waiting_data': waiting_data}