Example #1
0
def _rayify_task(task, key, deps):
    """
    Rayifies the given task, submitting it as a Ray task to the Ray cluster.

    Args:
        task: A Dask graph value, being either a literal, dependency key, Dask
            task, or a list thereof.
        key: The Dask graph key for the given task.
        deps: The dependencies of this task.

    Returns:
        A literal, a Ray object reference representing a submitted task, or a
        list thereof.
    """
    if isinstance(task, list):
        # Recursively rayify this list. This will still bottom out at the first
        # actual task encountered, inlining any tasks in that task's arguments.
        return [_rayify_task(t, deps) for t in task]
    elif istask(task):
        # Unpacks and repacks Ray object references and submits the task to the
        # Ray cluster for execution.
        func, args = task[0], task[1:]
        # If the function's arguments contain nested object references, we must
        # unpack said object references into a flat set of arguments so that
        # Ray properly tracks the object dependencies between Ray tasks.
        object_refs, repack = unpack_object_refs(args, deps)
        # Submit the task using a wrapper function.
        return dask_task_wrapper.remote(func, repack, *object_refs)
    elif not ishashable(task):
        return task
    elif task in deps:
        return deps[task]
    else:
        return task
Example #2
0
 def _get_arg(a):
     """Resolve one task argument against the enclosing graph ``dsk``.

     * A hashable argument that is a key of ``dsk`` is followed through any
       chain of keys until a non-key value is reached; that value is handed
       to ``_execute_task``.
     * A non-string object exposing ``__getitem__`` is processed by kind:
       a task is passed to ``spawn`` with its arguments resolved
       recursively, a dict has its values resolved, and a ``List`` /
       ``Tuple`` is rebuilt as the same type from resolved items.
     * Anything else is returned unchanged.
     """
     if ishashable(a) and a in dsk.keys():
         # Walk the alias chain until the value is no longer a graph key.
         resolved = dsk[a]
         while ishashable(resolved) and resolved in dsk.keys():
             resolved = dsk[resolved]
         return _execute_task(resolved)

     # Strings are subscriptable but are treated as plain values here.
     if isinstance(a, str) or not hasattr(a, "__getitem__"):
         return a

     if istask(a):
         # TODO: Handle `SubgraphCallable`, which may contain a dsk in it
         resolved_args = tuple(_get_arg(item) for item in a[1:])
         return spawn(a[0], args=resolved_args)
     if isinstance(a, dict):
         return {name: _get_arg(value) for name, value in a.items()}
     if isinstance(a, List) or isinstance(a, Tuple):
         container = type(a)
         return container(_get_arg(item) for item in a)
     return a
Example #3
0
    def find_symbol(self, arg):
        """Look up the symbol registered for a function argument.

        A hashable ``arg`` is first looked up in ``self.var_key_to_sym``; a
        non-None hit is returned directly. Otherwise it is looked up in
        ``self.func_key_to_sym`` and, on a non-None hit, the matching entry
        of ``self.func_sym_to_ret_sym`` (the symbol for that function's
        return value) is returned.

        Returns None when ``arg`` is unhashable or is not a known key.
        """
        # Unhashable arguments can never be dictionary keys.
        if not ishashable(arg):
            return None

        # Key of an input variable?
        var_sym = self.var_key_to_sym.get(arg, None)
        if var_sym is not None:
            return var_sym

        # Key of a registered function call?
        func_sym = self.func_key_to_sym.get(arg, None)
        if func_sym is None:
            return None

        # The symbol of interest is the function's return value.
        return self.func_sym_to_ret_sym[func_sym]
Example #4
0
def inline(dsk, keys=None, inline_constants=True, dependencies=None):
    """Build a new graph in which the selected keys are replaced by their values.

    When ``inline_constants`` is true, every key whose value is either
    another key of the graph or a non-task value without dependencies is
    inlined as well. The inlined keys themselves stay in the returned graph;
    run ``cull`` afterwards to drop them.

    ``dependencies`` may be supplied to avoid recomputing it; list-valued
    entries are converted to sets.

    Examples
    --------
    >>> def inc(x):
    ...     return x + 1

    >>> def add(x, y):
    ...     return x + y

    >>> d = {'x': 1, 'y': (inc, 'x'), 'z': (add, 'x', 'y')}
    >>> inline(d)       # doctest: +ELLIPSIS
    {'x': 1, 'y': (<function inc at ...>, 1), 'z': (<function add at ...>, 1, 'y')}

    >>> inline(d, keys='y') # doctest: +ELLIPSIS
    {'x': 1, 'y': (<function inc at ...>, 1), 'z': (<function add at ...>, 1, (<function inc at ...>, 1))}

    >>> inline(d, keys='y', inline_constants=False) # doctest: +ELLIPSIS
    {'x': 1, 'y': (<function inc at ...>, 'x'), 'z': (<function add at ...>, 'x', (<function inc at ...>, 'x'))}
    """
    # The set intersections below need set-valued dependencies.
    if dependencies and isinstance(next(iter(dependencies.values())), list):
        dependencies = {k: set(v) for k, v in dependencies.items()}

    keys = _flat_set(keys)

    if dependencies is None:
        dependencies = {k: get_dependencies(dsk, k) for k in dsk}

    if inline_constants:
        # Aliases of other keys and dependency-free non-task values count as
        # constants.
        for k, v in dsk.items():
            is_alias = ishashable(v) and v in dsk
            if is_alias or (not dependencies[k] and not istask(v)):
                keys.add(k)

    # Inlined keys may reference each other, so substitute in topological
    # order; every value stored in `keysubs` is then free of inlined keys.
    inlinable = {k: dsk[k] for k in keys if k in dsk}
    keysubs = {}
    for key in toposort(inlinable, dependencies=dependencies):
        expanded = dsk[key]
        for dep in keys & dependencies[key]:
            replacement = keysubs[dep] if dep in keysubs else dsk[dep]
            expanded = subs(expanded, dep, replacement)
        keysubs[key] = expanded

    # Assemble the result: inlined keys first, then every remaining key with
    # the substitutions applied to its value.
    dsk2 = dict(keysubs)
    for key, val in dsk.items():
        if key in dsk2:
            continue
        for item in keys & dependencies[key]:
            val = subs(val, item, keysubs[item])
        dsk2[key] = val
    return dsk2
Example #5
0
File: dot.py Project: m-rossi/dask
def _to_cytoscape_json(
    dsk,
    data_attributes=None,
    function_attributes=None,
    collapse_outputs=False,
    verbose=False,
    **kwargs,
):
    """
    Convert a dask graph to Cytoscape JSON:
    https://js.cytoscape.org/#notation/elements-json
    """
    nodes = []
    edges = []
    data = {"nodes": nodes, "edges": edges}

    data_attributes = data_attributes or {}
    function_attributes = function_attributes or {}

    seen = set()
    connected = set()

    for k, v in dsk.items():
        k_name = name(k)
        if istask(v):
            func_name = name((k, "function")) if not collapse_outputs else k_name
            if collapse_outputs or func_name not in seen:
                seen.add(func_name)
                attrs = function_attributes.get(k, {}).copy()
                nodes.append(
                    {
                        "data": {
                            "id": func_name,
                            "label": key_split(k),
                            "shape": "ellipse",
                            "color": "gray",
                            **attrs,
                        }
                    }
                )
            if not collapse_outputs:
                edges.append({"data": {"source": func_name, "target": k_name}})

                connected.add(func_name)
                connected.add(k_name)

            for dep in get_dependencies(dsk, k):
                dep_name = name(dep)
                if dep_name not in seen:
                    seen.add(dep_name)
                    attrs = data_attributes.get(dep, {}).copy()
                    nodes.append(
                        {
                            "data": {
                                "id": dep_name,
                                "label": box_label(dep, verbose),
                                "shape": "rectangle",
                                "color": "gray",
                                **attrs,
                            }
                        }
                    )
                edges.append(
                    {
                        "data": {
                            "source": dep_name,
                            "target": func_name,
                        }
                    }
                )
                connected.add(dep_name)
                connected.add(func_name)

        elif ishashable(v) and v in dsk:
            v_name = name(v)
            edges.append(
                {
                    "data": {
                        "source": v_name,
                        "target": k_name,
                    }
                }
            )
            connected.add(v_name)
            connected.add(k_name)

        if (not collapse_outputs or k_name in connected) and k_name not in seen:
            seen.add(k_name)
            attrs = data_attributes.get(k, {}).copy()
            nodes.append(
                {
                    "data": {
                        "id": k_name,
                        "label": box_label(k, verbose),
                        "shape": "rectangle",
                        "color": "gray",
                        **attrs,
                    }
                }
            )
    return data
Example #6
0
File: dot.py Project: m-rossi/dask
def to_graphviz(
    dsk,
    data_attributes=None,
    function_attributes=None,
    rankdir="BT",
    graph_attr=None,
    node_attr=None,
    edge_attr=None,
    collapse_outputs=False,
    verbose=False,
    **kwargs,
):
    """Render a dask graph as a ``graphviz.Digraph``.

    Parameters
    ----------
    dsk : mapping
        The dask graph to draw.
    data_attributes : dict, optional
        Per-key dicts of extra keyword arguments for the box node of that key.
    function_attributes : dict, optional
        Per-key dicts of extra keyword arguments for the circle node of the
        task stored under that key.
    rankdir : str
        Stored as the ``rankdir`` graph attribute.
    graph_attr, node_attr, edge_attr : dict, optional
        Attribute dicts forwarded to ``graphviz.Digraph``. The caller's
        ``graph_attr`` and ``node_attr`` are copied before being modified.
    collapse_outputs : bool
        When true a task and its output share one node, no function-to-output
        edge is drawn, and a key only gets a box if it takes part in an edge.
    verbose : bool
        Forwarded to ``box_label``.
    **kwargs
        Merged into the graph attributes.

    Returns
    -------
    The populated ``graphviz.Digraph``.
    """
    graphviz = import_required(
        "graphviz",
        "Drawing dask graphs with the graphviz engine requires the `graphviz` "
        "python library and the `graphviz` system library.\n\n"
        "Please either conda or pip install as follows:\n\n"
        "  conda install python-graphviz     # either conda install\n"
        "  python -m pip install graphviz    # or pip install and follow installation instructions",
    )

    data_attributes = data_attributes or {}
    function_attributes = function_attributes or {}
    # Copy the dicts that are written to below so that a dict passed in by the
    # caller is never modified as a side effect of drawing.
    graph_attr = dict(graph_attr or {})
    node_attr = dict(node_attr or {})
    edge_attr = edge_attr or {}

    graph_attr["rankdir"] = rankdir
    node_attr["fontname"] = "helvetica"

    graph_attr.update(kwargs)
    g = graphviz.Digraph(
        graph_attr=graph_attr, node_attr=node_attr, edge_attr=edge_attr
    )

    seen = set()  # node names already added to the graph
    connected = set()  # node names that are an endpoint of some edge

    for k, v in dsk.items():
        k_name = name(k)
        if istask(v):
            # With collapsed outputs the function node reuses the key's name.
            func_name = name((k, "function")) if not collapse_outputs else k_name
            if collapse_outputs or func_name not in seen:
                seen.add(func_name)
                attrs = function_attributes.get(k, {}).copy()
                attrs.setdefault("label", key_split(k))
                attrs.setdefault("shape", "circle")
                g.node(func_name, **attrs)
            if not collapse_outputs:
                g.edge(func_name, k_name)
                connected.add(func_name)
                connected.add(k_name)

            for dep in get_dependencies(dsk, k):
                dep_name = name(dep)
                if dep_name not in seen:
                    seen.add(dep_name)
                    attrs = data_attributes.get(dep, {}).copy()
                    attrs.setdefault("label", box_label(dep, verbose))
                    attrs.setdefault("shape", "box")
                    g.node(dep_name, **attrs)
                g.edge(dep_name, func_name)
                connected.add(dep_name)
                connected.add(func_name)

        elif ishashable(v) and v in dsk:
            # The value is another key of the graph: draw an alias edge.
            v_name = name(v)
            g.edge(v_name, k_name)
            connected.add(v_name)
            connected.add(k_name)

        if (not collapse_outputs or k_name in connected) and k_name not in seen:
            seen.add(k_name)
            attrs = data_attributes.get(k, {}).copy()
            attrs.setdefault("label", box_label(k, verbose))
            attrs.setdefault("shape", "box")
            g.node(k_name, **attrs)
    return g
Example #7
0
def _rayify_task(
        task,
        key,
        deps,
        ray_presubmit_cbs,
        ray_postsubmit_cbs,
        ray_pretask_cbs,
        ray_posttask_cbs,
):
    """
    Rayifies the given task, submitting it as a Ray task to the Ray cluster.

    Args:
        task (tuple): A Dask graph value, being either a literal, dependency
            key, Dask task, or a list thereof.
        key (str): The Dask graph key for the given task.
        deps (dict): The dependencies of this task.
        ray_presubmit_cbs (callable): Pre-task submission callbacks.
        ray_postsubmit_cbs (callable): Post-task submission callbacks.
        ray_pretask_cbs (callable): Pre-task execution callbacks.
        ray_posttask_cbs (callable): Post-task execution callbacks.

    Returns:
        A literal, a Ray object reference representing a submitted task, or a
        list thereof.
    """
    if isinstance(task, list):
        # Recursively rayify this list. This will still bottom out at the first
        # actual task encountered, inlining any tasks in that task's arguments.
        return [
            _rayify_task(
                t,
                key,
                deps,
                ray_presubmit_cbs,
                ray_postsubmit_cbs,
                ray_pretask_cbs,
                ray_posttask_cbs,
            ) for t in task
        ]
    elif istask(task):
        # Unpacks and repacks Ray object references and submits the task to the
        # Ray cluster for execution.
        if ray_presubmit_cbs is not None:
            alternate_returns = [
                cb(task, key, deps) for cb in ray_presubmit_cbs
            ]
            for alternate_return in alternate_returns:
                # We don't submit a Ray task if a presubmit callback returns
                # a non-`None` value, instead we return said value.
                # NOTE: This returns the first non-None presubmit callback
                # return value.
                if alternate_return is not None:
                    return alternate_return

        func, args = task[0], task[1:]
        if func is multiple_return_get:
            return _execute_task(task, deps)
        # If the function's arguments contain nested object references, we must
        # unpack said object references into a flat set of arguments so that
        # Ray properly tracks the object dependencies between Ray tasks.
        arg_object_refs, repack = unpack_object_refs(args, deps)
        # Submit the task using a wrapper function.
        object_refs = dask_task_wrapper.options(
            name=f"dask:{key!s}",
            num_returns=(1 if not isinstance(func, MultipleReturnFunc) else
                         func.num_returns),
        ).remote(
            func,
            repack,
            key,
            ray_pretask_cbs,
            ray_posttask_cbs,
            *arg_object_refs,
        )

        if ray_postsubmit_cbs is not None:
            for cb in ray_postsubmit_cbs:
                cb(task, key, deps, object_refs)

        return object_refs
    elif not ishashable(task):
        return task
    elif task in deps:
        return deps[task]
    else:
        return task
Example #8
0
def _add_logging(dsk, ignore=None):
    """
    Add logging to a Dask graph.

    Every task callable in the graph is replaced by a
    C{_RunWithEliotContext} wrapper carrying a task id serialized from the
    current action, a readable name for the key, and the names of the key's
    dependencies.

    @param dsk: The Dask graph.

    @param ignore: Not used by this function.

    @return: New Dask graph.
    """
    ctx = current_action()
    result = {}

    # Use topological sort to ensure Eliot actions are in logical order of
    # execution in Dask:
    keys = toposort(dsk)

    # Give each key a string name. Some keys are just aliases to other
    # keys, so make sure we have underlying key available. Later on might
    # want to shorten them as well.
    def simplify(k):
        if isinstance(k, str):
            return k
        try:
            parts = iter(k)
        except TypeError:
            # Scalar keys (e.g. ints or floats) cannot be iterated; fall back
            # to their plain string form instead of failing.
            return str(k)
        return "-".join(str(o) for o in parts)

    key_names = {}
    for key in keys:
        value = dsk[key]
        # Membership is tested against the graph itself (constant time)
        # rather than the sorted key list (linear per lookup).
        if not callable(value) and ishashable(value) and value in dsk:
            # It's an alias for another key; topological order guarantees
            # the target has already been named.
            key_names[key] = key_names[value]
        else:
            key_names[key] = simplify(key)

    # Values in the graph can be either:
    #
    # 1. A list of other values.
    # 2. A tuple, where first value might be a callable, aka a task.
    # 3. A literal of some sort.
    def maybe_wrap(key, value):
        if isinstance(value, list):
            return [maybe_wrap(key, v) for v in value]
        elif isinstance(value, tuple):
            if not value or not callable(value[0]):
                # Empty tuple or no leading callable, so nothing to wrap.
                return value
            func = value[0]
            args = value[1:]
            wrapped_func = _RunWithEliotContext(
                task_id=str(ctx.serialize_task_id(), "utf-8"),
                func=func,
                key=key_names[key],
                dependencies=[
                    key_names[k] for k in get_dependencies(dsk, key)
                ],
            )
            return (wrapped_func, ) + args
        else:
            return value

    # Replace function with wrapper that logs appropriate Action; iterate in
    # topological order so action task levels are in reasonable order.
    for key in keys:
        result[key] = maybe_wrap(key, dsk[key])

    assert set(result.keys()) == set(dsk.keys())
    return result
Example #9
0
def to_graphviz(
    dsk,
    data_attributes=None,
    function_attributes=None,
    rankdir="BT",
    graph_attr=None,
    node_attr=None,
    edge_attr=None,
    collapse_outputs=False,
    verbose=False,
    **kwargs,
):
    """Render a dask graph as a ``graphviz.Digraph``.

    Parameters
    ----------
    dsk : mapping
        The dask graph to draw.
    data_attributes : dict, optional
        Per-key dicts of extra keyword arguments for the box node of that key.
    function_attributes : dict, optional
        Per-key dicts of extra keyword arguments for the circle node of the
        task stored under that key.
    rankdir : str
        Stored as the ``rankdir`` graph attribute.
    graph_attr, node_attr, edge_attr : dict, optional
        Attribute dicts forwarded to ``graphviz.Digraph``. The caller's
        ``graph_attr`` and ``node_attr`` are copied before being modified.
    collapse_outputs : bool
        When true a task and its output share one node, no function-to-output
        edge is drawn, and a key only gets a box if it takes part in an edge.
    verbose : bool
        Forwarded to ``box_label``.
    **kwargs
        Merged into the graph attributes.

    Returns
    -------
    The populated ``graphviz.Digraph``.
    """
    data_attributes = data_attributes or {}
    function_attributes = function_attributes or {}
    # Copy the dicts that are written to below so that a dict passed in by the
    # caller is never modified as a side effect of drawing.
    graph_attr = dict(graph_attr or {})
    node_attr = dict(node_attr or {})
    edge_attr = edge_attr or {}

    graph_attr["rankdir"] = rankdir
    node_attr["fontname"] = "helvetica"

    graph_attr.update(kwargs)
    g = graphviz.Digraph(graph_attr=graph_attr,
                         node_attr=node_attr,
                         edge_attr=edge_attr)

    seen = set()  # node names already added to the graph
    connected = set()  # node names that are an endpoint of some edge

    for k, v in dsk.items():
        k_name = name(k)
        if istask(v):
            # With collapsed outputs the function node reuses the key's name.
            func_name = name(
                (k, "function")) if not collapse_outputs else k_name
            if collapse_outputs or func_name not in seen:
                seen.add(func_name)
                attrs = function_attributes.get(k, {}).copy()
                attrs.setdefault("label", key_split(k))
                attrs.setdefault("shape", "circle")
                g.node(func_name, **attrs)
            if not collapse_outputs:
                g.edge(func_name, k_name)
                connected.add(func_name)
                connected.add(k_name)

            for dep in get_dependencies(dsk, k):
                dep_name = name(dep)
                if dep_name not in seen:
                    seen.add(dep_name)
                    attrs = data_attributes.get(dep, {}).copy()
                    attrs.setdefault("label", box_label(dep, verbose))
                    attrs.setdefault("shape", "box")
                    g.node(dep_name, **attrs)
                g.edge(dep_name, func_name)
                connected.add(dep_name)
                connected.add(func_name)

        elif ishashable(v) and v in dsk:
            # The value is another key of the graph: draw an alias edge.
            v_name = name(v)
            g.edge(v_name, k_name)
            connected.add(v_name)
            connected.add(k_name)

        if (not collapse_outputs
                or k_name in connected) and k_name not in seen:
            seen.add(k_name)
            attrs = data_attributes.get(k, {}).copy()
            attrs.setdefault("label", box_label(k, verbose))
            attrs.setdefault("shape", "box")
            g.node(k_name, **attrs)
    return g