Ejemplo n.º 1
0
    def graph(self, stages=None, from_directory=None):
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
        )

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = stages or self.stages(from_directory, check_dag=False)
        stages = [stage for stage in stages if stage]
        outs = []

        for stage in stages:
            for out in stage.outs:
                existing = [o.stage for o in outs if o.path == out.path]

                if existing:
                    stages = [stage.relpath, existing[0].relpath]
                    raise OutputDuplicationError(out.path, stages)

                outs.append(out)

        for stage in stages:
            path_dir = os.path.dirname(stage.path) + os.sep
            for out in outs:
                if path_dir.startswith(out.path + os.sep):
                    raise StagePathAsOutputError(stage.wdir, stage.relpath)

        # collect the whole DAG
        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)
            G_active.add_node(node, stage=stage)

            for dep in stage.deps:
                for out in outs:
                    if (
                        out.path != dep.path
                        and not dep.path.startswith(out.path + out.sep)
                        and not out.path.startswith(dep.path + dep.sep)
                    ):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)
                    if not stage.locked:
                        G_active.add_node(dep_node, stage=dep_stage)
                        G_active.add_edge(node, dep_node)

        self._check_cyclic_graph(G)

        return G, G_active
Ejemplo n.º 2
0
    def _collect_graph(self, stages=None):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, collect stages
                in the repository.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        stages = stages or self.stages
        stages = [stage for stage in stages if stage]
        outs = {}

        for stage in stages:
            for out in stage.outs:
                if out.path_info in outs:
                    dup_stages = [stage, outs[out.path_info].stage]
                    raise OutputDuplicationError(str(out), dup_stages)
                outs[out.path_info] = out

        for stage in stages:
            for out in stage.outs:
                for p in out.path_info.parents:
                    if p in outs:
                        raise OverlappingOutputPathsError(outs[p], out)

        for stage in stages:
            stage_path_info = PathInfo(stage.path)
            for p in chain([stage_path_info], stage_path_info.parents):
                if p in outs:
                    raise StagePathAsOutputError(stage, str(outs[p]))

        for stage in stages:
            G.add_node(stage)

            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                for out_path_info, out in outs.items():
                    if out_path_info.overlaps(dep.path_info):
                        G.add_node(out.stage)
                        G.add_edge(stage, out.stage)

        check_acyclic(G)

        return G
Ejemplo n.º 3
0
    def _collect_graph(self, stages):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, collect stages
                in the repository.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from pygtrie import Trie

        from dvc.exceptions import (
            OutputDuplicationError,
            OverlappingOutputPathsError,
            StagePathAsOutputError,
        )

        G = nx.DiGraph()
        stages = stages or self.stages
        outs = Trie()  # Use trie to efficiently find overlapping outs and deps

        for stage in filter(bool, stages):  # bug? not using it later
            for out in stage.outs:
                out_key = out.path_info.parts

                # Check for dup outs
                if out_key in outs:
                    dup_stages = [stage, outs[out_key].stage]
                    raise OutputDuplicationError(str(out), dup_stages)

                # Check for overlapping outs
                if outs.has_subtrie(out_key):
                    parent = out
                    overlapping = first(outs.values(prefix=out_key))
                else:
                    parent = outs.shortest_prefix(out_key).value
                    overlapping = out
                if parent and overlapping:
                    msg = ("Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                           "overlap. To avoid unpredictable behaviour, "
                           "rerun command with non overlapping outs paths."
                           ).format(
                               str(parent),
                               parent.stage.addressing,
                               str(overlapping),
                               overlapping.stage.addressing,
                           )
                    raise OverlappingOutputPathsError(parent, overlapping, msg)

                outs[out_key] = out

        for stage in stages:
            out = outs.shortest_prefix(PathInfo(stage.path).parts).value
            if out:
                raise StagePathAsOutputError(stage, str(out))

        # Building graph
        G.add_nodes_from(stages)
        for stage in stages:
            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                dep_key = dep.path_info.parts
                overlapping = [n.value for n in outs.prefixes(dep_key)]
                if outs.has_subtrie(dep_key):
                    overlapping.extend(outs.values(prefix=dep_key))

                G.add_edges_from((stage, out.stage) for out in overlapping)
        check_acyclic(G)

        return G
Ejemplo n.º 4
0
    def graph(self, stages=None, from_directory=None):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, use the ones
                on the `from_directory`.

            from_directory (str): directory where to look at for stages, if
                None is given, use the current working directory

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = stages or self.stages(from_directory, check_dag=False)
        stages = [stage for stage in stages if stage]
        outs = []

        for stage in stages:
            for out in stage.outs:
                existing = []
                for o in outs:
                    if o.path_info == out.path_info:
                        existing.append(o.stage)

                    in_o_dir = out.path_info.isin(o.path_info)
                    in_out_dir = o.path_info.isin(out.path_info)
                    if in_o_dir or in_out_dir:
                        raise OverlappingOutputPathsError(o, out)

                if existing:
                    stages = [stage.relpath, existing[0].relpath]
                    raise OutputDuplicationError(str(out), stages)

                outs.append(out)

        for stage in stages:
            stage_path_info = PathInfo(stage.path)
            for out in outs:
                if stage_path_info.isin(out.path_info):
                    raise StagePathAsOutputError(stage.wdir, stage.relpath)

        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)
            G_active.add_node(node, stage=stage)

            for dep in stage.deps:
                for out in outs:
                    if (out.path_info != dep.path_info
                            and not dep.path_info.isin(out.path_info)
                            and not out.path_info.isin(dep.path_info)):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)
                    if not stage.locked:
                        G_active.add_node(dep_node, stage=dep_stage)
                        G_active.add_edge(node, dep_node)

        self._check_cyclic_graph(G)

        return G, G_active
Ejemplo n.º 5
0
def build_graph(stages, outs_trie=None):
    """Generate a graph by using the given stages on the given directory

    The nodes of the graph are the stage's path relative to the root.

    Edges are created when the output of one stage is used as a
    dependency in other stage.

    The direction of the edges goes from the stage to its dependency:

    For example, running the following:

        $ dvc run -o A "echo A > A"
        $ dvc run -d A -o B "echo B > B"
        $ dvc run -d B -o C "echo C > C"

    Will create the following graph:

           ancestors <--
                       |
            C.dvc -> B.dvc -> A.dvc
            |          |
            |          --> descendants
            |
            ------- pipeline ------>
                       |
                       v
          (weakly connected components)

    Args:
        stages (list): used to build a graph from

    Raises:
        OutputDuplicationError: two outputs with the same path
        StagePathAsOutputError: stage inside an output directory
        OverlappingOutputPathsError: output inside output directory
        CyclicGraphError: resulting graph has cycles
    """
    import networkx as nx

    from dvc.exceptions import StagePathAsOutputError

    from ..path_info import PathInfo
    from .trie import build_outs_trie

    G = nx.DiGraph()

    # Use trie to efficiently find overlapping outs and deps
    outs_trie = outs_trie or build_outs_trie(stages)

    for stage in stages:
        out = outs_trie.shortest_prefix(PathInfo(stage.path).parts).value
        if out:
            raise StagePathAsOutputError(stage, str(out))

    # Building graph
    G.add_nodes_from(stages)
    for stage in stages:
        for dep in stage.deps:
            if dep.path_info is None:
                continue

            dep_key = dep.path_info.parts
            overlapping = [n.value for n in outs_trie.prefixes(dep_key)]
            if outs_trie.has_subtrie(dep_key):
                overlapping.extend(outs_trie.values(prefix=dep_key))

            G.add_edges_from((stage, out.stage) for out in overlapping)
    check_acyclic(G)

    return G