Beispiel #1
0
    def metadata(self, path):
        """Return metadata for ``path`` combined from both underlying trees.

        The path is made absolute, then looked up in the DVC tree (when one
        is returned by ``_get_tree_pair``) and stat-ed in the regular tree.
        DVC metadata is preferred when present; the ``stat`` result only
        contributes the directory flag and, without DVC metadata, the
        executable flag.

        Raises:
            FileNotFoundError: neither the ``stat`` call nor the DVC tree
                produced a result for the path.
        """
        abs_info = PathInfo(os.path.abspath(path))
        tree, dvc_tree = self._get_tree_pair(abs_info)

        # A path missing from the DVC tree is not an error at this point.
        dvc_meta = None
        if dvc_tree:
            with suppress(OutputNotFoundError):
                dvc_meta = dvc_tree.metadata(abs_info)

        try:
            st = tree.stat(abs_info)
        except FileNotFoundError:
            st = None

        if not (st or dvc_meta):
            raise FileNotFoundError

        meta = dvc_meta if dvc_meta else Metadata(path_info=abs_info)

        is_dir_on_tree = stat.S_ISDIR(st.st_mode) if st else False
        meta.isdir = meta.isdir or is_dir_on_tree

        # Only fill in the executable bit when DVC did not supply metadata.
        if not dvc_meta:
            meta.is_exec = is_exec(st.st_mode) if st else False
        return meta
Beispiel #2
0
    def get(cls, url, src, out=None, version=None):
        """Fetch ``src`` from the package at ``url`` and check it out to ``out``.

        The package is installed into a temporary directory created next to
        the output, the single output matching ``src`` is fetched and checked
        out at ``out``, and the temporary directory is removed afterwards.

        Args:
            url: location of the package, passed to ``Pkg``.
            src: path of the output inside the package.
            out: destination path; defaults to the basename of ``src``.
            version: package version, passed to ``Pkg``.
        """
        if not out:
            out = os.path.basename(src)

        # Creating a directory right beside the output to make sure that they
        # are on the same filesystem, so we could take the advantage of
        # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
        # because it will create a symlink to tmpfs, which defeats the purpose
        # and won't work with reflink/hardlink.
        dpath = os.path.dirname(os.path.abspath(out))
        tmp_dir = os.path.join(dpath, "." + str(uuid.uuid4()))
        try:
            pkg = Pkg(tmp_dir, url=url, version=version)
            pkg.install()
            # Try any links possible to avoid data duplication.
            #
            # Not using symlink, because we need to remove cache after we are
            # done, and to make that work we would have to copy data over
            # anyway before removing the cache, so we might just copy it
            # right away.
            #
            # Also, we can't use theoretical "move" link type here, because
            # the same cache file might be used a few times in a directory.
            pkg.repo.config.set(
                Config.SECTION_CACHE,
                Config.SECTION_CACHE_TYPE,
                "reflink,hardlink,copy",
            )
            src = os.path.join(pkg.path, urlparse(src).path.lstrip("/"))
            # Exactly one output is expected to match; unpacking enforces it.
            (output,) = pkg.repo.find_outs_by_path(src)
            pkg.repo.fetch(output.stage.path)
            output.path_info = PathInfo(os.path.abspath(out))
            with output.repo.state:
                output.checkout()
        finally:
            # The directory may never have been created if an early step
            # failed; an unconditional rmtree would then raise from this
            # ``finally`` and hide the original error.
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
Beispiel #3
0
def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    """Check hashes of a sub-directory and a file inside a tracked directory.

    Before staging, ``fs.info`` reports no ``md5`` for either path; staging
    yields the expected hashes; once the file is removed from the workspace,
    ``fs.info`` reports its ``md5``.
    """
    subdir_md5 = "af314506f1622d107e0ed3f14ec1a3b5.dir"
    data_md5 = "8d777f385d3dfec8815d20f7496026dc"

    layout = {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    tmp_dir.dvc_gen(layout)

    fs = RepoFileSystem(dvc)
    subdir = PathInfo(tmp_dir) / "dir" / "subdir"
    data = subdir / "data"

    assert fs.info(subdir).get("md5") is None
    staged_subdir = stage(dvc.odb.local, subdir, fs)
    assert staged_subdir.hash_info == HashInfo("md5", subdir_md5)

    assert fs.info(data).get("md5") is None
    staged_data = stage(dvc.odb.local, data, fs)
    assert staged_data.hash_info == HashInfo("md5", data_md5)

    (tmp_dir / "dir" / "subdir" / "data").unlink()
    assert fs.info(data)["md5"] == data_md5
Beispiel #4
0
    def _get_dir_info_hash(self, dir_info):
        """Serialize ``dir_info`` to JSON, upload it and hash the uploaded file.

        Args:
            dir_info: directory listing; a dict is first converted with
                ``self._from_dict``. Entries are sorted by
                ``self.tree.PARAM_RELPATH`` before being dumped.

        Returns:
            tuple: ``(hash_info, to_info)`` where ``hash_info`` is the hash of
            the uploaded JSON file with ``CHECKSUM_DIR_SUFFIX`` appended and
            ``dir_info`` attached, and ``to_info`` is the upload destination.
        """
        import os
        import tempfile
        from contextlib import suppress

        from dvc.path_info import PathInfo
        from dvc.utils import tmp_fname

        # Sorting the list by path to ensure reproducibility
        if isinstance(dir_info, dict):
            dir_info = self._from_dict(dir_info)
        dir_info = sorted(dir_info, key=itemgetter(self.tree.PARAM_RELPATH))

        # Write through the handle returned by NamedTemporaryFile so that it
        # is closed deterministically instead of being left to the garbage
        # collector.
        fobj = tempfile.NamedTemporaryFile("w+", delete=False)
        try:
            with fobj:
                json.dump(dir_info, fobj, sort_keys=True)

            from_info = PathInfo(fobj.name)
            to_info = self.tree.path_info / tmp_fname("")
            self.tree.upload(from_info, to_info, no_progress_bar=True)
        finally:
            # The local copy is only a staging file; remove it so repeated
            # calls do not pile up files in the temp directory.
            # NOTE(review): assumes ``upload`` no longer needs the source
            # once it returns — confirm for all tree types.
            with suppress(FileNotFoundError):
                os.unlink(fobj.name)

        hash_info = self.tree.get_file_hash(to_info)
        hash_info.value += self.tree.CHECKSUM_DIR_SUFFIX
        hash_info.dir_info = self._to_dict(dir_info)

        return hash_info, to_info
Beispiel #5
0
def test_state(tmp_dir, dvc):
    """A saved md5 is returned by ``State.get`` until the file is recreated.

    After the file is unlinked and rewritten, ``get`` must return ``None``;
    saving the new md5 makes it retrievable again.
    """
    tmp_dir.gen("foo", "foo content")
    foo = tmp_dir / "foo"
    foo_info = PathInfo(foo)
    first_md5 = file_md5(foo)[0]

    state = State(dvc)

    with state:
        state.save(foo_info, first_md5)
        assert state.get(foo_info) == first_md5

        # Recreate the file with different content.
        foo.unlink()
        foo.write_text("1")

        assert state.get(foo_info) is None

        second_md5 = file_md5(foo)[0]
        state.save(foo_info, second_md5)

        assert state.get(foo_info) == second_md5
Beispiel #6
0
def test_state(dvc_repo, repo_dir):
    """A saved md5 is returned by ``State.get`` until the file is recreated.

    After the file is unlinked and rewritten, ``get`` must return ``None``;
    saving the new md5 makes it retrievable again.
    """
    foo = os.path.join(dvc_repo.root_dir, repo_dir.FOO)
    foo_info = PathInfo(foo)
    first_md5 = file_md5(foo)[0]

    state = State(dvc_repo, dvc_repo.config.config)

    with state:
        state.save(foo_info, first_md5)
        assert state.get(foo_info) == first_md5

        # Recreate the file with different content.
        os.unlink(foo)
        with open(foo, "a") as fobj:
            fobj.write("1")

        assert state.get(foo_info) is None

        second_md5 = file_md5(foo)[0]
        state.save(foo_info, second_md5)

        assert state.get(foo_info) == second_md5
Beispiel #7
0
def test_set(tmp_dir, dvc, value):
    """A ``set`` variable is interpolated into the other stage fields.

    The expected resolved stage carries the substituted ``cmd`` and
    ``always_changed`` values and has no ``set`` key.
    """
    build_template = {
        "set": {"item": value},
        "cmd": "python script.py --thresh ${item}",
        "always_changed": "${item}",
    }
    template = {"stages": {"build": build_template}}

    resolver = DataResolver(dvc, PathInfo(str(tmp_dir)), template)
    resolved = resolver.resolve()

    build_expected = {
        "cmd": f"python script.py --thresh {value}",
        "always_changed": value,
    }
    assert_stage_equal(resolved, {"stages": {"build": build_expected}})
Beispiel #8
0
    def _collect_graph(self, stages=None):
        """Build a directed dependency graph out of the given stages.

        Every stage becomes a node. An edge ``stage -> other`` is added
        whenever a dependency of ``stage`` overlaps with an output of
        ``other``, so edges point from a stage to the stage it depends on.

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): stages to build the graph from; when falsy,
                ``self.stages`` is used. Falsy entries are ignored.

        Returns:
            networkx.DiGraph: the resulting graph.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles (checked via
                ``check_acyclic``)
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        graph = nx.DiGraph()
        stages = [item for item in (stages or self.stages) if item]
        out_by_path = {}

        # Pass 1: no two outputs may share the same path.
        for stage in stages:
            for out in stage.outs:
                if out.path_info in out_by_path:
                    clashing = out_by_path[out.path_info].stage
                    raise OutputDuplicationError(str(out), [stage, clashing])
                out_by_path[out.path_info] = out

        # Pass 2: no output may live inside another output.
        for stage in stages:
            for out in stage.outs:
                for ancestor in out.path_info.parents:
                    if ancestor in out_by_path:
                        raise OverlappingOutputPathsError(
                            out_by_path[ancestor], out
                        )

        # Pass 3: a stage file may neither be an output nor sit inside one.
        for stage in stages:
            stage_info = PathInfo(stage.path)
            for candidate in chain([stage_info], stage_info.parents):
                if candidate in out_by_path:
                    raise StagePathAsOutputError(
                        stage, str(out_by_path[candidate])
                    )

        # Pass 4: nodes and dependency edges.
        for stage in stages:
            graph.add_node(stage)

            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                for out_info, out in out_by_path.items():
                    if not out_info.overlaps(dep.path_info):
                        continue
                    graph.add_node(out.stage)
                    graph.add_edge(stage, out.stage)

        check_acyclic(graph)

        return graph
Beispiel #9
0
 def walk_files(self, top, **kwargs):
     """Yield a ``PathInfo`` for every file reported by ``self.walk``.

     ``top`` and ``kwargs`` are passed to ``self.walk`` unchanged.
     """
     for dirpath, _dirs, filenames in self.walk(top, **kwargs):
         yield from (PathInfo(dirpath) / name for name in filenames)
Beispiel #10
0
 def cache_dir(self, value):
     """Set ``self.path_info`` from ``value``; a falsy value clears it."""
     if value:
         self.path_info = PathInfo(value)
     else:
         self.path_info = None
Beispiel #11
0
 def path_info(self):
     """Return ``self.tmp_dir.name`` wrapped in a ``PathInfo``."""
     tmp_name = self.tmp_dir.name
     return PathInfo(tmp_name)
Beispiel #12
0
def test_get_inode(tmp_dir):
    """``get_inode`` gives equal results for a ``str`` and a ``PathInfo``."""
    tmp_dir.gen("foo", "foo content")

    inode_from_str = get_inode("foo")
    inode_from_info = get_inode(PathInfo("foo"))
    assert inode_from_str == inode_from_info
Beispiel #13
0
def test_stage_fname(add):
    """The stage file name is the output's base name plus ``.dvc``."""
    out = mock.Mock(
        is_in_repo=False,
        path_info=PathInfo("path/to/out.txt"),
    )
    assert Stage._stage_fname([out], add) == "out.txt.dvc"
Beispiel #14
0
 def walk_files(self, top, **kwargs):  # pylint: disable=arguments-differ
     """Yield a ``PathInfo`` for each file found while walking ``top``.

     Extra keyword arguments are forwarded to ``self.walk``.
     """
     for dirpath, _, filenames in self.walk(top, **kwargs):
         for filename in filenames:
             parent = PathInfo(dirpath)
             yield parent / filename
Beispiel #15
0
    for root, dirs, files in tree.walk("dir"):
        for entry in dirs + files:
            actual.append(os.path.join(root, entry))

    assert set(actual) == set(expected)
    assert len(actual) == len(expected)


@pytest.mark.parametrize(
    "fetch,expected",
    [
        (False, []),
        (
            True,
            [
                PathInfo("dir") / "subdir1",
                PathInfo("dir") / "subdir2",
                PathInfo("dir") / "subdir1" / "foo1",
                PathInfo("dir") / "subdir1" / "bar1",
                PathInfo("dir") / "subdir2" / "foo2",
                PathInfo("dir") / "foo",
                PathInfo("dir") / "bar",
            ],
        ),
    ],
)
def test_walk_dir(tmp_dir, dvc, fetch, expected):
    tmp_dir.gen({
        "dir": {
            "subdir1": {
                "foo1": "foo1",
Beispiel #16
0
def test_path_info_as_posix(mocker, path, as_posix, osname):
    """``PathInfo.as_posix`` returns the expected string with ``os.name`` patched."""
    mocker.patch("os.name", osname)
    converted = PathInfo(path).as_posix()
    assert converted == as_posix
Beispiel #17
0
def test_get_inode(repo_dir):
    """``get_inode`` gives equal results for a ``str`` and a ``PathInfo``."""
    raw_path = repo_dir.FOO
    wrapped_path = PathInfo(raw_path)
    inode_from_str = get_inode(raw_path)
    inode_from_info = get_inode(wrapped_path)
    assert inode_from_str == inode_from_info
Beispiel #18
0
    def _resolve_params(self, context: Context, wdir):
        """Group the tracked keys of ``context`` by source file.

        Each source path in ``context.tracked`` is made relative to ``wdir``;
        keys of sources mapping to the same relative path are merged.

        Returns:
            list: one single-entry dict ``{relative_path: [keys]}`` per file.
        """
        keys_by_file = defaultdict(set)
        for source, tracked_keys in context.tracked.items():
            relative = PathInfo(source).relative_to(wdir)
            keys_by_file[str(relative)].update(tracked_keys)

        return [
            {fname: list(fkeys)} for fname, fkeys in keys_by_file.items()
        ]
Beispiel #19
0
    def graph(self, stages=None, from_directory=None):
        """Build the stage dependency graphs for the given stages.

        Nodes are stage paths relative to ``self.root_dir``; each node keeps
        its stage object in the ``stage`` attribute. An edge ``A -> B`` is
        added when a dependency of stage ``A`` equals, contains, or is
        contained in an output of stage ``B``, so edges point from a stage to
        the stage it depends on.

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if falsy, the stages
                returned by ``self.stages(from_directory, check_dag=False)``
                are used. Falsy entries are ignored.

            from_directory (str): directory passed to ``self.stages`` when
                ``stages`` is not given.

        Returns:
            tuple: ``(G, G_active)``; both hold every node, but ``G_active``
            only gets the dependency edges of stages that are not locked.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles (checked via
                ``self._check_cyclic_graph``)
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        full_graph = nx.DiGraph()
        active_graph = nx.DiGraph()
        stages = stages or self.stages(from_directory, check_dag=False)
        stages = [stage for stage in stages if stage]
        known_outs = []

        # Validate outputs against every output registered so far.
        for stage in stages:
            for out in stage.outs:
                same_path_stages = []
                for other in known_outs:
                    if other.path_info == out.path_info:
                        same_path_stages.append(other.stage)

                    nested = out.path_info.isin(other.path_info)
                    contains = other.path_info.isin(out.path_info)
                    if nested or contains:
                        raise OverlappingOutputPathsError(other, out)

                if same_path_stages:
                    relpaths = [stage.relpath, same_path_stages[0].relpath]
                    raise OutputDuplicationError(str(out), relpaths)

                known_outs.append(out)

        # A stage file must not live inside any output directory.
        for stage in stages:
            stage_info = PathInfo(stage.path)
            for out in known_outs:
                if stage_info.isin(out.path_info):
                    raise StagePathAsOutputError(stage.wdir, stage.relpath)

        # Nodes and dependency edges.
        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)

            full_graph.add_node(node, stage=stage)
            active_graph.add_node(node, stage=stage)

            for dep in stage.deps:
                for out in known_outs:
                    unrelated = (
                        out.path_info != dep.path_info
                        and not dep.path_info.isin(out.path_info)
                        and not out.path_info.isin(dep.path_info)
                    )
                    if unrelated:
                        continue

                    producer = out.stage
                    producer_node = os.path.relpath(
                        producer.path, self.root_dir
                    )
                    full_graph.add_node(producer_node, stage=producer)
                    full_graph.add_edge(node, producer_node)
                    if not stage.locked:
                        active_graph.add_node(producer_node, stage=producer)
                        active_graph.add_edge(node, producer_node)

        self._check_cyclic_graph(full_graph)

        return full_graph, active_graph
Beispiel #20
0
    dvc.add(str(tmp_dir / "dir" / "foo"))

    tree = RepoTree(dvc)
    assert tree.isdir("dir")
    assert not tree.isfile("dir")


@pytest.mark.parametrize(
    "dvcfiles,extra_expected",
    [
        (False, []),
        (
            True,
            [
                PathInfo("dir") / "subdir1" / "foo1.dvc",
                PathInfo("dir") / "subdir1" / "bar1.dvc",
                PathInfo("dir") / "subdir2" / "foo2.dvc",
            ],
        ),
    ],
)
def test_walk(tmp_dir, dvc, dvcfiles, extra_expected):
    tmp_dir.gen({
        "dir": {
            "subdir1": {
                "foo1": "foo1",
                "bar1": "bar1"
            },
            "subdir2": {
                "foo2": "foo2"
Beispiel #21
0
def test_path_isin_accepts_pathinfo():
    """``path_isin`` works when one argument is a ``PathInfo``."""
    folder = os.path.join("path", "to", "folder")
    folder_parent = PathInfo(folder) / ".."

    inside = path_isin(folder, folder_parent)
    assert inside
    reverse = path_isin(folder_parent, folder)
    assert not reverse
Beispiel #22
0
def test_subrepo_walk(tmp_dir, scm, dvc, dvcfiles, extra_expected):
    """Walking ``dir`` with ``subrepos=True`` lists entries of both subrepos.

    Two subrepos are created under ``dir`` next to a plain git-tracked file,
    and the walk result is compared (as strings) against the expected
    entries plus ``extra_expected``.
    """
    tmp_dir.scm_gen(
        {"dir": {"repo.txt": "file to confuse RepoTree"}},
        commit="dir/repo.txt",
    )

    subrepo1 = tmp_dir / "dir" / "repo"
    subrepo2 = tmp_dir / "dir" / "repo2"

    for subrepo in [subrepo1, subrepo2]:
        make_subrepo(subrepo, scm)

    subrepo1.dvc_gen({"foo": "foo", "dir1": {"bar": "bar"}}, commit="FOO")
    subrepo2.dvc_gen(
        {"lorem": "lorem", "dir2": {"ipsum": "ipsum"}}, commit="BAR"
    )

    # using tree that does not have dvcignore
    dvc.tree._reset()
    tree = RepoTree(dvc, subrepos=True, fetch=True)

    top = PathInfo("dir")
    repo = top / "repo"
    repo2 = top / "repo2"
    expected = [
        repo,
        top / "repo.txt",
        repo2,
        repo / ".dvcignore",
        repo / ".gitignore",
        repo / "foo",
        repo / "dir1",
        repo / "dir1" / "bar",
        repo2 / ".dvcignore",
        repo2 / ".gitignore",
        repo2 / "lorem",
        repo2 / "dir2",
        repo2 / "dir2" / "ipsum",
    ]

    actual = []
    for root, dirs, files in tree.walk("dir", dvcfiles=dvcfiles):
        actual.extend(os.path.join(root, entry) for entry in dirs + files)

    expected = [str(path) for path in expected + extra_expected]
    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
Beispiel #23
0
    def walk_files(self, path_info):
        """Yield a ``PathInfo`` for each file the repo tree walks under ``path_info``.

        Asserts that ``self.repo.tree`` satisfies ``is_working_tree``.
        """
        assert is_working_tree(self.repo.tree)

        yield from map(PathInfo, self.repo.tree.walk_files(path_info))
Beispiel #24
0
def test_file_md5(repo_dir):
    """``file_md5`` gives equal results for a ``str`` and a ``PathInfo``."""
    raw_name = repo_dir.FOO
    wrapped_name = PathInfo(raw_name)
    md5_from_str = file_md5(raw_name)
    md5_from_info = file_md5(wrapped_name)
    assert md5_from_str == md5_from_info
Beispiel #25
0
    def _collect_graph(self, stages):
        """Build a directed dependency graph out of the given stages.

        The nodes of the graph are the stages themselves. An edge
        ``stage -> other`` is created when a dependency of ``stage`` overlaps
        with an output of ``other``, so edges point from a stage to the stage
        it depends on.

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph; when falsy, ``self.stages``
                is used. Falsy entries are dropped before any processing.

        Returns:
            networkx.DiGraph: the resulting graph.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles (checked via
                ``check_acyclic``)
        """
        import networkx as nx
        from pygtrie import Trie

        from dvc.exceptions import (
            OutputDuplicationError,
            OverlappingOutputPathsError,
            StagePathAsOutputError,
        )

        G = nx.DiGraph()
        # Drop falsy entries once, up front, so that every pass below works
        # on the same list. Previously only the first loop filtered them,
        # leaving later loops to dereference a falsy stage and add it as a
        # graph node.
        stages = [stage for stage in (stages or self.stages) if stage]
        outs = Trie()  # Use trie to efficiently find overlapping outs and deps

        for stage in stages:
            for out in stage.outs:
                out_key = out.path_info.parts

                # Check for dup outs
                if out_key in outs:
                    dup_stages = [stage, outs[out_key].stage]
                    raise OutputDuplicationError(str(out), dup_stages)

                # Check for overlapping outs
                if outs.has_subtrie(out_key):
                    # An already registered out lives below the new one.
                    parent = out
                    overlapping = first(outs.values(prefix=out_key))
                else:
                    # The new out may live below an already registered one.
                    parent = outs.shortest_prefix(out_key).value
                    overlapping = out
                if parent and overlapping:
                    msg = ("Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                           "overlap. To avoid unpredictable behaviour, "
                           "rerun command with non overlapping outs paths."
                           ).format(
                               str(parent),
                               parent.stage.addressing,
                               str(overlapping),
                               overlapping.stage.addressing,
                           )
                    raise OverlappingOutputPathsError(parent, overlapping, msg)

                outs[out_key] = out

        # A stage file must not be an output or live inside one.
        for stage in stages:
            out = outs.shortest_prefix(PathInfo(stage.path).parts).value
            if out:
                raise StagePathAsOutputError(stage, str(out))

        # Building graph
        G.add_nodes_from(stages)
        for stage in stages:
            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                dep_key = dep.path_info.parts
                # Outs that are the dep itself or one of its parents ...
                overlapping = [n.value for n in outs.prefixes(dep_key)]
                # ... plus outs located at or below the dep.
                if outs.has_subtrie(dep_key):
                    overlapping.extend(outs.values(prefix=dep_key))

                G.add_edges_from((stage, out.stage) for out in overlapping)
        check_acyclic(G)

        return G
def test_simple(tmp_dir, dvc):
    """Resolving the templated data with the params file gives the resolved data."""
    dump_yaml(tmp_dir / DEFAULT_PARAMS_FILE, CONTEXT_DATA)

    wdir = PathInfo(str(tmp_dir))
    template = deepcopy(TEMPLATED_DVC_YAML_DATA)
    resolver = DataResolver(dvc, wdir, template)

    resolved = resolver.resolve()
    expected = deepcopy(RESOLVED_DVC_YAML_DATA)
    assert_stage_equal(resolved, expected)
Beispiel #27
0
 def unprotect(self, target):
     """Call ``unprotect`` on the local cache with ``target`` as a ``PathInfo``."""
     local_cache = self.cache.local
     return local_cache.unprotect(PathInfo(target))
def test_no_params_yaml_and_vars(tmp_dir, dvc):
    """Resolving the templated data without a params file raises ``ResolveError``."""
    wdir = PathInfo(str(tmp_dir))
    template = deepcopy(TEMPLATED_DVC_YAML_DATA)
    resolver = DataResolver(dvc, wdir, template)

    with pytest.raises(ResolveError):
        resolver.resolve()
Beispiel #29
0
 def walk_files(self, path_info, **kwargs):
     """Yield a ``PathInfo`` for every file found by ``self.walk(path_info)``.

     ``kwargs`` is accepted but not passed on to ``self.walk``.
     """
     for root, _dirs, files in self.walk(path_info):
         # NOTE: the path is built with an f-string because os.path.join
         # is ~5.5 times slower
         yield from (PathInfo(f"{root}{os.sep}{file}") for file in files)
Beispiel #30
0
 def walk_files(self, path_info):
     """Yield a ``PathInfo`` for each name produced by the module-level ``walk_files``.

     The walk is given ``self.repo.dvcignore`` as its second argument.
     """
     found = walk_files(path_info, self.repo.dvcignore)
     yield from (PathInfo(name) for name in found)