Exemple #1
0
def test_walk_dirty(tmp_dir, dvc):
    """Walk a DVC-tracked dir whose workspace diverged after tracking.

    After the directory is tracked, an extra file and subdirectory are
    created and the tracked ``foo`` is deleted. The expected walk output
    contains the new entries and does not contain ``foo``.
    """
    tracked = {
        "dir": {
            "foo": "foo",
            "subdir1": {"foo1": "foo1", "bar1": "bar1"},
            "subdir2": {"foo2": "foo2"},
        }
    }
    tmp_dir.dvc_gen(tracked)

    # Make the workspace dirty: add new entries, drop a tracked file.
    tmp_dir.gen({"dir": {"bar": "bar", "subdir3": {"foo3": "foo3"}}})
    (tmp_dir / "dir" / "foo").unlink()

    repo_tree = RepoTree(dvc)
    base = PathInfo("dir")
    expected = [
        str(path)
        for path in (
            base / "subdir1",
            base / "subdir2",
            base / "subdir3",
            base / "subdir1" / "foo1",
            base / "subdir1" / "bar1",
            base / "subdir2" / "foo2",
            base / "subdir3" / "foo3",
            base / "bar",
        )
    ]

    actual = [
        os.path.join(root, entry)
        for root, dirs, files in repo_tree.walk("dir")
        for entry in dirs + files
    ]

    # Same members and same count, i.e. no duplicates in the walk output.
    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
Exemple #2
0
def test_walk_mixed_dir(tmp_dir, scm, dvc):
    """Walk a directory holding one DVC-added file and Git-committed files.

    ``foo.dvc`` is committed to Git but is not part of the expected walk
    output.
    """
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    tmp_dir.dvc.add(os.path.join("dir", "foo"))

    git_paths = [
        os.path.join("dir", name) for name in ("bar", ".gitignore", "foo.dvc")
    ]
    tmp_dir.scm.add(git_paths)
    tmp_dir.scm.commit("add dir")

    repo_tree = RepoTree(dvc)

    expected = [
        str(PathInfo("dir") / name) for name in ("foo", "bar", ".gitignore")
    ]
    actual = [
        os.path.join(root, entry)
        for root, dirs, files in repo_tree.walk("dir")
        for entry in dirs + files
    ]

    # Same members and same count, i.e. no duplicates in the walk output.
    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
Exemple #3
0
def _collect_plots(repo, targets=None, rev=None):
    """Map plot path infos to their plot properties.

    Without ``targets``, every stage output flagged as a plot is returned.
    With ``targets``, only plot outputs whose path matches a target are
    returned. Any remaining target that the repo tree reports as a file is
    included with empty properties; the others are logged as not found at
    ``rev`` and skipped.
    """
    all_plots = {
        out for stage in repo.stages for out in stage.outs if out.plot
    }

    def _props_by_path(outs):
        return {out.path_info: _plot_props(out) for out in outs}

    if not targets:
        return _props_by_path(all_plots)

    # Targets not (yet) matched against a known plot output.
    unmatched = {PathInfo(os.path.abspath(target)) for target in targets}

    matched = set()
    for plot in all_plots:
        if plot.path_info not in unmatched:
            continue
        matched.add(plot)
        unmatched.remove(plot.path_info)

    tree = RepoTree(repo)
    result = _props_by_path(matched)

    for info in unmatched:
        if not tree.isfile(info):
            logger.warning(
                "'%s' was not found at: '%s'. It will not be plotted.",
                info,
                rev,
            )
            continue
        # A plain file that is not a declared plot: no props.
        result[info] = {}

    return result
Exemple #4
0
def _collect_paths(
    repo: Repo,
    targets: Iterable[str],
    recursive: bool = False,
    rev: str = None,
):
    """Resolve ``targets`` to the set of path infos present in the repo tree.

    Each target is made absolute. With ``recursive``, files found under a
    directory target are included as well. A target that does not exist is
    dropped; a warning mentioning ``rev`` is logged for it only when
    ``recursive`` is false.
    """
    requested = {PathInfo(os.path.abspath(target)) for target in targets}
    tree = RepoTree(repo)

    collected = set()
    for info in requested:
        if recursive and tree.isdir(info):
            collected.update(tree.walk_files(info))

        if tree.exists(info):
            collected.add(info)
        elif not recursive:
            logger.warning(
                "'%s' was not found at: '%s'.",
                info,
                rev,
            )
    return collected
Exemple #5
0
    def collect(self, targets=None, revs=None):
        """Gather props and raw data of plots for each revision.

        Returns a structure like:
            {rev: {plots.csv: {
                props: {x: ..., "header": ..., ...},
                data: "...data as a string...",
            }}}
        Parsing of the data is deferred, since it depends on props.
        """
        if isinstance(targets, str):
            targets = [targets]
        else:
            targets = targets or []

        collected = {}
        for rev in self.repo.brancher(revs=revs):
            # .brancher() adds the workspace, which is unwanted when
            # explicit revs were requested
            if revs is not None and rev not in revs:
                continue
            rev = rev or "workspace"

            tree = RepoTree(self.repo)
            plots = _collect_plots(self.repo, targets, rev)
            for path_info, props in plots.items():
                datafile = relpath(path_info, self.repo.root_dir)
                entry = {"props": props}
                collected.setdefault(rev, {})[datafile] = entry

                # Load data from git or dvc cache
                try:
                    with tree.open(path_info) as fd:
                        entry["data"] = fd.read()
                except FileNotFoundError:
                    # This might happen simply because cache is absent
                    pass

        return collected
Exemple #6
0
def test_exists(tmp_dir, dvc):
    """A DVC-added file still exists in the tree after workspace deletion."""
    tmp_dir.gen("foo", "foo")
    dvc.add("foo")
    workspace_copy = tmp_dir / "foo"
    workspace_copy.unlink()

    repo_tree = RepoTree(dvc)
    assert repo_tree.exists("foo")
Exemple #7
0
def repo_tree(temp_repo):
    """Yield a ``RepoTree(fetch=True)`` over a repo with Git and DVC content.

    The Git-generated content is committed first ("repo init"), then the
    DVC-generated content ("use dvc").
    """
    utils_package = {
        "__init__.py": "",
        "serve_model.py": "# this will serve a model `soon`",
    }
    git_content = {
        # mixed dvc + git directory
        "models": {
            "train.py": "train dot py",
            "test.py": "test dot py",
        },
        # file
        "README.md": "my little project",
        # repo-only directory
        "src": {"utils": utils_package},
    }

    raw_files = {
        "raw-1.csv": "one, dot, csv",
        "raw-2.csv": "two, dot, csv",
    }
    processed_files = {
        "processed-1.csv": "1, dot, csv",
        "processed-2.csv": "2, dot, csv",
    }
    dvc_content = {
        # dvc only directory
        "data": {"raw": raw_files, "processed": processed_files},
        # file
        "models/transform.pickle": "model model",
    }

    temp_repo.scm_gen(git_content, commit="repo init")
    temp_repo.dvc_gen(dvc_content, commit="use dvc")

    tree = RepoTree(temp_repo.dvc, fetch=True)
    yield tree
Exemple #8
0
def test_walk(tmp_dir, dvc, dvcfiles, extra_expected):
    """Walk a dir with recursively DVC-added subdirs plus untracked files.

    ``dvcfiles`` is forwarded to ``walk``; ``extra_expected`` lists the
    additional entries expected for that setting.
    """
    tracked = {
        "dir": {
            "subdir1": {"foo1": "foo1", "bar1": "bar1"},
            "subdir2": {"foo2": "foo2"},
        }
    }
    tmp_dir.gen(tracked)
    dvc.add(str(tmp_dir / "dir"), recursive=True)
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    repo_tree = RepoTree(dvc)

    base = PathInfo("dir")
    always_expected = [
        base / "subdir1",
        base / "subdir2",
        base / "subdir1" / "foo1",
        base / "subdir1" / "bar1",
        base / "subdir2" / "foo2",
        base / "foo",
        base / "bar",
    ]
    expected = [str(path) for path in always_expected + extra_expected]

    actual = [
        os.path.join(root, entry)
        for root, dirs, files in repo_tree.walk("dir", dvcfiles=dvcfiles)
        for entry in dirs + files
    ]

    # Same members and same count, i.e. no duplicates in the walk output.
    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
Exemple #9
0
def test_exists_isdir_isfile_dirty(tmp_dir, dvc):
    """Check exists/isfile/isdir when the workspace diverges from DVC data.

    First the tracked file and directory are removed from the workspace;
    then they are recreated with swapped kinds (a file where the directory
    was, a directory where the file was).
    """
    tmp_dir.dvc_gen(
        {"datafile": "data", "datadir": {"foo": "foo", "bar": "bar"}}
    )

    repo_tree = RepoTree(dvc)
    shutil.rmtree(tmp_dir / "datadir")
    (tmp_dir / "datafile").unlink()

    base = PathInfo(tmp_dir)
    datafile = base / "datafile"
    datadir = base / "datadir"
    datadir_foo = datadir / "foo"
    datafile_foo = datafile / "foo"

    # Workspace copies removed: answers match the tracked layout.
    assert repo_tree.exists(datafile)
    assert repo_tree.exists(datadir)
    assert repo_tree.exists(datadir_foo)
    assert repo_tree.isfile(datafile)
    assert not repo_tree.isfile(datadir)
    assert repo_tree.isfile(datadir_foo)
    assert not repo_tree.isdir(datafile)
    assert repo_tree.isdir(datadir)
    assert not repo_tree.isdir(datadir_foo)

    # NOTE: creating file instead of dir and dir instead of file
    tmp_dir.gen({"datadir": "data", "datafile": {"foo": "foo", "bar": "bar"}})
    assert repo_tree.exists(datafile)
    assert repo_tree.exists(datadir)
    assert not repo_tree.exists(datadir_foo)
    assert repo_tree.exists(datafile_foo)
    assert not repo_tree.isfile(datafile)
    assert repo_tree.isfile(datadir)
    assert not repo_tree.isfile(datadir_foo)
    assert repo_tree.isfile(datafile_foo)
    assert repo_tree.isdir(datafile)
    assert not repo_tree.isdir(datadir)
    assert not repo_tree.isdir(datadir_foo)
    assert not repo_tree.isdir(datafile_foo)
Exemple #10
0
def _output_paths(repo):
    """Yield ``(path, checksum)`` pairs for the outputs of every stage.

    When ``repo.tree`` is a ``LocalTree``, outputs whose ``exists`` is falsy
    are skipped and the checksum is computed through the local cache tree;
    otherwise the recorded ``hash_info`` value is used. Directory outputs
    get a trailing separator on their path and are followed by the pairs
    produced by ``_dir_output_paths``.
    """
    repo_tree = RepoTree(repo, stream=True)
    on_working_tree = isinstance(repo.tree, LocalTree)

    for stage in repo.stages:
        for output in stage.outs:
            if on_working_tree and not output.exists:
                continue

            if output.is_dir_checksum:
                # joining with "" appends the trailing path separator
                path = os.path.join(str(output), "")
            else:
                path = str(output)

            if on_working_tree:
                hash_info = repo.cache.local.tree.get_hash(output.path_info)
                checksum = hash_info.value
            else:
                checksum = output.hash_info.value

            yield path, checksum
            if output.is_dir_checksum:
                yield from _dir_output_paths(repo_tree, output)
Exemple #11
0
def _collect_metrics(repo, targets, recursive):
    """Return a list of path infos for metrics.

    Without ``targets``, the path infos of all stage outputs flagged as
    metrics are returned (de-duplicated). With ``targets``, the result holds
    the targets that the repo tree reports as files, followed, when
    ``recursive`` is set, by the files found under directory targets.
    """
    if not targets:
        metrics = {
            out.path_info
            for stage in repo.stages
            for out in stage.outs
            if out.metric
        }
        return list(metrics)

    target_infos = [PathInfo(os.path.abspath(target)) for target in targets]
    tree = RepoTree(repo)

    nested_files = []
    if recursive:
        for info in target_infos:
            if tree.isdir(info):
                nested_files.extend(tree.walk_files(info))

    direct_files = [info for info in target_infos if tree.isfile(info)]
    return direct_files + nested_files
Exemple #12
0
def _output_paths(repo, targets):
    """Yield ``(path, checksum)`` pairs for stage outputs matching targets.

    ``targets`` of ``None`` selects every output. Otherwise an output is
    selected when its path is inside or equal to one of the targets. A
    directory output is additionally expanded through ``_dir_output_paths``
    when it is selected or when some target lies inside it.

    When ``repo.tree`` is a ``LocalTree``, outputs whose ``exists`` is falsy
    are skipped and checksums come from the local cache tree; otherwise the
    recorded ``hash_info`` value is used.
    """
    repo_tree = RepoTree(repo, stream=True)
    on_working_tree = isinstance(repo.tree, LocalTree)

    for stage in repo.stages:
        for output in stage.outs:
            if on_working_tree and not output.exists:
                continue

            selected = targets is None or any(
                output.path_info.isin_or_eq(target) for target in targets
            )

            if selected:
                if output.is_dir_checksum:
                    # joining with "" appends the trailing path separator
                    path = os.path.join(str(output), "")
                else:
                    path = str(output)

                if on_working_tree:
                    checksum = repo.cache.local.tree.get_hash(
                        output.path_info
                    ).value
                else:
                    checksum = output.hash_info.value

                yield path, checksum

            if not output.is_dir_checksum:
                continue

            # ``selected`` is always true when targets is None, so the
            # iteration over targets below never runs on None.
            if selected or any(
                target.isin(output.path_info) for target in targets
            ):
                yield from _dir_output_paths(repo_tree, output, targets)
Exemple #13
0
def test_open_dirty_hash(tmp_dir, dvc):
    """Opening a tracked file modified in the workspace reads the new text."""
    tmp_dir.dvc_gen("file", "file")
    workspace_file = tmp_dir / "file"
    workspace_file.write_text("something")

    repo_tree = RepoTree(dvc)
    with repo_tree.open("file", "r") as stream:
        content = stream.read()
    assert content == "something"
Exemple #14
0
def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    """Check which hashing path ``get_hash`` takes for a tracked directory.

    While the directory is in the workspace, ``get_file_hash`` on the repo
    tree is called and the DVC tree's ``get_dir_hash`` is not. After the
    directory is removed, the same hash is obtained with the roles reversed.
    """
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    repo_tree = RepoTree(dvc)
    file_hash_spy = mocker.spy(repo_tree, "get_file_hash")
    dir_hash_spy = mocker.spy(
        repo_tree._dvctrees[dvc.root_dir], "get_dir_hash"
    )

    dir_info = PathInfo(tmp_dir) / "dir"
    expected = HashInfo("md5", "8761c4e9acad696bee718615e23e22db.dir")

    with dvc.state:
        assert repo_tree.get_hash(dir_info) == expected
    assert file_hash_spy.called
    assert not dir_hash_spy.called
    file_hash_spy.reset_mock()

    shutil.rmtree(tmp_dir / "dir")
    with dvc.state:
        assert repo_tree.get_hash(dir_info) == expected
    assert not file_hash_spy.called
    assert dir_hash_spy.called
Exemple #15
0
def test_open_dirty_no_hash(tmp_dir, dvc):
    """Open a workspace file whose hand-written .dvc file records no hash."""
    tmp_dir.gen("file", "file")
    dvcfile = tmp_dir / "file.dvc"
    dvcfile.write_text("outs:\n- path: file\n")

    repo_tree = RepoTree(dvc)
    with repo_tree.open("file", "r") as stream:
        content = stream.read()
    assert content == "file"
Exemple #16
0
def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, local_cloud):
    """Save a directory from an external repo's tree into the local cache.

    Afterwards, the cache path for each expected file hash must exist.
    """
    with erepo_dir.chdir():
        erepo_dir.gen({"dir": {"subdir": {"foo": "foo"}, "bar": "bar"}})
        erepo_dir.dvc_add("dir/subdir", commit="subdir")
        erepo_dir.scm_add("dir", commit="dir")
        erepo_dir.add_remote(config=local_cloud.config)
        erepo_dir.dvc.push()

    # test only cares that either fetch or stream are set so that DVC dirs are
    # walked.
    #
    # for this test, all file objects are being opened() and copied from tree
    # into dvc.cache, not fetched or streamed from a remote
    repo_tree = RepoTree(erepo_dir.dvc, stream=True)
    expected_hashes = []
    for rel_path in ("dir/bar", "dir/subdir/foo"):
        file_hash = repo_tree.get_file_hash(PathInfo(erepo_dir / rel_path))
        expected_hashes.append(file_hash.value)

    local_cache = dvc.cache.local
    dir_info = PathInfo(erepo_dir / "dir")
    dir_hash = local_cache.tree.get_hash(dir_info)
    local_cache.save(dir_info, repo_tree, dir_hash)

    for value in expected_hashes:
        cached_path = local_cache.tree.hash_to_path_info(value)
        assert os.path.exists(cached_path)
Exemple #17
0
def _collect_paths(
    repo: "Repo",
    targets: Iterable[str],
    recursive: bool = False,
    rev: str = None,
):
    """Resolve ``targets`` to a list of path infos present in the repo tree.

    Each target is made absolute. With ``recursive``, files found under a
    directory target are listed before the directory itself. A target that
    does not exist is dropped; unless ``recursive`` is set, a warning is
    logged for it, worded for the workspace when ``rev`` is "workspace" or
    empty and for the given ``rev`` otherwise.
    """
    from dvc.tree.repo import RepoTree

    requested = [PathInfo(os.path.abspath(target)) for target in targets]
    tree = RepoTree(repo)

    collected = []
    for info in requested:
        if recursive and tree.isdir(info):
            collected.extend(tree.walk_files(info))

        if tree.exists(info):
            collected.append(info)
            continue

        if recursive:
            # missing targets are skipped silently in recursive mode
            continue

        if rev in ("workspace", ""):
            logger.warning(
                "'%s' was not found in current workspace.", info,
            )
        else:
            logger.warning(
                "'%s' was not found at: '%s'.", info, rev,
            )
    return collected
Exemple #18
0
def _filter_missing(repo, paths):
    """Yield each path whose first DVC output reports "not in cache".

    For every path, metadata is looked up in a streaming ``RepoTree``.
    Paths that are not DVC-tracked are skipped. For tracked paths, only the
    first entry of ``metadata.outs`` is consulted.
    """
    repo_tree = RepoTree(repo, stream=True)
    for path in paths:
        metadata = repo_tree.metadata(path)
        if not metadata.is_dvc:
            continue

        out = metadata.outs[0]
        # Use ``.get`` instead of indexing: the status mapping may have no
        # entry for this output (presumably when there is nothing to
        # report for it), and indexing would then raise KeyError instead of
        # treating the output as "not missing".
        if out.status().get(str(out)) == "not in cache":
            yield path
Exemple #19
0
def test_get_hash_dirty_file(tmp_dir, dvc):
    """Check ``get_hash`` for a tracked file rewritten in the workspace."""
    tmp_dir.dvc_gen("file", "file")
    workspace_file = tmp_dir / "file"
    workspace_file.write_text("something")

    repo_tree = RepoTree(dvc)
    file_info = PathInfo(tmp_dir) / "file"
    assert repo_tree.get_hash(file_info) == HashInfo(
        "md5", "8c7dd922ad47494fc02c388e12c00eac"
    )
Exemple #20
0
def test_open(tmp_dir, dvc):
    """A DVC-added file can still be read after workspace deletion."""
    tmp_dir.gen("foo", "foo")
    dvc.add("foo")
    workspace_copy = tmp_dir / "foo"
    workspace_copy.unlink()

    repo_tree = RepoTree(dvc)
    with repo_tree.open("foo", "r") as stream:
        content = stream.read()
    assert content == "foo"
Exemple #21
0
def test_isdir_mixed(tmp_dir, dvc):
    """A dir with one DVC-added file and one plain file is a dir, not a file."""
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})

    tracked_file = tmp_dir / "dir" / "foo"
    dvc.add(str(tracked_file))

    repo_tree = RepoTree(dvc)
    assert repo_tree.isdir("dir")
    assert not repo_tree.isfile("dir")
Exemple #22
0
def test_get_hash_cached_file(tmp_dir, dvc, mocker):
    """``get_hash`` on a tracked file calls the DVC tree's ``get_file_hash``."""
    tmp_dir.dvc_gen({"foo": "foo"})
    repo_tree = RepoTree(dvc)
    dvc_tree = repo_tree._dvctrees[dvc.root_dir]
    file_hash_spy = mocker.spy(dvc_tree, "get_file_hash")

    expected = HashInfo("md5", "acbd18db4cc2f85cedef654fccc4a4d8")
    actual = repo_tree.get_hash(PathInfo(tmp_dir) / "foo")
    assert actual == expected
    assert file_hash_spy.called
Exemple #23
0
 def _get_tree_for(self, repo, **kwargs):
     """
     Provides a combined tree of a single repo with dvc + git/local tree.

     Options come from ``self.tree_confs`` overridden by ``kwargs``;
     ``fetch`` defaults to True when neither source sets it.
     """
     options = {**self.tree_confs, **kwargs}
     options.setdefault("fetch", True)
     return RepoTree(repo, **options)
Exemple #24
0
 def open_by_relpath(self, path, mode="r", encoding=None, **kwargs):
     """Opens a specified resource as a file object.

     Yields the file object opened through a ``RepoTree`` built on this
     repo; ``mode``, ``encoding`` and any extra keyword arguments are
     forwarded to ``tree.open``.

     Raises:
         PathMissingError: when a ``FileNotFoundError`` is raised while
             opening the file or while the yielded object is in use. The
             original error is attached as the explicit cause.

     NOTE(review): this is a generator, presumably wrapped by a
     context-manager decorator outside this view -- confirm.
     """
     tree = RepoTree(self)
     try:
         with tree.open(path, mode=mode, encoding=encoding,
                        **kwargs) as fobj:
             yield fobj
     except FileNotFoundError as exc:
         # Chain explicitly so the traceback presents the low-level error
         # as the direct cause rather than as an error raised while
         # handling another exception.
         raise PathMissingError(path, self.url) from exc
Exemple #25
0
def test_get_hash_dirty_dir(tmp_dir, dvc):
    """Check ``get_hash`` for a tracked dir that gained an extra file."""
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    extra_file = tmp_dir / "dir" / "baz"
    extra_file.write_text("baz")

    repo_tree = RepoTree(dvc)
    dir_hash = repo_tree.get_hash(PathInfo(tmp_dir) / "dir")

    assert dir_hash == HashInfo("md5", "ba75a2162ca9c29acecb7957105a0bc2.dir")
    # two tracked files plus the one added afterwards
    assert dir_hash.dir_info.nfiles == 3
Exemple #26
0
    def repo_tree(self):
        """Build a ``RepoTree`` for this repo.

        Passes along ``self.subrepos``, uses ``self.make_repo`` as the repo
        factory and forwards ``self.tree_confs`` as extra keyword arguments.
        """
        from dvc.tree.repo import RepoTree

        subrepos = self.subrepos
        factory = self.make_repo
        return RepoTree(
            self, subrepos=subrepos, repo_factory=factory, **self.tree_confs
        )
Exemple #27
0
def test_isdvc(tmp_dir, dvc):
    """Check ``isdvc`` for added paths, a plain file and a nested file.

    A file inside an added directory only counts as DVC-tracked when
    ``recursive=True`` is passed.
    """
    tmp_dir.gen({"foo": "foo", "bar": "bar", "dir": {"baz": "baz"}})
    for target in ("foo", "dir"):
        dvc.add(target)

    repo_tree = RepoTree(dvc)
    assert repo_tree.isdvc("foo")
    assert not repo_tree.isdvc("bar")
    assert repo_tree.isdvc("dir")
    assert not repo_tree.isdvc("dir/baz")
    assert repo_tree.isdvc("dir/baz", recursive=True)
Exemple #28
0
    def collect(
        self,
        targets: List[str] = None,
        revs: List[str] = None,
        recursive: bool = False,
    ) -> Dict[str, Dict]:
        """Gather props and raw data of plots for each revision.

        Returns a structure like:
            {rev: {plots.csv: {
                props: {x: ..., "header": ..., ...},
                data: "...data as a string...",
            }}}
        Parsing of the data is deferred, since it depends on props.

        A plot path that is a directory is expanded to the files under it;
        each of those files gets the directory's props.
        """
        from dvc.tree.repo import RepoTree

        if isinstance(targets, str):
            targets = [targets]
        else:
            targets = targets or []

        collected = {}
        for rev in self.repo.brancher(revs=revs):
            # .brancher() adds the workspace, which is unwanted when
            # explicit revs were requested
            if revs is not None and rev not in revs:
                continue
            rev = rev or "workspace"

            tree = RepoTree(self.repo)
            plots = _collect_plots(self.repo, targets, rev, recursive)
            for path_info, props in plots.items():
                rev_data = collected.setdefault(rev, {})

                if tree.isdir(path_info):
                    sources = list(tree.walk_files(path_info))
                else:
                    sources = [path_info]

                for source in sources:
                    repo_path = relpath(source, self.repo.root_dir)
                    entry = {"props": props}
                    rev_data[repo_path] = entry

                    # Load data from git or dvc cache
                    try:
                        with tree.open(source) as fd:
                            entry["data"] = fd.read()
                    except FileNotFoundError:
                        # This might happen simply because cache is absent
                        pass

        return collected
Exemple #29
0
def _filter_missing(repo, paths):
    """Yield each path whose first DVC output reports "not in cache".

    Paths that are not DVC-tracked are skipped, and so are paths for which
    a ``FileNotFoundError`` is raised during the lookup. Only the first
    entry of ``metadata.outs`` is consulted.
    """
    repo_tree = RepoTree(repo, stream=True)
    for path in paths:
        try:
            meta = repo_tree.metadata(path)
            if not meta.is_dvc:
                continue
            first_out = meta.outs[0]
            state = first_out.status().get(str(first_out))
            if state == "not in cache":
                yield path
        except FileNotFoundError:
            continue
Exemple #30
0
 def _collect_input(self, executor: ExperimentExecutor):
     """Copy (upload) input from the experiments workspace to the executor
     tree.

     The files to transfer are those returned by
     ``executor.collect_files`` for the experiments workspace tree and a
     ``RepoTree`` built on ``self.exp_dvc``.
     """
     logger.debug("Collecting input for '%s'", executor.tmp_dir)
     repo_tree = RepoTree(self.exp_dvc)

     dest_tree = executor.tree
     src_tree = self.exp_dvc.tree
     files = executor.collect_files(self.exp_dvc.tree, repo_tree)
     self._process(dest_tree, src_tree, files)