def test_walk_dirty(tmp_dir, dvc):
    tmp_dir.dvc_gen(
        {
            "dir": {
                "foo": "foo",
                "subdir1": {"foo1": "foo1", "bar1": "bar1"},
                "subdir2": {"foo2": "foo2"},
            }
        }
    )
    tmp_dir.gen({"dir": {"bar": "bar", "subdir3": {"foo3": "foo3"}}})
    (tmp_dir / "dir" / "foo").unlink()

    tree = RepoTree(dvc)
    expected = [
        PathInfo("dir") / "subdir1",
        PathInfo("dir") / "subdir2",
        PathInfo("dir") / "subdir3",
        PathInfo("dir") / "subdir1" / "foo1",
        PathInfo("dir") / "subdir1" / "bar1",
        PathInfo("dir") / "subdir2" / "foo2",
        PathInfo("dir") / "subdir3" / "foo3",
        PathInfo("dir") / "bar",
    ]

    actual = []
    for root, dirs, files in tree.walk("dir"):
        for entry in dirs + files:
            actual.append(os.path.join(root, entry))

    expected = [str(path) for path in expected]
    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
def test_walk_mixed_dir(tmp_dir, scm, dvc):
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    tmp_dir.dvc.add(os.path.join("dir", "foo"))
    tmp_dir.scm.add(
        [
            os.path.join("dir", "bar"),
            os.path.join("dir", ".gitignore"),
            os.path.join("dir", "foo.dvc"),
        ]
    )
    tmp_dir.scm.commit("add dir")

    tree = RepoTree(dvc)
    expected = [
        str(PathInfo("dir") / "foo"),
        str(PathInfo("dir") / "bar"),
        str(PathInfo("dir") / ".gitignore"),
    ]

    actual = []
    for root, dirs, files in tree.walk("dir"):
        for entry in dirs + files:
            actual.append(os.path.join(root, entry))

    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
def _collect_plots(repo, targets=None, rev=None):
    plots = {out for stage in repo.stages for out in stage.outs if out.plot}

    def to_result(plots):
        return {plot.path_info: _plot_props(plot) for plot in plots}

    if not targets:
        return to_result(plots)

    target_infos = {PathInfo(os.path.abspath(target)) for target in targets}

    target_plots = set()
    for p in plots:
        if p.path_info in target_infos:
            target_plots.add(p)
            target_infos.remove(p.path_info)

    tree = RepoTree(repo)
    result = to_result(target_plots)

    for t in target_infos:
        if tree.isfile(t):
            result[t] = {}
        else:
            logger.warning(
                "'%s' was not found at: '%s'. It will not be plotted.", t, rev,
            )

    return result
def test_open_dirty_no_hash(tmp_dir, dvc):
    tmp_dir.gen("file", "file")
    (tmp_dir / "file.dvc").write_text("outs:\n- path: file\n")

    tree = RepoTree(dvc)
    with tree.open("file", "r") as fobj:
        assert fobj.read() == "file"
def test_exists(tmp_dir, dvc):
    tmp_dir.gen("foo", "foo")
    dvc.add("foo")
    (tmp_dir / "foo").unlink()

    tree = RepoTree(dvc)
    assert tree.exists("foo")
def _collect_paths(
    repo: Repo,
    targets: Iterable[str],
    recursive: bool = False,
    rev: str = None,
):
    path_infos = {PathInfo(os.path.abspath(target)) for target in targets}
    tree = RepoTree(repo)

    target_infos = set()
    for path_info in path_infos:
        if recursive and tree.isdir(path_info):
            target_infos.update(set(tree.walk_files(path_info)))

        if not tree.exists(path_info):
            if not recursive:
                logger.warning(
                    "'%s' was not found at: '%s'.", path_info, rev,
                )
            continue

        target_infos.add(path_info)
    return target_infos
def collect(self, targets=None, revs=None):
    """Collects all props and data for plots.

    Returns a structure like:
        {rev: {plots.csv: {
            props: {x: ..., "header": ..., ...},
            data: "...data as a string...",
        }}}
    Data parsing is postponed, since it's affected by props.
    """
    targets = [targets] if isinstance(targets, str) else targets or []
    data = {}
    for rev in self.repo.brancher(revs=revs):
        # .brancher() adds unwanted workspace
        if revs is not None and rev not in revs:
            continue
        rev = rev or "workspace"

        tree = RepoTree(self.repo)
        plots = _collect_plots(self.repo, targets, rev)
        for path_info, props in plots.items():
            datafile = relpath(path_info, self.repo.root_dir)
            if rev not in data:
                data[rev] = {}
            data[rev].update({datafile: {"props": props}})

            # Load data from git or dvc cache
            try:
                with tree.open(path_info) as fd:
                    data[rev][datafile]["data"] = fd.read()
            except FileNotFoundError:
                # This might happen simply because cache is absent
                pass

    return data
def _collect_paths(
    repo: "Repo",
    targets: Iterable[str],
    recursive: bool = False,
    rev: str = None,
):
    from dvc.tree.repo import RepoTree

    path_infos = [PathInfo(os.path.abspath(target)) for target in targets]
    tree = RepoTree(repo)

    target_infos = []
    for path_info in path_infos:
        if recursive and tree.isdir(path_info):
            target_infos.extend(tree.walk_files(path_info))

        if not tree.exists(path_info):
            if not recursive:
                if rev == "workspace" or rev == "":
                    logger.warning(
                        "'%s' was not found in current workspace.", path_info,
                    )
                else:
                    logger.warning(
                        "'%s' was not found at: '%s'.", path_info, rev,
                    )
            continue

        target_infos.append(path_info)
    return target_infos
def test_walk(tmp_dir, dvc, dvcfiles, extra_expected):
    tmp_dir.gen(
        {
            "dir": {
                "subdir1": {"foo1": "foo1", "bar1": "bar1"},
                "subdir2": {"foo2": "foo2"},
            }
        }
    )
    dvc.add(str(tmp_dir / "dir"), recursive=True)
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})

    tree = RepoTree(dvc)
    expected = [
        PathInfo("dir") / "subdir1",
        PathInfo("dir") / "subdir2",
        PathInfo("dir") / "subdir1" / "foo1",
        PathInfo("dir") / "subdir1" / "bar1",
        PathInfo("dir") / "subdir2" / "foo2",
        PathInfo("dir") / "foo",
        PathInfo("dir") / "bar",
    ]

    actual = []
    for root, dirs, files in tree.walk("dir", dvcfiles=dvcfiles):
        for entry in dirs + files:
            actual.append(os.path.join(root, entry))

    expected = [str(path) for path in expected + extra_expected]
    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
def _collect_metrics(repo, targets, recursive):
    if targets:
        target_infos = [
            PathInfo(os.path.abspath(target)) for target in targets
        ]
        tree = RepoTree(repo)

        rec_files = []
        if recursive:
            for target_info in target_infos:
                if tree.isdir(target_info):
                    rec_files.extend(list(tree.walk_files(target_info)))

        result = [t for t in target_infos if tree.isfile(t)]
        result.extend(rec_files)

        return result

    metrics = set()
    for stage in repo.stages:
        for out in stage.outs:
            if not out.metric:
                continue
            metrics.add(out.path_info)
    return list(metrics)
def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    tree = RepoTree(dvc)
    get_file_hash_spy = mocker.spy(tree, "get_file_hash")
    dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_dir_hash")
    with dvc.state:
        assert tree.get_hash(PathInfo(tmp_dir) / "dir") == HashInfo(
            "md5", "8761c4e9acad696bee718615e23e22db.dir",
        )
    assert get_file_hash_spy.called
    assert not dvc_tree_spy.called
    get_file_hash_spy.reset_mock()

    shutil.rmtree(tmp_dir / "dir")
    with dvc.state:
        assert tree.get_hash(PathInfo(tmp_dir) / "dir") == HashInfo(
            "md5", "8761c4e9acad696bee718615e23e22db.dir",
        )
    assert not get_file_hash_spy.called
    assert dvc_tree_spy.called
def test_open_dirty_hash(tmp_dir, dvc):
    tmp_dir.dvc_gen("file", "file")
    (tmp_dir / "file").write_text("something")

    tree = RepoTree(dvc)
    with tree.open("file", "r") as fobj:
        assert fobj.read() == "something"
def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, local_cloud):
    with erepo_dir.chdir():
        erepo_dir.gen({"dir": {"subdir": {"foo": "foo"}, "bar": "bar"}})
        erepo_dir.dvc_add("dir/subdir", commit="subdir")
        erepo_dir.scm_add("dir", commit="dir")
        erepo_dir.add_remote(config=local_cloud.config)
        erepo_dir.dvc.push()

    # test only cares that either fetch or stream are set so that DVC dirs
    # are walked.
    #
    # for this test, all file objects are being opened() and copied from
    # tree into dvc.cache, not fetched or streamed from a remote
    tree = RepoTree(erepo_dir.dvc, stream=True)
    expected = [
        tree.get_file_hash(PathInfo(erepo_dir / path)).value
        for path in ("dir/bar", "dir/subdir/foo")
    ]

    cache = dvc.cache.local
    path_info = PathInfo(erepo_dir / "dir")
    hash_info = cache.tree.get_hash(path_info)
    cache.save(path_info, tree, hash_info)

    for hash_ in expected:
        assert os.path.exists(cache.tree.hash_to_path_info(hash_))
def _filter_missing(repo, paths):
    repo_tree = RepoTree(repo, stream=True)
    for path in paths:
        metadata = repo_tree.metadata(path)
        if metadata.is_dvc:
            out = metadata.outs[0]
            if out.status()[str(out)] == "not in cache":
                yield path
def test_get_hash_dirty_file(tmp_dir, dvc):
    tmp_dir.dvc_gen("file", "file")
    (tmp_dir / "file").write_text("something")

    tree = RepoTree(dvc)
    actual = tree.get_hash(PathInfo(tmp_dir) / "file")
    expected = HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac")
    assert actual == expected
def test_isdir_mixed(tmp_dir, dvc):
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    dvc.add(str(tmp_dir / "dir" / "foo"))

    tree = RepoTree(dvc)
    assert tree.isdir("dir")
    assert not tree.isfile("dir")
def test_open(tmp_dir, dvc):
    tmp_dir.gen("foo", "foo")
    dvc.add("foo")
    (tmp_dir / "foo").unlink()

    tree = RepoTree(dvc)
    with tree.open("foo", "r") as fobj:
        assert fobj.read() == "foo"
def test_get_hash_cached_file(tmp_dir, dvc, mocker):
    tmp_dir.dvc_gen({"foo": "foo"})
    tree = RepoTree(dvc)
    dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_file_hash")
    assert tree.get_hash(PathInfo(tmp_dir) / "foo") == HashInfo(
        "md5", "acbd18db4cc2f85cedef654fccc4a4d8",
    )
    assert dvc_tree_spy.called
def test_get_hash_dirty_dir(tmp_dir, dvc):
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    (tmp_dir / "dir" / "baz").write_text("baz")

    tree = RepoTree(dvc)
    actual = tree.get_hash(PathInfo(tmp_dir) / "dir")
    expected = HashInfo("md5", "ba75a2162ca9c29acecb7957105a0bc2.dir")
    assert actual == expected
    assert actual.dir_info.nfiles == 3
def open_by_relpath(self, path, mode="r", encoding=None, **kwargs):
    """Opens a specified resource as a file object."""
    tree = RepoTree(self)
    try:
        with tree.open(path, mode=mode, encoding=encoding, **kwargs) as fobj:
            yield fobj
    except FileNotFoundError:
        raise PathMissingError(path, self.url)
def collect(
    self,
    targets: List[str] = None,
    revs: List[str] = None,
    recursive: bool = False,
) -> Dict[str, Dict]:
    """Collects all props and data for plots.

    Returns a structure like:
        {rev: {plots.csv: {
            props: {x: ..., "header": ..., ...},
            data: "...data as a string...",
        }}}
    Data parsing is postponed, since it's affected by props.
    """
    from dvc.tree.repo import RepoTree

    targets = [targets] if isinstance(targets, str) else targets or []
    data = {}
    for rev in self.repo.brancher(revs=revs):
        # .brancher() adds unwanted workspace
        if revs is not None and rev not in revs:
            continue
        rev = rev or "workspace"

        tree = RepoTree(self.repo)
        plots = _collect_plots(self.repo, targets, rev, recursive)
        for path_info, props in plots.items():
            if rev not in data:
                data[rev] = {}

            if tree.isdir(path_info):
                plot_files = []
                for pi in tree.walk_files(path_info):
                    plot_files.append((pi, relpath(pi, self.repo.root_dir)))
            else:
                plot_files = [
                    (path_info, relpath(path_info, self.repo.root_dir))
                ]

            for path, repo_path in plot_files:
                data[rev].update({repo_path: {"props": props}})

                # Load data from git or dvc cache
                try:
                    with tree.open(path) as fd:
                        data[rev][repo_path]["data"] = fd.read()
                except FileNotFoundError:
                    # This might happen simply because cache is absent
                    pass

    return data
def _filter_missing(repo, paths):
    repo_tree = RepoTree(repo, stream=True)
    for path in paths:
        try:
            metadata = repo_tree.metadata(path)
            if metadata.is_dvc:
                out = metadata.outs[0]
                if out.status().get(str(out)) == "not in cache":
                    yield path
        except FileNotFoundError:
            pass
def test_subrepo_walk(tmp_dir, scm, dvc, dvcfiles, extra_expected):
    tmp_dir.scm_gen(
        {"dir": {"repo.txt": "file to confuse RepoTree"}},
        commit="dir/repo.txt",
    )

    subrepo1 = tmp_dir / "dir" / "repo"
    subrepo2 = tmp_dir / "dir" / "repo2"
    subdirs = [subrepo1, subrepo2]
    for dir_ in subdirs:
        make_subrepo(dir_, scm)

    subrepo1.dvc_gen({"foo": "foo", "dir1": {"bar": "bar"}}, commit="FOO")
    subrepo2.dvc_gen(
        {"lorem": "lorem", "dir2": {"ipsum": "ipsum"}}, commit="BAR"
    )

    # using tree that does not have dvcignore
    dvc.tree._reset()
    tree = RepoTree(dvc, subrepos=True, fetch=True)
    expected = [
        PathInfo("dir") / "repo",
        PathInfo("dir") / "repo.txt",
        PathInfo("dir") / "repo2",
        PathInfo("dir") / "repo" / ".gitignore",
        PathInfo("dir") / "repo" / "foo",
        PathInfo("dir") / "repo" / "dir1",
        PathInfo("dir") / "repo" / "dir1" / "bar",
        PathInfo("dir") / "repo2" / ".gitignore",
        PathInfo("dir") / "repo2" / "lorem",
        PathInfo("dir") / "repo2" / "dir2",
        PathInfo("dir") / "repo2" / "dir2" / "ipsum",
    ]

    actual = []
    for root, dirs, files in tree.walk(
        os.path.join(tree.root_dir, "dir"), dvcfiles=dvcfiles
    ):
        for entry in dirs + files:
            actual.append(os.path.join(root, entry))

    expected = [
        os.path.join(tree.root_dir, path)
        for path in expected + extra_expected
    ]
    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
def _targets_to_path_infos(repo, targets):
    path_infos = []
    missing = []

    repo_tree = RepoTree(repo, stream=True)

    for target in targets:
        if repo_tree.exists(target):
            path_infos.append(repo_tree.metadata(target).path_info)
        else:
            missing.append(target)

    return path_infos, missing
def test_walk_nested_subrepos(tmp_dir, dvc, scm, traverse_subrepos):
    # generate a dvc and fs structure, with suffix based on repo's basename
    def fs_structure(suffix):
        return {
            f"foo-{suffix}": f"foo-{suffix}",
            f"dir-{suffix}": {f"bar-{suffix}": f"bar-{suffix}"},
        }

    def dvc_structure(suffix):
        return {
            f"lorem-{suffix}": f"lorem-{suffix}",
            f"dvc-{suffix}": {f"ipsum-{suffix}": f"ipsum-{suffix}"},
        }

    paths = ["subrepo1", "subrepo2", "subrepo1/subrepo3"]
    subrepos = [tmp_dir / path for path in paths]
    for repo_dir in subrepos:
        make_subrepo(repo_dir, scm)

    extras = {".dvcignore", ".gitignore"}  # these files are always there
    expected = {}
    for repo_dir in subrepos + [tmp_dir]:
        base = os.path.basename(repo_dir)
        scm_files = fs_structure(base)
        dvc_files = dvc_structure(base)
        with repo_dir.chdir():
            repo_dir.scm_gen(scm_files, commit=f"git add in {repo_dir}")
            repo_dir.dvc_gen(dvc_files, commit=f"dvc add in {repo_dir}")

        if traverse_subrepos or repo_dir == tmp_dir:
            expected[str(repo_dir)] = set(
                scm_files.keys() | dvc_files.keys() | extras
            )
            # files inside a dvc directory
            expected[str(repo_dir / f"dvc-{base}")] = {f"ipsum-{base}"}
            # files inside a git directory
            expected[str(repo_dir / f"dir-{base}")] = {f"bar-{base}"}

    if traverse_subrepos:
        # update subrepos
        expected[str(tmp_dir)].update(["subrepo1", "subrepo2"])
        expected[str(tmp_dir / "subrepo1")].add("subrepo3")

    actual = {}
    tree = RepoTree(dvc, subrepos=traverse_subrepos)
    for root, dirs, files in tree.walk(str(tmp_dir)):
        actual[root] = set(dirs + files)
    assert expected == actual
def test_repotree_walk_fetch(tmp_dir, dvc, scm, local_remote):
    out = tmp_dir.dvc_gen({"dir": {"foo": "foo"}}, commit="init")[0].outs[0]
    dvc.push()
    remove(dvc.cache.local.cache_dir)

    tree = RepoTree(dvc, fetch=True)
    with dvc.state:
        for _, _, _ in tree.walk("dir"):
            pass

    assert os.path.exists(out.cache_path)
    for entry in out.dir_cache:
        hash_ = entry[out.tree.PARAM_CHECKSUM]
        assert os.path.exists(dvc.cache.local.tree.hash_to_path_info(hash_))
def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    tree = RepoTree(dvc)
    dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_file_hash")
    subdir = PathInfo(tmp_dir) / "dir" / "subdir"
    assert tree.get_hash(subdir) == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
    )
    assert tree.get_hash(subdir / "data") == HashInfo(
        "md5", "8d777f385d3dfec8815d20f7496026dc",
    )
    assert dvc_tree_spy.called
def test_repotree_walk_fetch(tmp_dir, dvc, scm, local_remote):
    out = tmp_dir.dvc_gen({"dir": {"foo": "foo"}}, commit="init")[0].outs[0]
    dvc.push()
    remove(dvc.cache.local.cache_dir)
    remove(tmp_dir / "dir")

    tree = RepoTree(dvc, fetch=True)
    for _, _, _ in tree.walk("dir"):
        pass

    assert os.path.exists(out.cache_path)
    for _, hi in out.dir_cache.items():
        assert hi.name == out.tree.PARAM_CHECKSUM
        assert os.path.exists(dvc.cache.local.tree.hash_to_path_info(hi.value))
def test_get_hash_mixed_dir(tmp_dir, scm, dvc):
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    tmp_dir.dvc.add(os.path.join("dir", "foo"))
    tmp_dir.scm.add(
        [
            os.path.join("dir", "bar"),
            os.path.join("dir", ".gitignore"),
            os.path.join("dir", "foo.dvc"),
        ]
    )
    tmp_dir.scm.commit("add dir")

    tree = RepoTree(dvc)
    actual = tree.get_hash(PathInfo(tmp_dir) / "dir")
    expected = HashInfo("md5", "e1d9e8eae5374860ae025ec84cfd85c7.dir")
    assert actual == expected
def open_by_relpath(self, path, remote=None, mode="r", encoding=None):
    """Opens a specified resource as a file descriptor"""
    tree = RepoTree(self, stream=True, subrepos=True)
    path = PathInfo(self.root_dir) / path
    try:
        with self.state:
            with tree.open(
                path, mode=mode, encoding=encoding, remote=remote,
            ) as fobj:
                yield fobj
    except FileNotFoundError as exc:
        raise FileMissingError(path) from exc
    except IsADirectoryError as exc:
        raise DvcIsADirectoryError(f"'{path}' is a directory") from exc