def test_walk_dirty(tmp_dir, dvc):
    """walk() should merge dvc-tracked and plain workspace entries, and
    skip a tracked file that was deleted from the workspace."""
    tmp_dir.dvc_gen(
        {
            "dir": {
                "foo": "foo",
                "subdir1": {"foo1": "foo1", "bar1": "bar1"},
                "subdir2": {"foo2": "foo2"},
            }
        }
    )
    tmp_dir.gen({"dir": {"bar": "bar", "subdir3": {"foo3": "foo3"}}})
    (tmp_dir / "dir" / "foo").unlink()

    tree = RepoTree(dvc)

    base = PathInfo("dir")
    expected = [
        str(path)
        for path in (
            base / "subdir1",
            base / "subdir2",
            base / "subdir3",
            base / "subdir1" / "foo1",
            base / "subdir1" / "bar1",
            base / "subdir2" / "foo2",
            base / "subdir3" / "foo3",
            base / "bar",
        )
    ]

    actual = [
        os.path.join(root, entry)
        for root, dirs, files in tree.walk("dir")
        for entry in dirs + files
    ]

    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
def test_walk_mixed_dir(tmp_dir, scm, dvc):
    """walk() over a directory holding both git- and dvc-tracked files."""
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    tmp_dir.dvc.add(os.path.join("dir", "foo"))
    tmp_dir.scm.add(
        [
            os.path.join("dir", "bar"),
            os.path.join("dir", ".gitignore"),
            os.path.join("dir", "foo.dvc"),
        ]
    )
    tmp_dir.scm.commit("add dir")

    tree = RepoTree(dvc)

    expected = [
        str(PathInfo("dir") / name) for name in ("foo", "bar", ".gitignore")
    ]
    actual = [
        os.path.join(root, entry)
        for root, dirs, files in tree.walk("dir")
        for entry in dirs + files
    ]

    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
def _collect_plots(repo, targets=None, rev=None):
    """Map plot-output path_infos to their props.

    With no targets, every plot output in the repo is returned. Otherwise
    only plot outputs matching a target are included; remaining targets
    that are plain files get empty props, and the rest are warned about.
    """
    plots = {out for stage in repo.stages for out in stage.outs if out.plot}

    def to_result(outs):
        return {out.path_info: _plot_props(out) for out in outs}

    if not targets:
        return to_result(plots)

    target_infos = {PathInfo(os.path.abspath(target)) for target in targets}

    matched = set()
    for plot in plots:
        if plot.path_info in target_infos:
            matched.add(plot)
            target_infos.remove(plot.path_info)

    result = to_result(matched)

    # Leftover targets: plain files are plottable with default props.
    tree = RepoTree(repo)
    for info in target_infos:
        if tree.isfile(info):
            result[info] = {}
        else:
            logger.warning(
                "'%s' was not found at: '%s'. It will not be plotted.",
                info,
                rev,
            )
    return result
def _collect_paths(
    repo: Repo,
    targets: Iterable[str],
    recursive: bool = False,
    rev: str = None,
):
    """Resolve target strings into a set of existing PathInfos.

    Directories are expanded to their files when ``recursive`` is set;
    non-existent, non-recursive targets are warned about and dropped.
    """
    path_infos = {PathInfo(os.path.abspath(target)) for target in targets}
    tree = RepoTree(repo)

    target_infos = set()
    for path_info in path_infos:
        if recursive and tree.isdir(path_info):
            target_infos.update(tree.walk_files(path_info))

        if tree.exists(path_info):
            target_infos.add(path_info)
            continue

        if not recursive:
            logger.warning("'%s' was not found at: '%s'.", path_info, rev)
    return target_infos
def collect(self, targets=None, revs=None):
    """Collects all props and data for plots.

    Returns a structure like:
        {rev: {plots.csv: {
            props: {x: ..., "header": ..., ...},
            data: "...data as a string...",
        }}}
    Data parsing is postponed, since it's affected by props.
    """
    if isinstance(targets, str):
        targets = [targets]
    elif not targets:
        targets = []

    data = {}
    for rev in self.repo.brancher(revs=revs):
        # .brancher() adds unwanted workspace
        if revs is not None and rev not in revs:
            continue
        rev = rev or "workspace"

        tree = RepoTree(self.repo)
        plots = _collect_plots(self.repo, targets, rev)
        for path_info, props in plots.items():
            datafile = relpath(path_info, self.repo.root_dir)
            data.setdefault(rev, {})[datafile] = {"props": props}

            # Load data from git or dvc cache
            try:
                with tree.open(path_info) as fd:
                    data[rev][datafile]["data"] = fd.read()
            except FileNotFoundError:
                # This might happen simply because cache is absent
                pass

    return data
def test_exists(tmp_dir, dvc):
    """A dvc-tracked path still exists() after the workspace copy is removed."""
    tmp_dir.gen("foo", "foo")
    dvc.add("foo")
    (tmp_dir / "foo").unlink()

    assert RepoTree(dvc).exists("foo")
def repo_tree(temp_repo):
    """Yield a fetching RepoTree over a repo mixing git- and dvc-tracked content."""
    git_structure = {
        "models": {  # mixed dvc + git directory
            "train.py": "train dot py",
            "test.py": "test dot py",
        },
        "README.md": "my little project",  # file
        "src": {  # repo-only directory
            "utils": {
                "__init__.py": "",
                "serve_model.py": "# this will serve a model `soon`",
            }
        },
    }
    dvc_structure = {
        "data": {  # dvc only directory
            "raw": {
                "raw-1.csv": "one, dot, csv",
                "raw-2.csv": "two, dot, csv",
            },
            "processed": {
                "processed-1.csv": "1, dot, csv",
                "processed-2.csv": "2, dot, csv",
            },
        },
        "models/transform.pickle": "model model",  # file
    }

    temp_repo.scm_gen(git_structure, commit="repo init")
    temp_repo.dvc_gen(dvc_structure, commit="use dvc")

    yield RepoTree(temp_repo.dvc, fetch=True)
def test_walk(tmp_dir, dvc, dvcfiles, extra_expected):
    """walk() merges dvc-added subdirs with untracked workspace files;
    parametrized over whether .dvc files are reported (dvcfiles flag)."""
    tmp_dir.gen(
        {
            "dir": {
                "subdir1": {"foo1": "foo1", "bar1": "bar1"},
                "subdir2": {"foo2": "foo2"},
            }
        }
    )
    dvc.add(str(tmp_dir / "dir"), recursive=True)
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})

    tree = RepoTree(dvc)

    base = PathInfo("dir")
    expected_infos = [
        base / "subdir1",
        base / "subdir2",
        base / "subdir1" / "foo1",
        base / "subdir1" / "bar1",
        base / "subdir2" / "foo2",
        base / "foo",
        base / "bar",
    ]

    actual = [
        os.path.join(root, entry)
        for root, dirs, files in tree.walk("dir", dvcfiles=dvcfiles)
        for entry in dirs + files
    ]

    expected = [str(path) for path in expected_infos + extra_expected]
    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
def test_exists_isdir_isfile_dirty(tmp_dir, dvc):
    """exists/isfile/isdir should reflect dvc metadata when workspace
    content is deleted, and workspace state when a path is replaced with
    the opposite file type."""
    tmp_dir.dvc_gen(
        {"datafile": "data", "datadir": {"foo": "foo", "bar": "bar"}}
    )

    tree = RepoTree(dvc)
    shutil.rmtree(tmp_dir / "datadir")
    (tmp_dir / "datafile").unlink()

    root = PathInfo(tmp_dir)

    # (path, exists, isfile, isdir) — deleted paths still resolve via dvc.
    for path, exists, isfile, isdir in [
        (root / "datafile", True, True, False),
        (root / "datadir", True, False, True),
        (root / "datadir" / "foo", True, True, False),
    ]:
        assert bool(tree.exists(path)) == exists
        assert bool(tree.isfile(path)) == isfile
        assert bool(tree.isdir(path)) == isdir

    # NOTE: creating file instead of dir and dir instead of file
    tmp_dir.gen({"datadir": "data", "datafile": {"foo": "foo", "bar": "bar"}})

    for path, exists, isfile, isdir in [
        (root / "datafile", True, False, True),
        (root / "datadir", True, True, False),
        (root / "datadir" / "foo", False, False, False),
        (root / "datafile" / "foo", True, True, False),
    ]:
        assert bool(tree.exists(path)) == exists
        assert bool(tree.isfile(path)) == isfile
        assert bool(tree.isdir(path)) == isdir
def _output_paths(repo):
    """Yield (path, checksum) pairs for every present output in the repo.

    On a non-local (e.g. git) tree outputs are assumed present; on the
    working tree the on-disk hash is computed instead of the recorded one.
    """
    repo_tree = RepoTree(repo, stream=True)
    on_working_tree = isinstance(repo.tree, LocalTree)

    def _exists(out):
        return out.exists if on_working_tree else True

    def _to_path(out):
        # Directory outputs are reported with a trailing separator.
        if out.is_dir_checksum:
            return os.path.join(str(out), "")
        return str(out)

    def _to_checksum(out):
        if on_working_tree:
            return repo.cache.local.tree.get_hash(out.path_info).value
        return out.hash_info.value

    for stage in repo.stages:
        for out in stage.outs:
            if not _exists(out):
                continue
            yield _to_path(out), _to_checksum(out)
            if out.is_dir_checksum:
                yield from _dir_output_paths(repo_tree, out)
def _collect_metrics(repo, targets, recursive):
    """Resolve metric paths from explicit targets, or collect every
    metric output declared by the repo's stages."""
    if targets:
        target_infos = [
            PathInfo(os.path.abspath(target)) for target in targets
        ]
        tree = RepoTree(repo)

        expanded = []
        if recursive:
            for info in target_infos:
                if tree.isdir(info):
                    expanded.extend(tree.walk_files(info))

        result = [info for info in target_infos if tree.isfile(info)]
        result.extend(expanded)
        return result

    return list(
        {
            out.path_info
            for stage in repo.stages
            for out in stage.outs
            if out.metric
        }
    )
def _output_paths(repo, targets):
    """Yield (path, checksum) pairs for outputs, filtered by ``targets``.

    ``targets is None`` means everything. A dir output's contents are also
    walked when the output matches, or when a target lies inside it.
    """
    repo_tree = RepoTree(repo, stream=True)
    on_working_tree = isinstance(repo.tree, LocalTree)

    def _exists(out):
        return out.exists if on_working_tree else True

    def _to_path(out):
        # Directory outputs are reported with a trailing separator.
        if out.is_dir_checksum:
            return os.path.join(str(out), "")
        return str(out)

    def _to_checksum(out):
        if on_working_tree:
            return repo.cache.local.tree.get_hash(out.path_info).value
        return out.hash_info.value

    for stage in repo.stages:
        for out in stage.outs:
            if not _exists(out):
                continue

            yield_output = targets is None or any(
                out.path_info.isin_or_eq(target) for target in targets
            )
            if yield_output:
                yield _to_path(out), _to_checksum(out)

            if out.is_dir_checksum and (
                yield_output
                or any(target.isin(out.path_info) for target in targets)
            ):
                yield from _dir_output_paths(repo_tree, out, targets)
def test_open_dirty_hash(tmp_dir, dvc):
    """open() returns the modified workspace content, not the cached one."""
    tmp_dir.dvc_gen("file", "file")
    (tmp_dir / "file").write_text("something")

    with RepoTree(dvc).open("file", "r") as fobj:
        assert fobj.read() == "something"
def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    """Dir hash is built from per-file hashes while the dir is on disk,
    and delegated to the dvc tree's get_dir_hash once it is removed."""
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )

    tree = RepoTree(dvc)
    file_hash_spy = mocker.spy(tree, "get_file_hash")
    dir_hash_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_dir_hash")

    expected = HashInfo("md5", "8761c4e9acad696bee718615e23e22db.dir")

    with dvc.state:
        assert tree.get_hash(PathInfo(tmp_dir) / "dir") == expected
    assert file_hash_spy.called
    assert not dir_hash_spy.called

    file_hash_spy.reset_mock()
    shutil.rmtree(tmp_dir / "dir")

    with dvc.state:
        assert tree.get_hash(PathInfo(tmp_dir) / "dir") == expected
    assert not file_hash_spy.called
    assert dir_hash_spy.called
def test_open_dirty_no_hash(tmp_dir, dvc):
    """open() falls back to the workspace file when the .dvc file carries
    no hash for the output."""
    tmp_dir.gen("file", "file")
    (tmp_dir / "file.dvc").write_text("outs:\n- path: file\n")

    with RepoTree(dvc).open("file", "r") as fobj:
        assert fobj.read() == "file"
def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, local_cloud):
    """cache.save() over a streaming RepoTree should land each file of an
    external repo's dvc dir in the local cache."""
    with erepo_dir.chdir():
        erepo_dir.gen({"dir": {"subdir": {"foo": "foo"}, "bar": "bar"}})
        erepo_dir.dvc_add("dir/subdir", commit="subdir")
        erepo_dir.scm_add("dir", commit="dir")
        erepo_dir.add_remote(config=local_cloud.config)
        erepo_dir.dvc.push()

    # test only cares that either fetch or stream are set so that DVC dirs are
    # walked.
    #
    # for this test, all file objects are being opened() and copied from tree
    # into dvc.cache, not fetched or streamed from a remote
    tree = RepoTree(erepo_dir.dvc, stream=True)
    wanted_hashes = [
        tree.get_file_hash(PathInfo(erepo_dir / path)).value
        for path in ("dir/bar", "dir/subdir/foo")
    ]

    cache = dvc.cache.local
    dir_info = PathInfo(erepo_dir / "dir")
    cache.save(dir_info, tree, cache.tree.get_hash(dir_info))

    for hash_ in wanted_hashes:
        assert os.path.exists(cache.tree.hash_to_path_info(hash_))
def _collect_paths(
    repo: "Repo",
    targets: Iterable[str],
    recursive: bool = False,
    rev: str = None,
):
    """Resolve target strings into a list of existing PathInfos.

    Directories are expanded when ``recursive`` is set. Missing targets
    are warned about (with a workspace-specific message) and dropped.
    """
    from dvc.tree.repo import RepoTree

    path_infos = [PathInfo(os.path.abspath(target)) for target in targets]
    tree = RepoTree(repo)

    target_infos = []
    for path_info in path_infos:
        if recursive and tree.isdir(path_info):
            target_infos.extend(tree.walk_files(path_info))

        if tree.exists(path_info):
            target_infos.append(path_info)
            continue

        if not recursive:
            if rev in ("workspace", ""):
                logger.warning(
                    "'%s' was not found in current workspace.", path_info
                )
            else:
                logger.warning(
                    "'%s' was not found at: '%s'.", path_info, rev
                )
    return target_infos
def _filter_missing(repo, paths):
    """Yield each path whose dvc-tracked output is absent from the cache.

    Non-dvc paths are skipped, as are paths that cannot be resolved at the
    current revision.
    """
    repo_tree = RepoTree(repo, stream=True)
    for path in paths:
        try:
            metadata = repo_tree.metadata(path)
        except FileNotFoundError:
            # Path does not exist at this revision — nothing to report.
            continue
        if metadata.is_dvc:
            out = metadata.outs[0]
            # status() returns an empty dict for a clean output, so direct
            # indexing with [str(out)] would raise KeyError; use .get()
            # (matching the other _filter_missing implementation).
            if out.status().get(str(out)) == "not in cache":
                yield path
def test_get_hash_dirty_file(tmp_dir, dvc):
    """get_hash() should hash the current (dirty) workspace content."""
    tmp_dir.dvc_gen("file", "file")
    (tmp_dir / "file").write_text("something")

    result = RepoTree(dvc).get_hash(PathInfo(tmp_dir) / "file")
    assert result == HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac")
def test_open(tmp_dir, dvc):
    """A dvc-tracked file is still readable after its workspace copy is
    deleted."""
    tmp_dir.gen("foo", "foo")
    dvc.add("foo")
    (tmp_dir / "foo").unlink()

    with RepoTree(dvc).open("foo", "r") as fobj:
        assert fobj.read() == "foo"
def test_isdir_mixed(tmp_dir, dvc):
    """A directory mixing dvc- and git-tracked files is a dir, not a file."""
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    dvc.add(str(tmp_dir / "dir" / "foo"))

    mixed = RepoTree(dvc)
    assert mixed.isdir("dir")
    assert not mixed.isfile("dir")
def test_get_hash_cached_file(tmp_dir, dvc, mocker):
    """get_hash() on a clean dvc file goes through the dvc tree's
    get_file_hash."""
    tmp_dir.dvc_gen({"foo": "foo"})

    tree = RepoTree(dvc)
    spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_file_hash")

    expected = HashInfo("md5", "acbd18db4cc2f85cedef654fccc4a4d8")
    assert tree.get_hash(PathInfo(tmp_dir) / "foo") == expected
    assert spy.called
def _get_tree_for(self, repo, **kwargs):
    """
    Provides a combined tree of a single repo with dvc + git/local tree.
    """
    # Caller kwargs override the configured defaults; fetch is on unless
    # explicitly overridden.
    kw = {**self.tree_confs, **kwargs}
    kw.setdefault("fetch", True)
    return RepoTree(repo, **kw)
def open_by_relpath(self, path, mode="r", encoding=None, **kwargs):
    """Opens a specified resource as a file object.

    Raises:
        PathMissingError: if ``path`` does not exist in the repo tree.
    """
    tree = RepoTree(self)
    try:
        with tree.open(path, mode=mode, encoding=encoding, **kwargs) as fobj:
            yield fobj
    except FileNotFoundError as exc:
        # Chain the original error so the underlying cause stays visible
        # in tracebacks (PEP 3134).
        raise PathMissingError(path, self.url) from exc
def test_get_hash_dirty_dir(tmp_dir, dvc):
    """get_hash() on a dir with an untracked extra file includes it in the
    dir hash and file count."""
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    (tmp_dir / "dir" / "baz").write_text("baz")

    hash_info = RepoTree(dvc).get_hash(PathInfo(tmp_dir) / "dir")
    assert hash_info == HashInfo("md5", "ba75a2162ca9c29acecb7957105a0bc2.dir")
    assert hash_info.dir_info.nfiles == 3
def repo_tree(self):
    """Build the combined repo tree, honoring subrepos and tree config."""
    from dvc.tree.repo import RepoTree

    kwargs = dict(self.tree_confs)
    return RepoTree(
        self,
        subrepos=self.subrepos,
        repo_factory=self.make_repo,
        **kwargs,
    )
def test_isdvc(tmp_dir, dvc):
    """isdvc() is true for tracked roots, false inside them unless
    recursive=True is passed."""
    tmp_dir.gen({"foo": "foo", "bar": "bar", "dir": {"baz": "baz"}})
    for target in ("foo", "dir"):
        dvc.add(target)

    tree = RepoTree(dvc)
    for path, expected in [
        ("foo", True),
        ("bar", False),
        ("dir", True),
        ("dir/baz", False),
    ]:
        assert bool(tree.isdvc(path)) == expected
    assert tree.isdvc("dir/baz", recursive=True)
def collect(
    self,
    targets: List[str] = None,
    revs: List[str] = None,
    recursive: bool = False,
) -> Dict[str, Dict]:
    """Collects all props and data for plots.

    Returns a structure like:
        {rev: {plots.csv: {
            props: {x: ..., "header": ..., ...},
            data: "...data as a string...",
        }}}
    Data parsing is postponed, since it's affected by props.
    """
    from dvc.tree.repo import RepoTree

    if isinstance(targets, str):
        targets = [targets]
    elif not targets:
        targets = []

    data = {}
    for rev in self.repo.brancher(revs=revs):
        # .brancher() adds unwanted workspace
        if revs is not None and rev not in revs:
            continue
        rev = rev or "workspace"

        tree = RepoTree(self.repo)
        plots = _collect_plots(self.repo, targets, rev, recursive)
        for path_info, props in plots.items():
            data.setdefault(rev, {})

            # A plot target may be a directory of plot files.
            if tree.isdir(path_info):
                plot_files = [
                    (pi, relpath(pi, self.repo.root_dir))
                    for pi in tree.walk_files(path_info)
                ]
            else:
                plot_files = [
                    (path_info, relpath(path_info, self.repo.root_dir))
                ]

            for path, repo_path in plot_files:
                data[rev][repo_path] = {"props": props}

                # Load data from git or dvc cache
                try:
                    with tree.open(path) as fd:
                        data[rev][repo_path]["data"] = fd.read()
                except FileNotFoundError:
                    # This might happen simply because cache is absent
                    pass

    return data
def _filter_missing(repo, paths):
    """Yield paths whose dvc-tracked output is reported "not in cache".

    Paths that cannot be resolved at this revision are silently skipped.
    """
    tree = RepoTree(repo, stream=True)
    for path in paths:
        try:
            meta = tree.metadata(path)
            if meta.is_dvc:
                out = meta.outs[0]
                if out.status().get(str(out)) == "not in cache":
                    yield path
        except FileNotFoundError:
            pass
def _collect_input(self, executor: ExperimentExecutor):
    """Copy (upload) input from the experiments workspace to the executor
    tree.
    """
    logger.debug("Collecting input for '%s'", executor.tmp_dir)
    repo_tree = RepoTree(self.exp_dvc)
    files = executor.collect_files(self.exp_dvc.tree, repo_tree)
    self._process(executor.tree, self.exp_dvc.tree, files)