def metadata(self, path):
    """Build merged metadata for *path* from the fs tree and the DVC tree.

    Raises FileNotFoundError when the path exists in neither tree.
    """
    path_info = PathInfo(os.path.abspath(path))
    fs_tree, dvc_tree = self._get_tree_pair(path_info)

    # DVC-tracked metadata, if the path is an output (or inside one).
    dvc_meta = None
    if dvc_tree:
        try:
            dvc_meta = dvc_tree.metadata(path_info)
        except OutputNotFoundError:
            pass

    # Filesystem stat, if the path physically exists.
    try:
        st = fs_tree.stat(path_info)
    except FileNotFoundError:
        st = None

    if not st and not dvc_meta:
        raise FileNotFoundError

    meta = dvc_meta or Metadata(path_info=path_info)
    if st and stat.S_ISDIR(st.st_mode):
        meta.isdir = True
    if not dvc_meta:
        # Exec bit is only meaningful for paths that exist on disk.
        meta.is_exec = bool(st) and is_exec(st.st_mode)
    return meta
def get(cls, url, src, out=None, version=None):
    """Download *src* from the repo at *url* into *out*.

    Args:
        url: repository URL to fetch the package from.
        src: path of the desired file/dir inside the repo.
        out: local destination path; defaults to ``basename(src)``.
        version: optional revision of the repo to use.
    """
    if not out:
        out = os.path.basename(src)

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(uuid.uuid4()))
    try:
        pkg = Pkg(tmp_dir, url=url, version=version)
        pkg.install()
        # Try any links possible to avoid data duplication.
        #
        # Not using symlink, because we need to remove cache after we are
        # done, and to make that work we would have to copy data over
        # anyway before removing the cache, so we might just copy it
        # right away.
        #
        # Also, we can't use theoretical "move" link type here, because
        # the same cache file might be used a few times in a directory.
        pkg.repo.config.set(
            Config.SECTION_CACHE,
            Config.SECTION_CACHE_TYPE,
            "reflink,hardlink,copy",
        )

        src = os.path.join(pkg.path, urlparse(src).path.lstrip("/"))
        (output,) = pkg.repo.find_outs_by_path(src)
        pkg.repo.fetch(output.stage.path)
        output.path_info = PathInfo(os.path.abspath(out))
        with output.repo.state:
            output.checkout()
    finally:
        # Pkg()/install() may fail before tmp_dir is created; a bare
        # rmtree would then raise FileNotFoundError and mask the
        # original exception.
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    """Granular (subdir/file) hashes come from DVC metadata, not disk."""
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = RepoFileSystem(dvc)
    subdir = PathInfo(tmp_dir) / "dir" / "subdir"
    data = subdir / "data"

    # Directory hash is only available after staging.
    assert fs.info(subdir).get("md5") is None
    subdir_hash = stage(dvc.odb.local, subdir, fs).hash_info
    assert subdir_hash == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir"
    )

    # Same for an individual file inside the tracked directory.
    assert fs.info(data).get("md5") is None
    data_hash = stage(dvc.odb.local, data, fs).hash_info
    assert data_hash == HashInfo("md5", "8d777f385d3dfec8815d20f7496026dc")

    # After removing the workspace copy, the hash is served from metadata.
    (tmp_dir / "dir" / "subdir" / "data").unlink()
    assert fs.info(data)["md5"] == "8d777f385d3dfec8815d20f7496026dc"
def _get_dir_info_hash(self, dir_info):
    """Serialize *dir_info* to JSON, upload it, and return its hash.

    Returns:
        tuple: (hash info with the ``.dir`` suffix applied, remote path
        info of the uploaded file).
    """
    import os
    import tempfile

    from dvc.path_info import PathInfo
    from dvc.utils import tmp_fname

    # Sorting the list by path to ensure reproducibility
    if isinstance(dir_info, dict):
        dir_info = self._from_dict(dir_info)
    dir_info = sorted(dir_info, key=itemgetter(self.tree.PARAM_RELPATH))

    # NamedTemporaryFile(delete=False).name leaked the open fd and left
    # the temp file behind forever; use mkstemp and clean up explicitly.
    fd, tmp = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as fobj:
            json.dump(dir_info, fobj, sort_keys=True)

        from_info = PathInfo(tmp)
        to_info = self.tree.path_info / tmp_fname("")
        self.tree.upload(from_info, to_info, no_progress_bar=True)
    finally:
        os.unlink(tmp)

    hash_info = self.tree.get_file_hash(to_info)
    hash_info.value += self.tree.CHECKSUM_DIR_SUFFIX
    hash_info.dir_info = self._to_dict(dir_info)

    return hash_info, to_info
def test_state(tmp_dir, dvc):
    """State save/get round-trips md5s and invalidates on file change."""
    tmp_dir.gen("foo", "foo content")
    path = tmp_dir / "foo"
    path_info = PathInfo(path)
    md5 = file_md5(path)[0]

    state = State(dvc)
    with state:
        state.save(path_info, md5)
        assert state.get(path_info) == md5

        # Replacing the file contents must invalidate the stored entry.
        path.unlink()
        path.write_text("1")
        assert state.get(path_info) is None

        # Saving the new md5 makes the entry valid again.
        md5 = file_md5(path)[0]
        state.save(path_info, md5)
        assert state.get(path_info) == md5
def test_state(dvc_repo, repo_dir):
    """State save/get round-trips md5s and invalidates on file change."""
    path = os.path.join(dvc_repo.root_dir, repo_dir.FOO)
    path_info = PathInfo(path)
    md5 = file_md5(path)[0]

    state = State(dvc_repo, dvc_repo.config.config)
    with state:
        state.save(path_info, md5)
        assert state.get(path_info) == md5

        # Rewrite the file; the saved entry must no longer match.
        os.unlink(path)
        with open(path, "a") as fd:
            fd.write("1")
        assert state.get(path_info) is None

        # Save the new md5 and verify it is returned.
        md5 = file_md5(path)[0]
        state.save(path_info, md5)
        assert state.get(path_info) == md5
def test_set(tmp_dir, dvc, value):
    """``set`` values interpolate into cmd strings and raw fields."""
    data = {
        "stages": {
            "build": {
                "set": {"item": value},
                "cmd": "python script.py --thresh ${item}",
                "always_changed": "${item}",
            }
        }
    }
    resolver = DataResolver(dvc, PathInfo(str(tmp_dir)), data)

    expected = {
        "stages": {
            "build": {
                "cmd": f"python script.py --thresh {value}",
                "always_changed": value,
            }
        }
    }
    assert_stage_equal(resolver.resolve(), expected)
def _collect_graph(self, stages=None):
    """Generate a graph by using the given stages on the given directory

    The nodes of the graph are the stage's path relative to the root.

    Edges are created when the output of one stage is used as a
    dependency in other stage.

    The direction of the edges goes from the stage to its dependency:

    For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

    Will create the following graph:

           ancestors <--
                       |
            C.dvc -> B.dvc -> A.dvc
                    |          |
                    |          --> descendants
                    |
                    ------- pipeline ------>
                               |
                               v
                  (weakly connected components)

    Args:
        stages (list): used to build a graph, if None given, collect stages
            in the repository.

    Raises:
        OutputDuplicationError: two outputs with the same path
        StagePathAsOutputError: stage inside an output directory
        OverlappingOutputPathsError: output inside output directory
        CyclicGraphError: resulting graph has cycles
    """
    import networkx as nx
    from dvc.exceptions import (
        OutputDuplicationError,
        StagePathAsOutputError,
        OverlappingOutputPathsError,
    )

    G = nx.DiGraph()
    # Drop falsy stage entries up front so all passes see the same list.
    stages = stages or self.stages
    stages = [stage for stage in stages if stage]
    outs = {}

    # Pass 1: index every output by its path; duplicates are an error.
    for stage in stages:
        for out in stage.outs:
            if out.path_info in outs:
                dup_stages = [stage, outs[out.path_info].stage]
                raise OutputDuplicationError(str(out), dup_stages)
            outs[out.path_info] = out

    # Pass 2: an output nested inside another output is an error.
    for stage in stages:
        for out in stage.outs:
            for p in out.path_info.parents:
                if p in outs:
                    raise OverlappingOutputPathsError(outs[p], out)

    # Pass 3: a stage file located inside an output directory is an error.
    for stage in stages:
        stage_path_info = PathInfo(stage.path)
        for p in chain([stage_path_info], stage_path_info.parents):
            if p in outs:
                raise StagePathAsOutputError(stage, str(outs[p]))

    # Pass 4: add edges stage -> producer for every dep that overlaps
    # some stage's output path.
    for stage in stages:
        G.add_node(stage)
        for dep in stage.deps:
            if dep.path_info is None:
                continue
            for out_path_info, out in outs.items():
                if out_path_info.overlaps(dep.path_info):
                    G.add_node(out.stage)
                    G.add_edge(stage, out.stage)

    check_acyclic(G)

    return G
def walk_files(self, top, **kwargs):
    """Yield a PathInfo for every file found recursively under *top*."""
    for root, _dirs, fnames in self.walk(top, **kwargs):
        root_info = PathInfo(root)
        for fname in fnames:
            yield root_info / fname
def cache_dir(self, value):
    """Set the cache directory; a falsy *value* clears it."""
    if value:
        self.path_info = PathInfo(value)
    else:
        self.path_info = None
def path_info(self):
    """Return the temporary directory wrapped as a PathInfo."""
    tmp_path = self.tmp_dir.name
    return PathInfo(tmp_path)
def test_get_inode(tmp_dir):
    """get_inode accepts both plain strings and PathInfo objects."""
    tmp_dir.gen("foo", "foo content")
    from_str = get_inode("foo")
    from_info = get_inode(PathInfo("foo"))
    assert from_str == from_info
def test_stage_fname(add):
    """Stage filename is derived from the output's basename + .dvc."""
    out = mock.Mock()
    out.is_in_repo = False
    out.path_info = PathInfo("path/to/out.txt")
    assert Stage._stage_fname([out], add) == "out.txt.dvc"
def walk_files(self, top, **kwargs):  # pylint: disable=arguments-differ
    """Yield PathInfo objects for all files below *top*."""
    for root, _, filenames in self.walk(top, **kwargs):
        yield from (PathInfo(root) / name for name in filenames)
for root, dirs, files in tree.walk("dir"): for entry in dirs + files: actual.append(os.path.join(root, entry)) assert set(actual) == set(expected) assert len(actual) == len(expected) @pytest.mark.parametrize( "fetch,expected", [ (False, []), ( True, [ PathInfo("dir") / "subdir1", PathInfo("dir") / "subdir2", PathInfo("dir") / "subdir1" / "foo1", PathInfo("dir") / "subdir1" / "bar1", PathInfo("dir") / "subdir2" / "foo2", PathInfo("dir") / "foo", PathInfo("dir") / "bar", ], ), ], ) def test_walk_dir(tmp_dir, dvc, fetch, expected): tmp_dir.gen({ "dir": { "subdir1": { "foo1": "foo1",
def test_path_info_as_posix(mocker, path, as_posix, osname):
    """as_posix() output matches the expectation for the patched OS."""
    mocker.patch("os.name", osname)
    info = PathInfo(path)
    assert info.as_posix() == as_posix
def test_get_inode(repo_dir):
    """get_inode gives the same result for str and PathInfo inputs."""
    path = repo_dir.FOO
    assert get_inode(path) == get_inode(PathInfo(path))
def _resolve_params(self, context: Context, wdir):
    """Group tracked param keys by their file path relative to *wdir*."""
    grouped = defaultdict(set)
    for src, keys in context.tracked.items():
        rel = str(PathInfo(src).relative_to(wdir))
        grouped[rel].update(keys)
    return [{fname: list(keys)} for fname, keys in grouped.items()]
def graph(self, stages=None, from_directory=None):
    """Generate a graph by using the given stages on the given directory

    The nodes of the graph are the stage's path relative to the root.

    Edges are created when the output of one stage is used as a
    dependency in other stage.

    The direction of the edges goes from the stage to its dependency:

    For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

    Will create the following graph:

           ancestors <--
                       |
            C.dvc -> B.dvc -> A.dvc
                    |          |
                    |          --> descendants
                    |
                    ------- pipeline ------>
                               |
                               v
                  (weakly connected components)

    Args:
        stages (list): used to build a graph, if None given, use the ones
            on the `from_directory`.

        from_directory (str): directory where to look at for stages, if
            None is given, use the current working directory

    Raises:
        OutputDuplicationError: two outputs with the same path
        StagePathAsOutputError: stage inside an output directory
        OverlappingOutputPathsError: output inside output directory
        CyclicGraphError: resulting graph has cycles
    """
    import networkx as nx
    from dvc.exceptions import (
        OutputDuplicationError,
        StagePathAsOutputError,
        OverlappingOutputPathsError,
    )

    # G holds every edge; G_active skips deps of locked stages.
    G = nx.DiGraph()
    G_active = nx.DiGraph()
    stages = stages or self.stages(from_directory, check_dag=False)
    stages = [stage for stage in stages if stage]
    outs = []

    # Collect outputs, rejecting duplicates and nested (overlapping) outs.
    for stage in stages:
        for out in stage.outs:
            existing = []
            for o in outs:
                if o.path_info == out.path_info:
                    existing.append(o.stage)

                in_o_dir = out.path_info.isin(o.path_info)
                in_out_dir = o.path_info.isin(out.path_info)
                if in_o_dir or in_out_dir:
                    raise OverlappingOutputPathsError(o, out)

            if existing:
                stages = [stage.relpath, existing[0].relpath]
                raise OutputDuplicationError(str(out), stages)

            outs.append(out)

    # A stage file must not live inside another stage's output directory.
    for stage in stages:
        stage_path_info = PathInfo(stage.path)
        for out in outs:
            if stage_path_info.isin(out.path_info):
                raise StagePathAsOutputError(stage.wdir, stage.relpath)

    # Build edges: stage -> stage that produces each matching dependency.
    for stage in stages:
        node = os.path.relpath(stage.path, self.root_dir)

        G.add_node(node, stage=stage)
        G_active.add_node(node, stage=stage)

        for dep in stage.deps:
            for out in outs:
                # Skip outs that neither equal nor contain/are contained
                # by this dependency's path.
                if (out.path_info != dep.path_info
                        and not dep.path_info.isin(out.path_info)
                        and not out.path_info.isin(dep.path_info)):
                    continue

                dep_stage = out.stage
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)

                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)
                # Locked stages keep their edges out of the active graph.
                if not stage.locked:
                    G_active.add_node(dep_node, stage=dep_stage)
                    G_active.add_edge(node, dep_node)

    self._check_cyclic_graph(G)

    return G, G_active
dvc.add(str(tmp_dir / "dir" / "foo")) tree = RepoTree(dvc) assert tree.isdir("dir") assert not tree.isfile("dir") @pytest.mark.parametrize( "dvcfiles,extra_expected", [ (False, []), ( True, [ PathInfo("dir") / "subdir1" / "foo1.dvc", PathInfo("dir") / "subdir1" / "bar1.dvc", PathInfo("dir") / "subdir2" / "foo2.dvc", ], ), ], ) def test_walk(tmp_dir, dvc, dvcfiles, extra_expected): tmp_dir.gen({ "dir": { "subdir1": { "foo1": "foo1", "bar1": "bar1" }, "subdir2": { "foo2": "foo2"
def test_path_isin_accepts_pathinfo():
    """path_isin should accept a PathInfo for either argument."""
    child = os.path.join("path", "to", "folder")
    parent = PathInfo(child) / ".."
    # Containment only holds in one direction.
    assert not path_isin(parent, child)
    assert path_isin(child, parent)
def test_subrepo_walk(tmp_dir, scm, dvc, dvcfiles, extra_expected):
    """RepoTree.walk descends into subrepos when subrepos=True."""
    tmp_dir.scm_gen(
        {"dir": {"repo.txt": "file to confuse RepoTree"}},
        commit="dir/repo.txt",
    )

    subrepo1 = tmp_dir / "dir" / "repo"
    subrepo2 = tmp_dir / "dir" / "repo2"
    for subrepo in (subrepo1, subrepo2):
        make_subrepo(subrepo, scm)

    subrepo1.dvc_gen({"foo": "foo", "dir1": {"bar": "bar"}}, commit="FOO")
    subrepo2.dvc_gen(
        {"lorem": "lorem", "dir2": {"ipsum": "ipsum"}}, commit="BAR"
    )

    # using tree that does not have dvcignore
    dvc.tree._reset()
    tree = RepoTree(dvc, subrepos=True, fetch=True)

    base = PathInfo("dir")
    expected = [
        base / "repo",
        base / "repo.txt",
        base / "repo2",
        base / "repo" / ".dvcignore",
        base / "repo" / ".gitignore",
        base / "repo" / "foo",
        base / "repo" / "dir1",
        base / "repo" / "dir1" / "bar",
        base / "repo2" / ".dvcignore",
        base / "repo2" / ".gitignore",
        base / "repo2" / "lorem",
        base / "repo2" / "dir2",
        base / "repo2" / "dir2" / "ipsum",
    ]

    actual = []
    for root, dirs, files in tree.walk("dir", dvcfiles=dvcfiles):
        actual.extend(os.path.join(root, entry) for entry in dirs + files)

    expected = [str(path) for path in expected + extra_expected]
    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
def walk_files(self, path_info):
    """Yield PathInfo for each file under *path_info* in the working tree."""
    assert is_working_tree(self.repo.tree)

    yield from map(PathInfo, self.repo.tree.walk_files(path_info))
def test_file_md5(repo_dir):
    """file_md5 treats str paths and PathInfo objects identically."""
    fname = repo_dir.FOO
    assert file_md5(fname) == file_md5(PathInfo(fname))
def _collect_graph(self, stages):
    """Generate a graph by using the given stages on the given directory

    The nodes of the graph are the stage's path relative to the root.

    Edges are created when the output of one stage is used as a
    dependency in other stage.

    The direction of the edges goes from the stage to its dependency:

    For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

    Will create the following graph:

           ancestors <--
                       |
            C.dvc -> B.dvc -> A.dvc
                    |          |
                    |          --> descendants
                    |
                    ------- pipeline ------>
                               |
                               v
                  (weakly connected components)

    Args:
        stages (list): used to build a graph, if None given, collect stages
            in the repository.

    Raises:
        OutputDuplicationError: two outputs with the same path
        StagePathAsOutputError: stage inside an output directory
        OverlappingOutputPathsError: output inside output directory
        CyclicGraphError: resulting graph has cycles
    """
    import networkx as nx
    from pygtrie import Trie
    from dvc.exceptions import (
        OutputDuplicationError,
        OverlappingOutputPathsError,
        StagePathAsOutputError,
    )

    G = nx.DiGraph()
    stages = stages or self.stages
    # Drop falsy entries once, so every loop below sees the same list.
    # Previously only the first loop used filter(bool, ...); the later
    # loops would crash on a None stage (stage.path / stage.deps).
    stages = [stage for stage in stages if stage]
    outs = Trie()  # Use trie to efficiently find overlapping outs and deps

    for stage in stages:
        for out in stage.outs:
            out_key = out.path_info.parts

            # Check for dup outs
            if out_key in outs:
                dup_stages = [stage, outs[out_key].stage]
                raise OutputDuplicationError(str(out), dup_stages)

            # Check for overlapping outs
            if outs.has_subtrie(out_key):
                parent = out
                overlapping = first(outs.values(prefix=out_key))
            else:
                parent = outs.shortest_prefix(out_key).value
                overlapping = out
            if parent and overlapping:
                msg = (
                    "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap. To avoid unpredictable behaviour, "
                    "rerun command with non overlapping outs paths."
                ).format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            outs[out_key] = out

    # A stage file inside another stage's output directory is an error.
    for stage in stages:
        out = outs.shortest_prefix(PathInfo(stage.path).parts).value
        if out:
            raise StagePathAsOutputError(stage, str(out))

    # Building graph
    G.add_nodes_from(stages)
    for stage in stages:
        for dep in stage.deps:
            if dep.path_info is None:
                continue

            dep_key = dep.path_info.parts
            # Outs that are prefixes of the dep (dep inside an out)...
            overlapping = [n.value for n in outs.prefixes(dep_key)]
            # ...plus outs located inside the dep directory.
            if outs.has_subtrie(dep_key):
                overlapping.extend(outs.values(prefix=dep_key))

            G.add_edges_from((stage, out.stage) for out in overlapping)

    check_acyclic(G)

    return G
def test_simple(tmp_dir, dvc):
    """A templated dvc.yaml resolves against params.yaml values."""
    dump_yaml(tmp_dir / DEFAULT_PARAMS_FILE, CONTEXT_DATA)
    resolver = DataResolver(
        dvc, PathInfo(str(tmp_dir)), deepcopy(TEMPLATED_DVC_YAML_DATA)
    )
    resolved = resolver.resolve()
    assert_stage_equal(resolved, deepcopy(RESOLVED_DVC_YAML_DATA))
def unprotect(self, target):
    """Unprotect *target* through the local cache."""
    path_info = PathInfo(target)
    return self.cache.local.unprotect(path_info)
def test_no_params_yaml_and_vars(tmp_dir, dvc):
    """Resolving a templated dvc.yaml without any params must fail."""
    resolver = DataResolver(
        dvc, PathInfo(str(tmp_dir)), deepcopy(TEMPLATED_DVC_YAML_DATA)
    )
    with pytest.raises(ResolveError):
        resolver.resolve()
def walk_files(self, path_info, **kwargs):
    """Yield a PathInfo for every file found below *path_info*."""
    for root, _, fnames in self.walk(path_info):
        for fname in fnames:
            # NOTE: os.path.join is ~5.5 times slower
            yield PathInfo(f"{root}{os.sep}{fname}")
def walk_files(self, path_info):
    """Yield PathInfo for files under *path_info*, honoring dvcignore."""
    # Delegate traversal to the module-level walk_files helper.
    yield from map(PathInfo, walk_files(path_info, self.repo.dvcignore))