def __init__(self, root_dir=None, scm=None, rev=None):
    """Build the repo object and wire up all of its sub-components.

    Args:
        root_dir: directory to start the repo-root lookup from.
        scm: optional, already constructed SCM object. When given, the
            repo tree is taken from ``scm.get_tree(rev)`` and state
            tracking is replaced by ``StateNoop``.
        rev: revision handed to ``scm.get_tree`` (only used with ``scm``).
    """
    from dvc.state import State, StateNoop
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.repo.plots import Plots
    from dvc.repo.params import Params
    from dvc.scm.tree import WorkingTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    if not scm:
        found_root = self.find_root(root_dir)
        self.root_dir = os.path.abspath(os.path.realpath(found_root))
        self.tree = WorkingTree(self.root_dir)
    else:
        # use GitTree instead of WorkingTree as default repo tree instance
        scm_tree = scm.get_tree(rev)
        self.root_dir = self.find_root(root_dir, scm_tree)
        self.scm = scm
        self.tree = scm_tree
        self.state = StateNoop()

    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
    self.config = Config(self.dvc_dir, tree=self.tree)

    if not scm:
        self.scm = SCM(
            self.root_dir, no_scm=self.config["core"].get("no_scm", False)
        )

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    use_hardlink_lock = self.config["core"].get("hardlink_lock", False)
    lock_path = os.path.join(self.tmp_dir, "lock")
    self.lock = make_lock(
        lock_path,
        tmp_dir=self.tmp_dir,
        hardlink_lock=use_hardlink_lock,
        friendly=True,
    )

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    if not scm:
        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self.cache.local)

    self.stage_cache = StageCache(self)

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)

    self._ignore()
def test_nobranch(self):
    """Walk the working tree from its root and from a nested directory."""
    tree = WorkingTree(self._root_dir)

    full_walk = tree.walk(".")
    expected_full = [
        (
            self._root_dir,
            ["data_dir"],
            ["bar", "тест", "code.py", "foo"],
        ),
        (join(self._root_dir, "data_dir"), ["data_sub_dir"], ["data"]),
        (
            join(self._root_dir, "data_dir", "data_sub_dir"),
            [],
            ["data_sub"],
        ),
    ]
    self.assertWalkEqual(full_walk, expected_full)

    nested_walk = tree.walk(join("data_dir", "data_sub_dir"))
    expected_nested = [
        (
            join(self._root_dir, "data_dir", "data_sub_dir"),
            [],
            ["data_sub"],
        )
    ]
    self.assertWalkEqual(nested_walk, expected_nested)
def test_nobranch(self):
    """Walk the working tree (with a dvcignore filter) from root and subdir."""
    tree = WorkingTree(self._root_dir)
    dvcignore = DvcIgnoreFilter(self._root_dir, tree)

    full_walk = tree.walk(".", dvcignore=dvcignore)
    expected_full = [
        (
            self._root_dir,
            ["data_dir"],
            ["bar", "тест", "code.py", "foo"],
        ),
        (join(self._root_dir, "data_dir"), ["data_sub_dir"], ["data"]),
        (
            join(self._root_dir, "data_dir", "data_sub_dir"),
            [],
            ["data_sub"],
        ),
    ]
    self.assertWalkEqual(full_walk, expected_full)

    nested_walk = tree.walk(
        join("data_dir", "data_sub_dir"), dvcignore=dvcignore
    )
    expected_nested = [
        (
            join(self._root_dir, "data_dir", "data_sub_dir"),
            [],
            ["data_sub"],
        )
    ]
    self.assertWalkEqual(nested_walk, expected_nested)
def test_subdir(self):
    """Walking a nested directory yields only that directory's entry."""
    tree = WorkingTree(self._root_dir)

    walked = tree.walk(join("data_dir", "data_sub_dir"))
    expected = [
        (
            join(self._root_dir, "data_dir", "data_sub_dir"),
            [],
            ["data_sub"],
        )
    ]

    self.assertWalkEqual(walked, expected)
def test(self):
    """Walk a default-constructed WorkingTree starting at "."."""
    tree = WorkingTree()

    walked = tree.walk(".")
    expected = [
        (".", ["data_dir"], ["code.py", "bar", "тест", "foo"]),
        (join("data_dir"), ["data_sub_dir"], ["data"]),
        (join("data_dir", "data_sub_dir"), [], ["data_sub"]),
    ]

    self.assertWalkEqual(walked, expected)
def test_subdir(self):
    """Walking a nested directory with a dvcignore filter yields one entry."""
    dvcignore = DvcIgnoreFilter(self.root_dir)
    tree = WorkingTree(self._root_dir)

    walked = tree.walk(join("data_dir", "data_sub_dir"), dvcignore=dvcignore)
    expected = [
        (
            join(self._root_dir, "data_dir", "data_sub_dir"),
            [],
            ["data_sub"],
        )
    ]

    self.assertWalkEqual(walked, expected)
def __init__(self, root_dir=None):
    """Locate the repo root and construct all repo sub-components.

    Args:
        root_dir: directory to start the repo-root lookup from; passed
            to ``find_root``.
    """
    from dvc.state import State
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.scm.tree import WorkingTree
    from dvc.repo.tag import Tag
    from dvc.utils import makedirs

    found_root = self.find_root(root_dir)
    self.root_dir = os.path.abspath(os.path.realpath(found_root))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.scm = SCM(self.root_dir)

    self.tree = WorkingTree(self.root_dir)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    makedirs(self.tmp_dir, exist_ok=True)

    use_hardlink_lock = self.config.config["core"].get("hardlink_lock", False)
    lock_path = os.path.join(self.dvc_dir, "lock")
    lock_tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.lock = make_lock(
        lock_path,
        tmp_dir=lock_tmp_dir,
        hardlink_lock=use_hardlink_lock,
        friendly=True,
    )

    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self, self.config.config)

    # Apply the log level from the core config section, if one is set.
    core_section = self.config.config[Config.SECTION_CORE]
    log_level = core_section.get(Config.SECTION_CORE_LOGLEVEL)
    if log_level:
        logger.setLevel(log_level.upper())

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    self.metrics = Metrics(self)
    self.tag = Tag(self)

    self._ignore()
def __init__(self, root_dir=None):
    """Locate the repo root and construct all repo sub-components.

    Args:
        root_dir: directory to start the repo-root lookup from; passed
            to ``find_root``.
    """
    from dvc.state import State
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.scm.tree import WorkingTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    found_root = self.find_root(root_dir)
    self.root_dir = os.path.abspath(os.path.realpath(found_root))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.scm = SCM(
        self.root_dir, no_scm=self.config["core"].get("no_scm", False)
    )

    self.tree = WorkingTree(self.root_dir)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    use_hardlink_lock = self.config["core"].get("hardlink_lock", False)
    lock_path = os.path.join(self.tmp_dir, "lock")
    self.lock = make_lock(
        lock_path,
        tmp_dir=self.tmp_dir,
        hardlink_lock=use_hardlink_lock,
        friendly=True,
    )

    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self)

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    self.stage_cache = StageCache(self.cache.local.cache_dir)

    self.metrics = Metrics(self)
    self.params = Params(self)

    self._ignore()
def work_tree(self):
    """Return a fresh WorkingTree for the repo root, or None without a repo."""
    # repo.brancher may swap repo.tree between WorkingTree and GitTree at
    # any time; when it is a GitTree, the local cache still has to operate
    # on its own WorkingTree instance.
    if not self.repo:
        return None
    return WorkingTree(self.repo.root_dir)
def __init__(self, repo):
    """Configure the state tracker for ``repo``.

    Reads ``row_limit`` and ``row_cleanup_quota`` from the repo's
    ``state`` config section, falling back to the class-level defaults.
    When the repo has no ``tmp_dir``, no state file is used and
    ``state_file`` is set to ``None``.

    Args:
        repo: repo object providing ``root_dir``, ``config`` and
            ``tmp_dir``.
    """
    self.repo = repo
    self.root_dir = repo.root_dir
    self.tree = WorkingTree(self.root_dir)

    state_config = repo.config.get("state", {})
    self.row_limit = state_config.get("row_limit", self.STATE_ROW_LIMIT)
    self.row_cleanup_quota = state_config.get(
        "row_cleanup_quota", self.STATE_ROW_CLEANUP_QUOTA
    )

    # Define every attribute before the early return below, so that an
    # instance created without a tmp_dir is still fully initialised and
    # attribute access does not raise AttributeError.
    self.temp_files = []
    self.database = None
    self.cursor = None
    self.inserts = 0

    if not repo.tmp_dir:
        self.state_file = None
        return

    self.state_file = os.path.join(repo.tmp_dir, self.STATE_FILE)

    # https://www.sqlite.org/tempfiles.html
    self.temp_files = [
        self.state_file + "-journal",
        self.state_file + "-wal",
    ]
def test_path_object_and_str_are_valid_types_get_mtime_and_size(
        path, repo_dir):
    """A str path and its PathInfo must give the same mtime and size."""
    clean_tree = CleanTree(WorkingTree(repo_dir.root_dir))

    str_mtime, str_size = get_mtime_and_size(path, clean_tree)
    info_mtime, info_size = get_mtime_and_size(PathInfo(path), clean_tree)

    assert str_mtime == info_mtime
    assert str_size == info_size
def test_ignore_collecting_dvcignores(repo_dir, dname):
    """Check which ignores the filter holds when a parent ignores ``dname``."""
    root = repo_dir.root_dir

    # The ignore file next to ``dname`` lists the directory's own name.
    top_ignore_file = os.path.join(
        root, os.path.dirname(dname), DvcIgnore.DVCIGNORE_FILE
    )
    repo_dir.create(top_ignore_file, os.path.basename(dname))

    # A second ignore file is created inside ``dname`` itself.
    nested_ignore_file = os.path.join(root, dname, DvcIgnore.DVCIGNORE_FILE)
    repo_dir.create(nested_ignore_file, repo_dir.FOO)

    collected = DvcIgnoreFilter(
        repo_dir.root_dir, WorkingTree(repo_dir.root_dir)
    ).ignores
    expected = {
        DvcIgnoreDirs([".git", ".hg", ".dvc"]),
        DvcIgnorePatterns(top_ignore_file, WorkingTree(repo_dir.root_dir)),
    }

    assert collected == expected
def _check(self, branch, target, with_deps, expected):
    """Collect ``target`` + ".dvc" on the chosen tree and compare its outputs.

    Uses a GitTree for ``branch`` when one is given, a WorkingTree
    otherwise, and returns the collected stages.
    """
    self.dvc.tree = (
        GitTree(self.dvc.scm.git, branch) if branch else WorkingTree()
    )

    stages = self.dvc.collect(target + ".dvc", with_deps=with_deps)

    outs_per_stage = [[str(out) for out in stage.outs] for stage in stages]
    self.assertEqual(outs_per_stage, expected)

    return stages
def test_path_object_and_str_are_valid_types_get_mtime_and_size(
        path, repo_dir):
    """A str path and its PathInfo must give the same mtime and size."""
    ignore_filter = DvcIgnoreFilter(
        repo_dir.root_dir, WorkingTree(repo_dir.root_dir)
    )

    str_mtime, str_size = get_mtime_and_size(path, ignore_filter)
    info_mtime, info_size = get_mtime_and_size(PathInfo(path), ignore_filter)

    assert str_mtime == info_mtime
    assert str_size == info_size
def __init__(self, root_dir=None):
    """Locate the repo root, construct all sub-components, run the updater.

    Args:
        root_dir: directory to start the repo-root lookup from; passed
            to ``find_root``.
    """
    from dvc.config import Config
    from dvc.state import State
    from dvc.lock import Lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.updater import Updater
    from dvc.repo.metrics import Metrics
    from dvc.scm.tree import WorkingTree
    from dvc.repo.tag import Tag
    from dvc.repo.pkg import Pkg

    found_root = self.find_root(root_dir)
    self.root_dir = os.path.abspath(os.path.realpath(found_root))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.tree = WorkingTree(self.root_dir)

    self.scm = SCM(self.root_dir, repo=self)
    self.lock = Lock(self.dvc_dir)
    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self, self.config.config)

    # Apply the log level from the core config section, if one is set.
    core_section = self.config.config[Config.SECTION_CORE]
    log_level = core_section.get(Config.SECTION_CORE_LOGLEVEL)
    if log_level:
        logger.setLevel(log_level.upper())

    self.cache = Cache(self)
    self.cloud = DataCloud(self, config=self.config.config)
    self.updater = Updater(self.dvc_dir)

    self.metrics = Metrics(self)
    self.tag = Tag(self)
    self.pkg = Pkg(self)

    self._ignore()

    self.updater.check()
def test(self):
    """Walk the whole working tree from its root with a dvcignore filter."""
    dvcignore = DvcIgnoreFilter(self.root_dir)
    tree = WorkingTree(self._root_dir)

    walked = tree.walk(self._root_dir, dvcignore=dvcignore)
    expected = [
        (
            self._root_dir,
            ["data_dir"],
            ["code.py", "bar", "тест", "foo"],
        ),
        (join(self._root_dir, "data_dir"), ["data_sub_dir"], ["data"]),
        (
            join(self._root_dir, "data_dir", "data_sub_dir"),
            [],
            ["data_sub"],
        ),
    ]

    self.assertWalkEqual(walked, expected)
def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname):
    """Check which ignores the repo tree holds when a parent ignores ``dname``."""
    tmp_dir.gen({"dir": {"subdir": {}}})

    # The ignore file next to ``dname`` lists the directory's own name.
    top_ignore_file = (tmp_dir / dname).with_name(DvcIgnore.DVCIGNORE_FILE)
    top_ignore_file.write_text(os.path.basename(dname))

    # A second ignore file is written inside ``dname`` itself.
    nested_ignore_file = tmp_dir / dname / DvcIgnore.DVCIGNORE_FILE
    nested_ignore_file.write_text("foo")

    collected = dvc.tree.dvcignore.ignores
    expected = {
        DvcIgnoreDirs([".git", ".hg", ".dvc"]),
        DvcIgnorePatterns(fspath(top_ignore_file), WorkingTree(dvc.root_dir)),
    }

    assert collected == expected
def test(self):
    """get_mtime_and_size returns strings, with sizes matching the files."""
    clean_tree = CleanTree(WorkingTree(self.root_dir))
    file_mtime, file_size = get_mtime_and_size(self.DATA, clean_tree)
    dir_mtime, dir_size = get_mtime_and_size(self.DATA_DIR, clean_tree)

    expected_file_size = os.path.getsize(self.DATA)
    expected_dir_size = sum(
        os.path.getsize(entry) for entry in (self.DATA, self.DATA_SUB)
    )

    checks = (
        (file_mtime, file_size, expected_file_size),
        (dir_mtime, dir_size, expected_dir_size),
    )
    for mtime, size, expected_size in checks:
        self.assertIs(type(mtime), str)
        self.assertIs(type(size), str)
        self.assertEqual(size, str(expected_size))
def test_path_object_and_str_are_valid_types_get_mtime_and_size(tmp_dir):
    """A str path and its PathInfo give the same result, for dir and file."""
    tmp_dir.gen(
        {"dir": {"dir_file": "dir file content"}, "file": "file_content"}
    )
    clean_tree = CleanTree(WorkingTree(tmp_dir))

    for name in ("dir", "file"):
        str_mtime, str_size = get_mtime_and_size(name, clean_tree)
        info_mtime, info_size = get_mtime_and_size(PathInfo(name), clean_tree)
        assert str_mtime == info_mtime
        assert str_size == info_size
def test(self):
    """get_mtime_and_size returns strings, with sizes matching the files."""
    ignore_filter = DvcIgnoreFilter(self.root_dir, WorkingTree(self.root_dir))
    file_mtime, file_size = get_mtime_and_size(self.DATA, ignore_filter)
    dir_mtime, dir_size = get_mtime_and_size(self.DATA_DIR, ignore_filter)

    expected_file_size = os.path.getsize(self.DATA)
    expected_dir_size = sum(
        os.path.getsize(entry) for entry in (self.DATA, self.DATA_SUB)
    )

    checks = (
        (file_mtime, file_size, expected_file_size),
        (dir_mtime, dir_size, expected_dir_size),
    )
    for mtime, size, expected_size in checks:
        self.assertIs(type(mtime), str)
        self.assertIs(type(size), str)
        self.assertEqual(size, str(expected_size))
def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname):
    """Check which ignores the repo tree holds when a parent ignores ``dname``.

    Exactly three ignores are expected: the default directory ignore, the
    patterns from the ignore file next to ``dname``, and one
    ``DvcIgnoreRepo`` instance.
    """
    tmp_dir.gen({"dir": {"subdir": {}}})

    # The ignore file next to ``dname`` lists the directory's own name.
    top_ignore_file = (tmp_dir / dname).with_name(DvcIgnore.DVCIGNORE_FILE)
    top_ignore_file.write_text(os.path.basename(dname))

    # A second ignore file is written inside ``dname`` itself.
    ignore_file = tmp_dir / dname / DvcIgnore.DVCIGNORE_FILE
    ignore_file.write_text("foo")

    assert len(dvc.tree.dvcignore.ignores) == 3
    assert DvcIgnoreDirs([".git", ".hg", ".dvc"]) in dvc.tree.dvcignore.ignores
    assert (
        DvcIgnorePatterns(
            os.fspath(top_ignore_file), WorkingTree(dvc.root_dir)
        )
        in dvc.tree.dvcignore.ignores
    )
    # Feed any() the isinstance() result itself, not the matching object:
    # the check must not depend on the truthiness of the instance.
    assert any(
        isinstance(ignore, DvcIgnoreRepo)
        for ignore in dvc.tree.dvcignore.ignores
    )
def _ls(repo, path_info, recursive=None, dvc=False):
    """List entries under ``path_info`` as a ``{relative path: flags}`` dict.

    Args:
        repo: repo whose tree is listed.
        path_info: path to list; keys are relative to it.
        recursive: when falsy, only the first walked directory level is
            listed, and its sub-directories are included as entries.
        dvc: list through ``DvcTree`` instead of the cleaned working tree.

    Returns:
        A dict mapping each path to its ``isout``/``isdir``/``isexec``
        flags. A single-entry dict keyed by ``path_info.name`` when the
        walk raises ``NotADirectoryError``; an empty dict when it raises
        ``FileNotFoundError``.
    """
    from dvc.ignore import CleanTree
    from dvc.repo.tree import DvcTree
    from dvc.scm.tree import WorkingTree

    tree = DvcTree(repo) if dvc else CleanTree(WorkingTree(repo.root_dir))

    def _isexec(fs_path):
        # In dvc mode the exec bit is always reported as False.
        return False if dvc else tree.isexec(fs_path)

    entries = {}
    try:
        for root, dirs, files in tree.walk(path_info.fspath):
            root_info = PathInfo(root)

            for fname in files:
                file_info = root_info / fname
                key = str(file_info.relative_to(path_info))
                entries[key] = {
                    "isout": dvc,
                    "isdir": False,
                    "isexec": _isexec(file_info.fspath),
                }

            if recursive:
                continue

            # Non-recursive listing: add the directories of this first
            # level and stop walking.
            for dname in dirs:
                dir_info = root_info / dname
                key = str(dir_info.relative_to(path_info))
                entries[key] = {
                    "isout": tree.isdvc(dir_info.fspath) if dvc else False,
                    "isdir": True,
                    "isexec": _isexec(dir_info.fspath),
                }
            break
    except NotADirectoryError:
        return {
            path_info.name: {
                "isout": dvc,
                "isdir": False,
                "isexec": _isexec(path_info.fspath),
            }
        }
    except FileNotFoundError:
        return {}

    return entries
def open(self, path, mode="r", encoding="utf-8"):
    """Open the cached file of the single output found for ``path``.

    Args:
        path: path looked up with ``self._find_outs``.
        mode: mode passed to the built-in ``open``.
        encoding: encoding passed to the built-in ``open``.

    Returns:
        The file object opened on the output's ``cache_path``.

    Raises:
        FileNotFoundError: no output is found for ``path``, or the
            output's cache is reported as changed.
        IsADirectoryError: ``path`` does not resolve to exactly one
            output, or that output is a directory checksum. This is a
            subclass of ``IOError``, so callers catching ``IOError`` still
            work.
    """
    import os

    try:
        outs = self._find_outs(path, strict=False)
    except OutputNotFoundError as exc:
        raise FileNotFoundError from exc

    if len(outs) != 1 or outs[0].is_dir_checksum:
        # A one-argument IOError(errno.EISDIR) leaves ``.errno`` as None.
        # Passing errno and message populates it for callers to inspect.
        raise IsADirectoryError(
            errno.EISDIR, os.strerror(errno.EISDIR), path
        )

    out = outs[0]
    # temporary hack to make cache use WorkingTree and not GitTree, because
    # cache dir doesn't exist in the latter.
    saved_tree = self.repo.tree
    self.repo.tree = WorkingTree(self.repo.root_dir)
    try:
        if out.changed_cache():
            raise FileNotFoundError
    finally:
        # Always restore the tree that was swapped out above.
        self.repo.tree = saved_tree

    return open(out.cache_path, mode=mode, encoding=encoding)
def _ls_files_repo(path_info, recursive=None):
    """Return fs nodes for the entries found under ``path_info``.

    Returns an empty list when the path does not exist. Without
    ``recursive`` only the first walked level is listed, directories
    included. If the walk raises ``NotADirectoryError`` and the path is a
    file, that file is the only entry.
    """
    from dvc.compat import fspath
    from dvc.ignore import CleanTree
    from dvc.path_info import PathInfo
    from dvc.scm.tree import WorkingTree

    if not os.path.exists(fspath(path_info)):
        return []

    entries = []
    tree = CleanTree(WorkingTree(path_info))
    try:
        for dirpath, dirnames, filenames in tree.walk(path_info):
            entries += [PathInfo(dirpath, fname) for fname in filenames]
            if recursive:
                continue
            # Non-recursive: add this level's directories and stop.
            entries += [PathInfo(dirpath, dname) for dname in dirnames]
            break
    except NotADirectoryError:
        if os.path.isfile(fspath(path_info)):
            entries = [path_info]

    return [_get_fs_node(entry) for entry in entries]
def _ls_files_repo(target_path_info, recursive=None):
    """Return PathInfo entries found under ``target_path_info``.

    Returns an empty list when the path does not exist. Without
    ``recursive`` only the first walked level is listed, directories
    included. If the walk raises ``NotADirectoryError`` and the path is a
    file, the result is a list holding just ``target_path_info``.
    """
    from dvc.compat import fspath
    from dvc.ignore import CleanTree
    from dvc.path_info import PathInfo
    from dvc.scm.tree import WorkingTree

    if not os.path.exists(fspath(target_path_info)):
        return []

    entries = []
    tree = CleanTree(WorkingTree(target_path_info))
    try:
        for dirpath, dirnames, filenames in tree.walk(target_path_info):
            entries += [PathInfo(dirpath, fname) for fname in filenames]
            if recursive:
                continue
            # Non-recursive: add this level's directories and stop.
            entries += [PathInfo(dirpath, dname) for dname in dirnames]
            break
    except NotADirectoryError:
        if os.path.isfile(fspath(target_path_info)):
            return [target_path_info]

    return entries
def open(self, path, mode="r", encoding="utf-8", remote=None):
    """Open the single output found for ``path``, fetching it if allowed.

    When the output's cache is reported as changed, the file is either
    opened directly on the remote (``self.stream``) or pulled into the
    cache first. With neither ``self.fetch`` nor ``self.stream`` set,
    ``FileNotFoundError`` is raised.

    Raises:
        FileNotFoundError: no output is found for ``path``, or the cache
            is changed and neither fetching nor streaming is enabled.
        IsADirectoryError: ``path`` does not resolve to exactly one
            output, or that output is a directory checksum.
    """
    try:
        found = self._find_outs(path, strict=False)
    except OutputNotFoundError as exc:
        raise FileNotFoundError from exc

    if not (len(found) == 1 and not found[0].is_dir_checksum):
        raise IsADirectoryError

    out = found[0]
    # temporary hack to make cache use WorkingTree and not GitTree, because
    # cache dir doesn't exist in the latter.
    previous_tree = self.repo.tree
    self.repo.tree = WorkingTree(self.repo.root_dir)
    try:
        if out.changed_cache():
            if not (self.fetch or self.stream):
                raise FileNotFoundError

            remote_obj = self.repo.cloud.get_remote(remote)

            if self.stream:
                try:
                    remote_info = remote_obj.checksum_to_path_info(
                        out.checksum
                    )
                    return remote_obj.open(
                        remote_info, mode=mode, encoding=encoding
                    )
                except RemoteActionNotImplemented:
                    # Remote cannot stream: fall through to pulling.
                    pass

            with self.repo.state:
                cache_info = out.get_used_cache(remote=remote)
                self.repo.cloud.pull(cache_info, remote=remote)
    finally:
        # Always restore the tree that was swapped out above.
        self.repo.tree = previous_tree

    return open(out.cache_path, mode=mode, encoding=encoding)
def __init__(
    self, dvc_dir=None, validate=True, tree=None,
):  # pylint: disable=super-init-not-called
    """Resolve the dvc directory, set up the trees and load the config.

    Args:
        dvc_dir: dvc directory; when falsy it is looked up through
            ``Repo.find_dvc_dir()`` and left as ``None`` outside a repo.
        validate: passed through to ``self.load``.
        tree: optional wrapper whose ``.tree`` attribute is used instead
            of the working tree.
    """
    from dvc.scm.tree import WorkingTree

    self.dvc_dir = dvc_dir

    if dvc_dir:
        self.dvc_dir = os.path.abspath(os.path.realpath(dvc_dir))
    else:
        try:
            from dvc.repo import Repo

            self.dvc_dir = os.path.join(Repo.find_dvc_dir())
        except NotDvcRepoError:
            self.dvc_dir = None

    self.wtree = WorkingTree(self.dvc_dir)
    self.tree = self.wtree if not tree else tree.tree

    self.load(validate=validate)
class Repo(object):
    """A DVC repository rooted at the directory that contains ``.dvc``.

    Most commands are imported into the class body from ``dvc.repo.*``.
    The class itself holds the repo's sub-components (config, scm, tree,
    lock, state, cache, cloud, metrics, tag) and the stage graph helpers.
    """

    # Name of the directory that marks a repo root (see ``find_root``).
    DVC_DIR = ".dvc"

    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.lock import lock as lock_stage
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.imp_url import imp_url
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import _checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import _fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.diff import diff
    from dvc.repo.brancher import brancher
    from dvc.repo.get import get
    from dvc.repo.get_url import get_url
    from dvc.repo.update import update

    def __init__(self, root_dir=None):
        """Locate the repo root and construct all repo sub-components.

        Args:
            root_dir: directory to start the repo-root lookup from; passed
                to ``find_root``.
        """
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag

        root_dir = self.find_root(root_dir)
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.scm = SCM(self.root_dir)

        # Goes through the ``tree`` setter below, which also calls _reset().
        self.tree = WorkingTree(self.root_dir)

        self.lock = Lock(
            os.path.join(self.dvc_dir, "lock"),
            tmp_dir=os.path.join(self.dvc_dir, "tmp"),
        )
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        # Apply the log level from the core config section, if one is set.
        core = self.config.config[Config.SECTION_CORE]

        level = core.get(Config.SECTION_CORE_LOGLEVEL)
        if level:
            logger.setLevel(level.upper())

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.metrics = Metrics(self)
        self.tag = Tag(self)

        self._ignore()

    @property
    def tree(self):
        """The tree object currently used by the repo."""
        return self._tree

    @tree.setter
    def tree(self, tree):
        """Replace the repo tree and drop everything cached from the old one."""
        self._tree = tree
        # Our graph cache is no longer valid, as it was based on the previous
        # tree.
        self._reset()

    def __repr__(self):
        return "Repo: '{root_dir}'".format(root_dir=self.root_dir)

    @classmethod
    def find_root(cls, root=None):
        """Return the nearest directory at or above ``root`` holding ``.dvc``.

        Args:
            root: directory to start from; defaults to the current working
                directory.

        Raises:
            NotDvcRepoError: a mount point is reached without finding a
                ``.dvc`` directory.
        """
        if root is None:
            root = os.getcwd()
        else:
            root = os.path.abspath(os.path.realpath(root))

        # Walk up one directory at a time until a repo root or a mount
        # point is hit.
        while True:
            dvc_dir = os.path.join(root, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root
            if os.path.ismount(root):
                break
            root = os.path.dirname(root)
        raise NotDvcRepoError(root)

    @classmethod
    def find_dvc_dir(cls, root=None):
        """Return the path of the ``.dvc`` directory of the repo found from ``root``."""
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        """Run ``dvc.repo.init.init`` and return a ``Repo`` for ``root_dir``."""
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force)
        return Repo(root_dir)

    def unprotect(self, target):
        """Unprotect ``target`` through the local cache."""
        return self.cache.local.unprotect(PathInfo(target))

    def _ignore(self):
        """Hand the repo's internal files to the SCM ignore list.

        Covers the local config file, the updater file, the state, lock
        and updater-lock files, plus the local cache directory when it
        lives inside the repo root.
        """
        from dvc.updater import Updater

        updater = Updater(self.dvc_dir)

        flist = ([self.config.config_local_file, updater.updater_file] +
                 self.state.files + self.lock.files + updater.lock.files)

        # Only a cache directory inside the repo root is ignored.
        if self.cache.local.cache_dir.startswith(self.root_dir + os.sep):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_modified_graph(self, new_stages):
        """Generate graph including the new stage to check for errors"""
        self._collect_graph(self.stages + new_stages)

    def collect(self, target, with_deps=False, recursive=False, graph=None):
        """Collect the stages selected by ``target``.

        Args:
            target: stage file path, or a directory when ``recursive`` is
                set; when falsy, all stages of the graph are returned.
            with_deps: also return the stages reachable from the target
                stage in its pipeline, in DFS post-order.
            recursive: when ``target`` is a directory, return every stage
                whose path lies under it.
            graph: graph to use instead of ``self.graph``.

        Returns:
            A list of stages.
        """
        import networkx as nx
        from dvc.stage import Stage

        G = graph or self.graph

        if not target:
            return get_stages(G)

        target = os.path.abspath(target)

        if recursive and os.path.isdir(target):
            attrs = nx.get_node_attributes(G, "stage")
            nodes = [node for node in nx.dfs_postorder_nodes(G)]

            ret = []
            for node in nodes:
                stage = attrs[node]
                if stage.path.startswith(target + os.sep):
                    ret.append(stage)
            return ret

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        node = relpath(stage.path, self.root_dir)
        pipeline = get_pipeline(get_pipelines(G), node)

        return [
            pipeline.node[n]["stage"]
            for n in nx.dfs_postorder_nodes(pipeline, node)
        ]

    def used_cache(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches` or `all_tags` to expand scope.

        Returns:
            A `NamedCache` updated with the used cache of every collected
            output; repo-import stages are recorded in its `external`
            mapping instead.
        """
        from dvc.cache import NamedCache

        cache = NamedCache()

        for branch in self.brancher(
            all_branches=all_branches,
            all_tags=all_tags,
            all_commits=all_commits,
        ):
            if targets:
                stages = []
                for target in targets:
                    collected = self.collect(target, recursive=recursive,
                                             with_deps=with_deps)
                    stages.extend(collected)
            else:
                stages = self.stages

            for stage in stages:
                if stage.is_repo_import:
                    # Unpacking requires the stage to have exactly one dep.
                    dep, = stage.deps
                    cache.external[dep.repo_pair].add(dep.def_path)
                    continue

                for out in stage.outs:
                    used_cache = out.get_used_cache(remote=remote,
                                                    force=force, jobs=jobs)
                    suffix = "({})".format(branch) if branch else ""
                    cache.update(used_cache, suffix=suffix)

        return cache

    def _collect_graph(self, stages=None):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, collect
                stages in the repository.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        stages = stages or self.collect_stages()
        stages = [stage for stage in stages if stage]
        outs = {}

        # Index every output by path; the same path twice is an error.
        for stage in stages:
            for out in stage.outs:
                if out.path_info in outs:
                    stages = [stage.relpath, outs[out.path_info].stage.relpath]
                    raise OutputDuplicationError(str(out), stages)
                outs[out.path_info] = out

        # No output may live inside another output.
        for stage in stages:
            for out in stage.outs:
                for p in out.path_info.parents:
                    if p in outs:
                        raise OverlappingOutputPathsError(outs[p], out)

        # No stage file may be an output or live inside one.
        for stage in stages:
            stage_path_info = PathInfo(stage.path)
            for p in chain([stage_path_info], stage_path_info.parents):
                if p in outs:
                    raise StagePathAsOutputError(stage.wdir, stage.relpath)

        for stage in stages:
            node = relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)

            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                # Link to every stage whose output equals, contains or is
                # contained in this dependency.
                for out in outs:
                    if (out == dep.path_info or dep.path_info.isin(out)
                            or out.isin(dep.path_info)):
                        dep_stage = outs[out].stage
                        dep_node = relpath(dep_stage.path, self.root_dir)
                        G.add_node(dep_node, stage=dep_stage)
                        G.add_edge(node, dep_node)

        check_acyclic(G)

        return G

    @cached_property
    def graph(self):
        """Stage graph of the repository (cached until ``_reset``)."""
        return self._collect_graph()

    @cached_property
    def pipelines(self):
        """Pipelines derived from ``self.graph`` (cached until ``_reset``)."""
        return get_pipelines(self.graph)

    @staticmethod
    def _filter_out_dirs(dirs, outs, root_dir):
        """Return the names in ``dirs`` whose path under ``root_dir`` is not in ``outs``."""
        def filter_dirs(dname):
            path = os.path.join(root_dir, dname)
            # normpath is applied to each entry of ``outs`` before comparing.
            for out in outs:
                if path == os.path.normpath(out):
                    return False
            return True

        return list(filter(filter_dirs, dirs))

    def collect_stages(self):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related with
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        from dvc.stage import Stage

        stages = []
        outs = []

        for root, dirs, files in self.tree.walk(self.root_dir,
                                                dvcignore=self.dvcignore):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_valid_filename(path):
                    continue
                stage = Stage.load(self, path)
                for out in stage.outs:
                    if out.scheme == "local":
                        outs.append(out.fspath + out.sep)
                stages.append(stage)

            # In-place update of ``dirs``: directories that are local
            # outputs are dropped from the list being walked.
            dirs[:] = self._filter_out_dirs(dirs, outs, root)

        return stages

    @cached_property
    def stages(self):
        """Stages taken from ``self.graph`` (cached until ``_reset``)."""
        return get_stages(self.graph)

    def find_outs_by_path(self, path, outs=None, recursive=False):
        """Return the outputs matching ``path``.

        Args:
            path: path to match; made absolute before comparing.
            outs: outputs to search; defaults to all outputs of all stages.
            recursive: when ``path`` is a directory, also match outputs
                located inside it.

        Raises:
            OutputNotFoundError: no output matches.
        """
        if not outs:
            outs = [out for stage in self.stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        is_dir = self.tree.isdir(abs_path)

        def func(out):
            if out.scheme == "local" and out.fspath == abs_path:
                return True

            if is_dir and recursive and out.path_info.isin(abs_path):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path)

        return matched

    def find_out_by_relpath(self, relpath):
        """Return the single output at ``relpath`` (relative to the repo root)."""
        path = os.path.join(self.root_dir, relpath)
        # Unpacking requires exactly one matching output.
        out, = self.find_outs_by_path(path)
        return out

    def is_dvc_internal(self, path):
        """Return True when any component of ``path`` equals ``DVC_DIR``."""
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    @contextmanager
    def open(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor

        A cached output is opened through ``_open_cached``; otherwise the
        path is opened from the repo tree when it exists there.

        Raises:
            FileMissingError: the cached open raised ``FileNotFoundError``,
                or the path is neither a cached output nor in the tree.
        """
        cause = None
        try:
            out, = self.find_outs_by_path(path)
        except OutputNotFoundError as e:
            out = None
            cause = e

        if out and out.use_cache:
            try:
                with self._open_cached(out, remote, mode, encoding) as fd:
                    yield fd
                return
            except FileNotFoundError as e:
                raise FileMissingError(relpath(path, self.root_dir), cause=e)

        if self.tree.exists(path):
            with self.tree.open(path, mode, encoding) as fd:
                yield fd
            return

        raise FileMissingError(relpath(path, self.root_dir), cause=cause)

    def _open_cached(self, out, remote=None, mode="r", encoding=None):
        """Open the cache file of ``out``, going to the remote when needed.

        An existing local cache file is opened directly. Otherwise the
        file is opened on the remote; if the remote raises
        ``RemoteActionNotImplemented``, the cache is pulled and the local
        file opened.

        Raises:
            ValueError: ``out`` is a directory.
        """
        if out.isdir():
            raise ValueError("Can't open a dir")

        cache_file = self.cache.local.checksum_to_path_info(out.checksum)
        cache_file = fspath_py35(cache_file)

        if os.path.exists(cache_file):
            return _open(cache_file, mode=mode, encoding=encoding)

        try:
            remote_obj = self.cloud.get_remote(remote)
            remote_info = remote_obj.checksum_to_path_info(out.checksum)
            return remote_obj.open(remote_info, mode=mode, encoding=encoding)
        except RemoteActionNotImplemented:
            with self.state:
                cache_info = out.get_used_cache(remote=remote)
                self.cloud.pull(cache_info, remote=remote)

            return _open(cache_file, mode=mode, encoding=encoding)

    @cached_property
    def dvcignore(self):
        """Ignore filter for the repo root (cached until ``_reset``)."""
        return DvcIgnoreFilter(self.root_dir)

    def close(self):
        """Close the underlying SCM object."""
        self.scm.close()

    @locked
    def checkout(self, *args, **kwargs):
        """Run the imported ``_checkout`` under the ``locked`` decorator."""
        return self._checkout(*args, **kwargs)

    @locked
    def fetch(self, *args, **kwargs):
        """Run the imported ``_fetch`` under the ``locked`` decorator."""
        return self._fetch(*args, **kwargs)

    def _reset(self):
        """Drop the cached ``graph``, ``stages``, ``pipelines`` and ``dvcignore``."""
        # cached_property values live in the instance __dict__; popping
        # them forces a recompute on next access.
        self.__dict__.pop("graph", None)
        self.__dict__.pop("stages", None)
        self.__dict__.pop("pipelines", None)
        self.__dict__.pop("dvcignore", None)
class Repo(object):
    """A DVC repository rooted at a directory that contains `.dvc`.

    The constructor locates the repository root, loads its config and wires
    up the helper objects (tree, SCM, lock, state, cache, cloud, ...).
    """

    DVC_DIR = ".dvc"

    # Imported into the class namespace so that they are available as
    # attributes of `Repo` (e.g. `self.brancher(...)` in `used_cache`).
    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.lock import lock as lock_stage
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.diff import diff
    from dvc.repo.brancher import brancher

    def __init__(self, root_dir=None):
        """Locate the repository containing `root_dir` and set up helpers.

        Args:
            root_dir (str): directory to start the lookup from; passed to
                `find_root`, which falls back to the current working
                directory when it is None.

        Raises:
            NotDvcRepoError: propagated from `find_root` when no repository
                root is found.
        """
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag
        from dvc.repo.pkg import Pkg

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.tree = WorkingTree(self.root_dir)

        self.scm = SCM(self.root_dir, repo=self)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        # Apply the log level from the core config section, if one is set.
        level = core.get(Config.SECTION_CORE_LOGLEVEL)
        if level:
            logger.setLevel(level.upper())

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.metrics = Metrics(self)
        self.tag = Tag(self)
        self.pkg = Pkg(self)

        self._ignore()

    def __repr__(self):
        return "Repo: '{root_dir}'".format(root_dir=self.root_dir)

    @classmethod
    def find_root(cls, root=None):
        """Return the closest directory at or above `root` holding `.dvc`.

        Args:
            root (str): directory to start from; the current working
                directory is used when None.

        Raises:
            NotDvcRepoError: a mount point was reached without finding a
                `.dvc` directory.
        """
        if root is None:
            root = os.getcwd()
        else:
            root = os.path.abspath(os.path.realpath(root))

        while True:
            dvc_dir = os.path.join(root, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root
            if os.path.ismount(root):
                break
            root = os.path.dirname(root)
        raise NotDvcRepoError(root)

    @classmethod
    def find_dvc_dir(cls, root=None):
        """Return the path of the `.dvc` directory of the repo at `root`."""
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        """Run `dvc.repo.init.init` and return a `Repo` for `root_dir`."""
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force)
        return Repo(root_dir)

    def unprotect(self, target):
        """Delegate to the local cache's `unprotect` for `target`."""
        return self.cache.local.unprotect(PathInfo(target))

    def _ignore(self):
        """Pass DVC's internal files to the SCM's `ignore_list`."""
        from dvc.updater import Updater

        updater = Updater(self.dvc_dir)

        flist = [
            self.state.state_file,
            self.lock.lock_file,
            self.config.config_local_file,
            updater.updater_file,
            updater.lock.lock_file,
        ] + self.state.temp_files

        # Compare against the root *with* a trailing separator: a bare
        # `startswith(root_dir)` would also match a sibling directory that
        # merely shares the prefix (e.g. `/repo` vs `/repo-cache`).
        root_prefix = os.path.join(self.root_dir, "")
        if self.cache.local.cache_dir.startswith(root_prefix):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_dag(self, stages):
        """Generate graph including the new stage to check for errors"""
        self.graph(stages=stages)

    @staticmethod
    def _check_cyclic_graph(graph):
        """Raise CyclicGraphError for the first simple cycle in `graph`."""
        import networkx as nx
        from dvc.exceptions import CyclicGraphError

        cycles = list(nx.simple_cycles(graph))

        if cycles:
            raise CyclicGraphError(cycles[0])

    def _get_pipeline(self, node):
        """Return the single pipeline (sub-graph) that contains `node`."""
        pipelines = [i for i in self.pipelines() if i.has_node(node)]
        assert len(pipelines) == 1
        return pipelines[0]

    def collect(self, target, with_deps=False, recursive=False):
        """Return the stages selected by `target`.

        Args:
            target (str): path of a stage file, or of a directory when
                `recursive` is set. A falsy value selects the active stages
                of the whole repository.
            with_deps (bool): also return the stages reachable from the
                target stage in its pipeline, in DFS post-order.
            recursive (bool): treat a directory `target` as the place to
                collect active stages from.
        """
        import networkx as nx
        from dvc.stage import Stage

        if not target or (recursive and os.path.isdir(target)):
            return self.active_stages(target)

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        node = os.path.relpath(stage.path, self.root_dir)
        G = self._get_pipeline(node)

        return [G.node[n]["stage"] for n in nx.dfs_postorder_nodes(G, node)]

    def _collect_dir_cache(
        self, out, branch=None, remote=None, force=False, jobs=None
    ):
        """Get a list of `info`s related to the given directory.

        - Pull the directory entry from the remote cache if it was changed.

        Example:

            Given the following commands:

            $ echo "foo" > directory/foo
            $ echo "bar" > directory/bar
            $ dvc add directory

            It will return something similar to the following list:

            [
                { 'path': 'directory', 'md5': '168fd6761b9c.dir', ... },
                { 'path': 'directory/foo', 'md5': 'c157a79031e1', ... },
                { 'path': 'directory/bar', 'md5': 'd3b07384d113', ... },
            ]

        Raises:
            DvcException: the directory cache is still missing after the
                pull attempt, `force` is not set and the user declined the
                confirmation prompt.
        """
        info = out.dumpd()
        ret = [info]
        r = out.remote
        md5 = info[r.PARAM_CHECKSUM]

        if self.cache.local.changed_cache_file(md5):
            try:
                self.cloud.pull(
                    ret, jobs=jobs, remote=remote, show_checksums=False
                )
            except DvcException as exc:
                # Best effort: the second check below handles a failed pull.
                msg = "Failed to pull cache for '{}': {}"
                logger.debug(msg.format(out, exc))

        if self.cache.local.changed_cache_file(md5):
            msg = (
                "Missing cache for directory '{}'. "
                "Cache for files inside will be lost. "
                "Would you like to continue? Use '-f' to force. "
            )
            # The placeholder must be filled in before prompting, otherwise
            # the user is shown a literal '{}' instead of the directory.
            if not force and not prompt.confirm(msg.format(out)):
                raise DvcException(
                    "unable to fully collect used cache"
                    " without cache for directory '{}'".format(out)
                )
            else:
                return ret

        for i in out.dir_cache:
            i["branch"] = branch
            i[r.PARAM_PATH] = os.path.join(
                info[r.PARAM_PATH], i[r.PARAM_RELPATH]
            )
            ret.append(i)

        return ret

    def _collect_used_cache(
        self, out, branch=None, remote=None, force=False, jobs=None
    ):
        """Get a dumpd of the given `out`, with an entry including the branch.

        The `used_cache` of an output is no more than its `info`.

        In case that the given output is a directory, it will also
        include the `info` of its files.

        Returns:
            list: empty when the output is not cached or has no `info`;
            otherwise the output's dumpd, followed by the entries of its
            files for local directory outputs.
        """
        if not out.use_cache or not out.info:
            if not out.info:
                logger.warning(
                    "Output '{}'({}) is missing version "
                    "info. Cache for it will not be collected. "
                    "Use dvc repro to get your pipeline up to "
                    "date.".format(out, out.stage)
                )
            return []

        info = out.dumpd()
        info["branch"] = branch
        ret = [info]

        if out.scheme != "local":
            return ret

        if not out.is_dir_checksum:
            return ret

        return self._collect_dir_cache(
            out, branch=branch, remote=remote, force=force, jobs=jobs
        )

    def used_cache(
        self,
        target=None,
        all_branches=False,
        active=True,
        with_deps=False,
        all_tags=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches` or `all_tags` to expand scope.

        Returns:
            A dictionary with Schemes (representing output's location) as keys,
            and a list with the outputs' `dumpd` as values. The keys "local",
            "s3", "gs", "hdfs", "ssh" and "azure" are always present; any
            other scheme met on an output is added on demand.
        """

        cache = {}
        cache["local"] = []
        cache["s3"] = []
        cache["gs"] = []
        cache["hdfs"] = []
        cache["ssh"] = []
        cache["azure"] = []

        for branch in self.brancher(
            all_branches=all_branches, all_tags=all_tags
        ):
            if target:
                if recursive and os.path.isdir(target):
                    stages = self.stages(target)
                else:
                    stages = self.collect(target, with_deps=with_deps)
            elif active:
                stages = self.active_stages()
            else:
                stages = self.stages()

            for stage in stages:
                if active and not target and stage.locked:
                    logger.warning(
                        "DVC-file '{path}' is locked. Its dependencies are"
                        " not going to be pushed/pulled/fetched.".format(
                            path=stage.relpath
                        )
                    )

                for out in stage.outs:
                    scheme = out.path_info.scheme
                    # `setdefault` avoids a KeyError for a scheme that is
                    # not among the pre-populated keys above.
                    cache.setdefault(scheme, []).extend(
                        self._collect_used_cache(
                            out,
                            branch=branch,
                            remote=remote,
                            force=force,
                            jobs=jobs,
                        )
                    )

        return cache

    def graph(self, stages=None, from_directory=None):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, use the ones
                on the `from_directory`.

            from_directory (str): directory where to look at for stages, if
                None is given, use the current working directory

        Returns:
            tuple: `(G, G_active)`; `G_active` has the same nodes for the
            given stages but omits the dependency edges of locked stages.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = stages or self.stages(from_directory, check_dag=False)
        stages = [stage for stage in stages if stage]
        outs = []

        # Validate outputs pairwise while accumulating them.
        for stage in stages:
            for out in stage.outs:
                existing = []
                for o in outs:
                    if o.path_info == out.path_info:
                        existing.append(o.stage)

                    in_o_dir = out.path_info.isin(o.path_info)
                    in_out_dir = o.path_info.isin(out.path_info)
                    if in_o_dir or in_out_dir:
                        raise OverlappingOutputPathsError(o, out)

                if existing:
                    # Separate name: do not rebind `stages` while the outer
                    # loop is iterating over it.
                    dup_stages = [stage.relpath, existing[0].relpath]
                    raise OutputDuplicationError(str(out), dup_stages)

                outs.append(out)

        # A stage file must not live inside any output directory.
        for stage in stages:
            stage_path_info = PathInfo(stage.path)
            for out in outs:
                if stage_path_info.isin(out.path_info):
                    raise StagePathAsOutputError(stage.wdir, stage.relpath)

        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)
            G_active.add_node(node, stage=stage)

            for dep in stage.deps:
                for out in outs:
                    # Skip outputs that neither equal, contain, nor are
                    # contained in this dependency.
                    if (
                        out.path_info != dep.path_info
                        and not dep.path_info.isin(out.path_info)
                        and not out.path_info.isin(dep.path_info)
                    ):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)
                    if not stage.locked:
                        G_active.add_node(dep_node, stage=dep_stage)
                        G_active.add_edge(node, dep_node)

        self._check_cyclic_graph(G)

        return G, G_active

    def pipelines(self, from_directory=None):
        """Return a copy of each weakly connected component of the graph."""
        import networkx as nx

        G, _ = self.graph(from_directory=from_directory)

        return [
            G.subgraph(c).copy() for c in nx.weakly_connected_components(G)
        ]

    def stages(self, from_directory=None, check_dag=True):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related with
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.

        Args:
            from_directory (str): directory to walk; defaults to the
                repository root.
            check_dag (bool): build the graph from the collected stages so
                that graph errors are raised here.

        Raises:
            TargetNotDirectoryError: `from_directory` is not a directory.
        """
        from dvc.stage import Stage

        if not from_directory:
            from_directory = self.root_dir
        elif not os.path.isdir(from_directory):
            raise TargetNotDirectoryError(from_directory)

        stages = []
        outs = []

        ignore_file_handler = DvcIgnoreFileHandler(self.tree)
        for root, dirs, files in self.tree.walk(
            from_directory, ignore_file_handler=ignore_file_handler
        ):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_valid_filename(path):
                    continue
                stage = Stage.load(self, path)
                for out in stage.outs:
                    if out.scheme == "local":
                        outs.append(out.fspath + out.sep)
                stages.append(stage)

            # `root=root` binds the current directory at definition time.
            def filter_dirs(dname, root=root):
                path = os.path.join(root, dname)
                if path in (self.dvc_dir, self.scm.dir):
                    return False
                for out in outs:
                    if path == os.path.normpath(out) or path.startswith(out):
                        return False
                return True

            # In-place assignment so the pruned list is what the walk sees.
            dirs[:] = list(filter(filter_dirs, dirs))

        if check_dag:
            self.check_dag(stages)

        return stages

    def active_stages(self, from_directory=None):
        """Return the stages stored on the nodes of every pipeline."""
        import networkx as nx

        stages = []
        for G in self.pipelines(from_directory):
            stages.extend(list(nx.get_node_attributes(G, "stage").values()))
        return stages

    def find_outs_by_path(self, path, outs=None, recursive=False):
        """Return the outputs matching `path`.

        Args:
            path (str): path to look up; made absolute before comparing.
            outs (list): outputs to search; when falsy, the outputs of all
                stages in the repository are used.
            recursive (bool): when `path` is a directory, also match
                outputs located inside it.

        Raises:
            OutputNotFoundError: no output matched.
        """
        if not outs:
            # there is no `from_directory=path` argument because some data
            # files might be generated to an upper level, and so it is
            # needed to look at all the files (project root_dir)
            stages = self.stages()
            outs = [out for stage in stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        is_dir = self.tree.isdir(abs_path)

        def func(out):
            if out.scheme == "local" and out.fspath == abs_path:
                return True

            if is_dir and recursive and out.path_info.isin(abs_path):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path)

        return matched

    def is_dvc_internal(self, path):
        """Tell whether any component of `path` is the `.dvc` directory."""
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    def open(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor"""
        # Exactly one output must match; unpacking fails otherwise.
        out, = self.find_outs_by_path(path)
        if out.isdir():
            raise ValueError("Can't open a dir")

        with self.state:
            cache_info = self._collect_used_cache(out, remote=remote)
            self.cloud.pull(cache_info, remote=remote)

        cache_file = self.cache.local.checksum_to_path_info(out.checksum)
        return _open(cache_file.fspath, mode=mode, encoding=encoding)
class Repo(object):
    """A DVC repository rooted at a directory that contains `.dvc`.

    The constructor locates the repository root, loads its config and wires
    up the helper objects (SCM, tree, lock, state, cache, cloud, ...).
    """

    DVC_DIR = ".dvc"

    # Imported into the class namespace so that they are available as
    # attributes of `Repo` (e.g. `self.brancher(...)`, `self._checkout(...)`).
    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.ls import ls
    from dvc.repo.lock import lock as lock_stage
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.imp_url import imp_url
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import _checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import _fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.diff import diff
    from dvc.repo.brancher import brancher
    from dvc.repo.get import get
    from dvc.repo.get_url import get_url
    from dvc.repo.update import update

    def __init__(self, root_dir=None):
        """Locate the repository containing `root_dir` and set up helpers.

        Args:
            root_dir (str): directory to start the lookup from; passed to
                `find_root`, which falls back to the current directory when
                it is falsy.

        Raises:
            NotDvcRepoError: propagated from `find_root` when no repository
                root is found.
        """
        from dvc.state import State
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag
        from dvc.utils.fs import makedirs

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        no_scm = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=no_scm)

        # Goes through the `tree` property setter below.
        self.tree = WorkingTree(self.root_dir)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        makedirs(self.tmp_dir, exist_ok=True)

        hardlink_lock = self.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.dvc_dir, "lock"),
            # Reuse the directory created above instead of re-deriving it.
            tmp_dir=self.tmp_dir,
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self)

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.metrics = Metrics(self)
        self.tag = Tag(self)

        self._ignore()

    @property
    def tree(self):
        """The tree currently used by the repository."""
        return self._tree

    @tree.setter
    def tree(self, tree):
        # Wrap anything that is not already a CleanTree.
        self._tree = tree if isinstance(tree, CleanTree) else CleanTree(tree)
        # Our graph cache is no longer valid, as it was based on the previous
        # tree.
        self._reset()

    def __repr__(self):
        return "{}: '{}'".format(self.__class__.__name__, self.root_dir)

    @classmethod
    def find_root(cls, root=None):
        """Return the closest directory at or above `root` holding `.dvc`.

        Args:
            root (str): directory to start from; the current directory is
                used when it is falsy.

        Raises:
            NotDvcRepoError: the resolved start path is not a directory, or
                a mount point was reached without finding a `.dvc`
                directory.
        """
        root_dir = os.path.realpath(root or os.curdir)

        if not os.path.isdir(root_dir):
            raise NotDvcRepoError("directory '{}' does not exist".format(root))

        while True:
            dvc_dir = os.path.join(root_dir, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root_dir
            if os.path.ismount(root_dir):
                break
            root_dir = os.path.dirname(root_dir)

        message = (
            "you are not inside of a DVC repository "
            "(checked up to mount point '{}')"
        ).format(root_dir)
        raise NotDvcRepoError(message)

    @classmethod
    def find_dvc_dir(cls, root=None):
        """Return the path of the `.dvc` directory of the repo at `root`."""
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
        """Run `dvc.repo.init.init` and return a `Repo` for `root_dir`."""
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force, subdir=subdir)
        return Repo(root_dir)

    def unprotect(self, target):
        """Delegate to the local cache's `unprotect` for `target`."""
        return self.cache.local.unprotect(PathInfo(target))

    def _ignore(self):
        """Pass DVC's internal files to the SCM's `ignore_list`."""
        from dvc.updater import Updater

        updater = Updater(self.dvc_dir)

        flist = (
            [self.config.files["local"], updater.updater_file]
            + [self.lock.lockfile, updater.lock.lockfile, self.tmp_dir]
            + self.state.files
        )

        # The cache directory is added only when it is inside the repo.
        if path_isin(self.cache.local.cache_dir, self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_modified_graph(self, new_stages):
        """Generate graph including the new stage to check for errors"""
        self._collect_graph(self.stages + new_stages)

    def collect(self, target, with_deps=False, recursive=False, graph=None):
        """Return the stages selected by `target`.

        Args:
            target (str): path of a stage file, or of a directory when
                `recursive` is set. A falsy value selects every node of the
                graph.
            with_deps (bool): also return the stages reachable from the
                target stage in its pipeline, in DFS post-order.
            recursive (bool): for a directory `target`, return the stages
                whose path is inside it.
            graph: graph to use instead of `self.graph`.
        """
        import networkx as nx
        from dvc.stage import Stage

        G = graph or self.graph

        if not target:
            return list(G)

        target = os.path.abspath(target)

        if recursive and os.path.isdir(target):
            stages = nx.dfs_postorder_nodes(G)
            return [stage for stage in stages if path_isin(stage.path, target)]

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        pipeline = get_pipeline(get_pipelines(G), stage)
        return list(nx.dfs_postorder_nodes(pipeline, stage))

    def collect_granular(self, target, *args, **kwargs):
        """Return `(stage, filter_info)` pairs selected by `target`.

        When `target` matches a single output (non-strict lookup), the pair
        is that output's stage plus the absolute target path as a
        `PathInfo`. Otherwise `target` is handed to `collect` and every
        resulting stage is paired with None.
        """
        if not target:
            return [(stage, None) for stage in self.stages]

        try:
            (out,) = self.find_outs_by_path(target, strict=False)
            filter_info = PathInfo(os.path.abspath(target))
            return [(out.stage, filter_info)]
        except OutputNotFoundError:
            stages = self.collect(target, *args, **kwargs)
            return [(stage, None) for stage in stages]

    def used_cache(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
        """Get the stages related to the given targets and collect
        the used cache of their outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches`/`all_tags`/`all_commits` to expand the scope.

        Returns:
            A `NamedCache` updated with the used cache of every collected
            stage; entries are added with a "(<branch>)" suffix when the
            brancher yields a non-empty branch name.
        """
        from dvc.cache import NamedCache

        cache = NamedCache()

        # Loop-invariant: a falsy `targets` means one pass with no target.
        targets = targets or [None]

        for branch in self.brancher(
            all_branches=all_branches,
            all_tags=all_tags,
            all_commits=all_commits,
        ):
            pairs = cat(
                self.collect_granular(
                    target, recursive=recursive, with_deps=with_deps
                )
                for target in targets
            )

            suffix = "({})".format(branch) if branch else ""
            for stage, filter_info in pairs:
                used_cache = stage.get_used_cache(
                    remote=remote,
                    force=force,
                    jobs=jobs,
                    filter_info=filter_info,
                )
                cache.update(used_cache, suffix=suffix)

        return cache

    def _collect_graph(self, stages=None):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage objects themselves.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, collect stages
                in the repository.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from pygtrie import Trie
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        stages = stages or self.stages
        stages = [stage for stage in stages if stage]
        outs = Trie()  # Use trie to efficiently find overlapping outs and deps

        for stage in stages:
            for out in stage.outs:
                out_key = out.path_info.parts

                # Check for dup outs
                if out_key in outs:
                    dup_stages = [stage, outs[out_key].stage]
                    raise OutputDuplicationError(str(out), dup_stages)

                # Check for overlapping outs
                if outs.has_subtrie(out_key):
                    # An already registered out lives below this one.
                    parent = out
                    overlapping = first(outs.values(prefix=out_key))
                else:
                    # Otherwise look for a registered out above this one.
                    parent = outs.shortest_prefix(out_key).value
                    overlapping = out
                if parent and overlapping:
                    msg = (
                        "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                        "overlap. To avoid unpredictable behaviour, "
                        "rerun command with non overlapping outs paths."
                    ).format(
                        str(parent),
                        parent.stage.relpath,
                        str(overlapping),
                        overlapping.stage.relpath,
                    )
                    raise OverlappingOutputPathsError(parent, overlapping, msg)

                outs[out_key] = out

        # A stage file must not live inside (or be) any output.
        for stage in stages:
            out = outs.shortest_prefix(PathInfo(stage.path).parts).value
            if out:
                raise StagePathAsOutputError(stage, str(out))

        # Building graph
        G.add_nodes_from(stages)
        for stage in stages:
            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                dep_key = dep.path_info.parts
                # Outs that are prefixes of the dep, plus the outs stored
                # under the dep's key.
                overlapping = [n.value for n in outs.prefixes(dep_key)]
                if outs.has_subtrie(dep_key):
                    overlapping.extend(outs.values(prefix=dep_key))

                G.add_edges_from((stage, out.stage) for out in overlapping)

        check_acyclic(G)

        return G

    @cached_property
    def graph(self):
        """Graph of all repository stages; cached until `_reset`."""
        return self._collect_graph()

    @cached_property
    def pipelines(self):
        """Result of `get_pipelines` on `graph`; cached until `_reset`."""
        return get_pipelines(self.graph)

    @cached_property
    def stages(self):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related with
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.

        NOTE(review): only the pruning of local output directories happens
        in this method; the SCM/`.dvc` skipping is presumably done by the
        tree's `walk` - confirm.
        """
        from dvc.stage import Stage

        stages = []
        outs = set()

        for root, dirs, files in self.tree.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_valid_filename(path):
                    continue
                stage = Stage.load(self, path)
                stages.append(stage)

                for out in stage.outs:
                    if out.scheme == "local":
                        outs.add(out.fspath)

            # In-place assignment so the pruned list is what the walk sees.
            dirs[:] = [d for d in dirs if os.path.join(root, d) not in outs]

        return stages

    def find_outs_by_path(self, path, outs=None, recursive=False, strict=True):
        """Return the outputs matching `path`.

        Args:
            path (str): path to look up; made absolute before comparing.
            outs (list): outputs to search; when falsy, the outputs of all
                stages in the repository are used.
            recursive (bool): also match outputs located inside `path`.
            strict (bool): when True a local output must equal `path`;
                when False `path_info.isin_or_eq` is used instead, so a
                path inside an output also matches.

        Raises:
            OutputNotFoundError: no output matched.
        """
        if not outs:
            outs = [out for stage in self.stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        path_info = PathInfo(abs_path)
        match = path_info.__eq__ if strict else path_info.isin_or_eq

        def func(out):
            if out.scheme == "local" and match(out.path_info):
                return True

            if recursive and out.path_info.isin(path_info):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path, self)

        return matched

    def find_out_by_relpath(self, relpath):
        """Return the single output at `relpath` (relative to the root)."""
        path = os.path.join(self.root_dir, relpath)
        # Exactly one output must match; unpacking fails otherwise.
        (out,) = self.find_outs_by_path(path)
        return out

    def is_dvc_internal(self, path):
        """Tell whether any component of `path` is the `.dvc` directory."""
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    @contextmanager
    def open_by_relpath(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor"""
        cause = None
        try:
            out = self.find_out_by_relpath(path)
        except OutputNotFoundError as exc:
            # Not a tracked output; fall through to the plain file below.
            out = None
            cause = exc

        if out and out.use_cache:
            try:
                with self._open_cached(out, remote, mode, encoding) as fd:
                    yield fd
                return
            except FileNotFoundError as exc:
                raise FileMissingError(path) from exc

        abs_path = os.path.join(self.root_dir, path)
        if os.path.exists(abs_path):
            with open(abs_path, mode=mode, encoding=encoding) as fd:
                yield fd
            return

        raise FileMissingError(path) from cause

    def _open_cached(self, out, remote=None, mode="r", encoding=None):
        """Open the cached file of `out`.

        The local cache file is opened when it exists. Otherwise the file
        is opened through the remote object; if that raises
        `RemoteActionNotImplemented`, the cache is pulled and the local
        cache file is opened.

        Raises:
            ValueError: `out` is a directory.
        """
        if out.isdir():
            raise ValueError("Can't open a dir")

        cache_file = self.cache.local.checksum_to_path_info(out.checksum)
        cache_file = fspath_py35(cache_file)

        if os.path.exists(cache_file):
            return open(cache_file, mode=mode, encoding=encoding)

        try:
            remote_obj = self.cloud.get_remote(remote)
            remote_info = remote_obj.checksum_to_path_info(out.checksum)
            return remote_obj.open(remote_info, mode=mode, encoding=encoding)
        except RemoteActionNotImplemented:
            with self.state:
                cache_info = out.get_used_cache(remote=remote)
                self.cloud.pull(cache_info, remote=remote)

            return open(cache_file, mode=mode, encoding=encoding)

    def close(self):
        """Close the SCM object."""
        self.scm.close()

    @locked
    def checkout(self, *args, **kwargs):
        return self._checkout(*args, **kwargs)

    @locked
    def fetch(self, *args, **kwargs):
        return self._fetch(*args, **kwargs)

    def _reset(self):
        """Drop cached property values stored in the instance `__dict__`."""
        self.__dict__.pop("graph", None)
        self.__dict__.pop("stages", None)
        self.__dict__.pop("pipelines", None)
        self.__dict__.pop("dvcignore", None)