def __init__(self, root_dir=None, scm=None, rev=None):
    """Initialise the repo and attach its helper objects.

    Args:
        root_dir: starting directory handed to ``find_root``.
        scm: optional SCM object. When truthy, the repo tree comes from
            ``scm.get_tree(rev)`` and ``StateNoop`` is used as state;
            otherwise a ``WorkingTree`` and a config-driven ``SCM`` are
            created.
        rev: revision passed to ``scm.get_tree``; only read when ``scm``
            is truthy.
    """
    from dvc.state import State, StateNoop
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.repo.plots import Plots
    from dvc.repo.params import Params
    from dvc.scm.tree import WorkingTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    if not scm:
        located_root = self.find_root(root_dir)
        self.root_dir = os.path.abspath(os.path.realpath(located_root))
        self.tree = WorkingTree(self.root_dir)
    else:
        # use GitTree instead of WorkingTree as default repo tree instance
        scm_tree = scm.get_tree(rev)
        self.root_dir = self.find_root(root_dir, scm_tree)
        self.scm = scm
        self.tree = scm_tree
        self.state = StateNoop()

    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
    self.config = Config(self.dvc_dir, tree=self.tree)

    if not scm:
        # No SCM was supplied: build one, honouring the `core.no_scm` option.
        scm_disabled = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=scm_disabled)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    use_hardlink_lock = self.config["core"].get("hardlink_lock", False)
    lock_path = os.path.join(self.tmp_dir, "lock")
    self.lock = make_lock(
        lock_path,
        tmp_dir=self.tmp_dir,
        hardlink_lock=use_hardlink_lock,
        friendly=True,
    )

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    if not scm:
        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self.cache.local)

    self.stage_cache = StageCache(self)

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)

    self._ignore()
def test_unhashable(tmp_dir, dvc, mocker, kwargs):
    """Check that save/restore skip hashing for the stage built from ``kwargs``.

    ``StageCache.save`` must return ``None`` and ``StageCache.restore`` must
    raise ``RunCacheNotFoundError``, and in neither case may
    ``dvc.stage.cache._get_stage_hash`` be invoked.

    ``kwargs`` are forwarded to ``create_stage``; presumably they come from a
    ``pytest.mark.parametrize`` decorator outside this block.
    """
    from dvc.stage import Stage, create_stage
    from dvc.stage.cache import RunCacheNotFoundError, StageCache

    cache = StageCache(dvc)
    stage = create_stage(Stage, path="stage.dvc", repo=dvc, **kwargs)
    get_stage_hash = mocker.patch("dvc.stage.cache._get_stage_hash")

    assert cache.save(stage) is None
    # `mock.not_called` is not part of the Mock API: attribute access just
    # creates a (truthy) child mock, so asserting on it can never fail.
    # `assert_not_called()` performs the real check.
    get_stage_hash.assert_not_called()

    with pytest.raises(RunCacheNotFoundError):
        cache.restore(stage)
    get_stage_hash.assert_not_called()
def test_always_changed(mocker):
    """Check that an ``always_changed`` stage is never hashed by the run-cache.

    For a ``Stage`` created with ``always_changed=True``, ``StageCache.save``
    must return ``None`` and ``StageCache.restore`` must raise
    ``RunCacheNotFoundError``, both without calling
    ``dvc.stage.cache._get_stage_hash``.
    """
    from dvc.repo import Repo
    from dvc.stage import Stage
    from dvc.stage.cache import RunCacheNotFoundError, StageCache

    repo = mocker.Mock(spec=Repo)
    cache = StageCache(repo)
    stage = Stage(repo, always_changed=True)
    get_stage_hash = mocker.patch("dvc.stage.cache._get_stage_hash")

    assert cache.save(stage) is None
    # `mock.not_called` is not part of the Mock API: attribute access just
    # creates a (truthy) child mock, so asserting on it can never fail.
    # `assert_not_called()` performs the real check.
    get_stage_hash.assert_not_called()

    with pytest.raises(RunCacheNotFoundError):
        cache.restore(stage)
    get_stage_hash.assert_not_called()
def __init__(self, root_dir=None):
    """Initialise the repo rooted at (or above) ``root_dir``.

    The root is resolved through ``find_root`` and made absolute, then the
    config, SCM, working tree, tmp/index directories, lock, state, cache,
    cloud, stage cache, metrics and params helpers are created, in that
    order. Finally ``_ignore`` is called.

    Args:
        root_dir: starting directory handed to ``find_root``.
    """
    from dvc.state import State
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.scm.tree import WorkingTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    located_root = self.find_root(root_dir)
    self.root_dir = os.path.abspath(os.path.realpath(located_root))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    scm_disabled = self.config["core"].get("no_scm", False)
    self.scm = SCM(self.root_dir, no_scm=scm_disabled)

    self.tree = WorkingTree(self.root_dir)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    use_hardlink_lock = self.config["core"].get("hardlink_lock", False)
    lock_path = os.path.join(self.tmp_dir, "lock")
    self.lock = make_lock(
        lock_path,
        tmp_dir=self.tmp_dir,
        hardlink_lock=use_hardlink_lock,
        friendly=True,
    )

    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self)

    self.cache = Cache(self)
    self.cloud = DataCloud(self)
    self.stage_cache = StageCache(self)

    self.metrics = Metrics(self)
    self.params = Params(self)

    self._ignore()
def __init__(
    self,
    root_dir=None,
    scm=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
):
    """Initialise the repo, optionally on top of an SCM revision.

    Args:
        root_dir: starting directory handed to ``find_root``.
        scm: optional SCM object; when truthy its ``get_tree(rev, ...)``
            result becomes the repo tree and no-op lock/state are used.
        rev: revision passed to ``scm.get_tree``.
        subrepos: stored as ``self.subrepos``.
        uninitialized: when true, a ``NotDvcRepoError`` raised while
            locating the repo is tolerated; the root is then taken from
            ``SCM(root_dir or os.curdir).root_dir`` and ``dvc_dir`` /
            ``tmp_dir`` are set to ``None``.

    Raises:
        NotDvcRepoError: no repo was found and ``uninitialized`` is false.
    """
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.lock import LockNoop, make_lock
    from dvc.repo.experiments import Experiments
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.scm import SCM
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop
    from dvc.tree.local import LocalTree
    from dvc.utils.fs import makedirs

    try:
        rev_tree = None
        if rev:
            rev_tree = scm.get_tree(rev)
        self.root_dir = self.find_root(root_dir, rev_tree)
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        makedirs(self.tmp_dir, exist_ok=True)
    except NotDvcRepoError:
        if not uninitialized:
            raise
        fallback_scm = SCM(root_dir or os.curdir)
        self.root_dir = fallback_scm.root_dir
        self.dvc_dir = None
        self.tmp_dir = None

    tree_options = {"use_dvcignore": True, "dvcignore_root": self.root_dir}
    if not scm:
        self.tree = LocalTree(self, {"url": self.root_dir}, **tree_options)
    else:
        self.tree = scm.get_tree(rev, **tree_options)

    self.config = Config(self.dvc_dir, tree=self.tree)
    self._scm = scm

    # used by RepoTree to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    # NOTE(review): SOURCE lost its indentation; the statements from
    # `self.stage_cache` to `self._ignore()` are taken to belong to the
    # "real lock" branch -- confirm against the original file.
    if not scm and self.dvc_dir:
        lock_path = os.path.join(self.tmp_dir, "lock")
        self.lock = make_lock(
            lock_path,
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self)
        self.stage_cache = StageCache(self)

        try:
            experiments = Experiments(self)
        except NotImplementedError:
            experiments = None
        self.experiments = experiments

        self._ignore()
    else:
        self.lock = LockNoop()
        self.state = StateNoop()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)
class Repo:
    """A DVC repository: its directories, config, tree, cache and stages.

    The functions imported in the class body (``add``, ``checkout``,
    ``push`` ...) become attributes of the class, so they are reachable as
    ``repo.add(...)`` etc.
    """

    # Name of the directory that marks a repo root.
    DVC_DIR = ".dvc"

    from dvc.repo.add import add
    from dvc.repo.brancher import brancher
    from dvc.repo.checkout import checkout
    from dvc.repo.commit import commit
    from dvc.repo.destroy import destroy
    from dvc.repo.diff import diff
    from dvc.repo.fetch import fetch
    from dvc.repo.freeze import freeze, unfreeze
    from dvc.repo.gc import gc
    from dvc.repo.get import get
    from dvc.repo.get_url import get_url
    from dvc.repo.imp import imp
    from dvc.repo.imp_url import imp_url
    from dvc.repo.install import install
    from dvc.repo.ls import ls
    from dvc.repo.move import move
    from dvc.repo.pull import pull
    from dvc.repo.push import push
    from dvc.repo.remove import remove
    from dvc.repo.reproduce import reproduce
    from dvc.repo.run import run
    from dvc.repo.status import status
    from dvc.repo.update import update

    def __init__(
        self,
        root_dir=None,
        scm=None,
        rev=None,
        subrepos=False,
        uninitialized=False,
    ):
        """Initialise the repo, optionally on top of an SCM revision.

        Args:
            root_dir: starting directory handed to ``find_root``.
            scm: optional SCM object; when truthy its tree is used and
                no-op lock/state objects are installed.
            rev: revision passed to ``scm.get_tree``.
            subrepos: stored as ``self.subrepos``.
            uninitialized: tolerate ``NotDvcRepoError`` while locating the
                repo; ``dvc_dir`` and ``tmp_dir`` are then ``None``.

        Raises:
            NotDvcRepoError: no repo found and ``uninitialized`` is false.
        """
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.lock import LockNoop, make_lock
        from dvc.repo.experiments import Experiments
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.repo.plots import Plots
        from dvc.scm import SCM
        from dvc.stage.cache import StageCache
        from dvc.state import State, StateNoop
        from dvc.tree.local import LocalTree
        from dvc.utils.fs import makedirs

        try:
            tree = scm.get_tree(rev) if rev else None
            self.root_dir = self.find_root(root_dir, tree)
            self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
            self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
            makedirs(self.tmp_dir, exist_ok=True)
        except NotDvcRepoError:
            if not uninitialized:
                raise
            # Not a DVC repo: fall back to the root reported by the SCM.
            self.root_dir = SCM(root_dir or os.curdir).root_dir
            self.dvc_dir = None
            self.tmp_dir = None

        tree_kwargs = dict(use_dvcignore=True, dvcignore_root=self.root_dir)
        if scm:
            self.tree = scm.get_tree(rev, **tree_kwargs)
        else:
            self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs)

        self.config = Config(self.dvc_dir, tree=self.tree)
        self._scm = scm

        # used by RepoTree to determine if it should traverse subrepos
        self.subrepos = subrepos

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        # NOTE(review): SOURCE lost its indentation; the statements from
        # `self.stage_cache` to `self._ignore()` are taken to belong to the
        # `else` branch -- confirm against the original file.
        if scm or not self.dvc_dir:
            self.lock = LockNoop()
            self.state = StateNoop()
        else:
            self.lock = make_lock(
                os.path.join(self.tmp_dir, "lock"),
                tmp_dir=self.tmp_dir,
                hardlink_lock=self.config["core"].get("hardlink_lock", False),
                friendly=True,
            )

            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self)
            self.stage_cache = StageCache(self)

            try:
                self.experiments = Experiments(self)
            except NotImplementedError:
                self.experiments = None

            self._ignore()

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)

    @cached_property
    def scm(self):
        """Return the SCM passed to ``__init__``, or build one from config.

        A new ``SCM`` is created for ``root_dir`` with the ``core.no_scm``
        config value when no SCM was supplied.
        """
        from dvc.scm import SCM

        no_scm = self.config["core"].get("no_scm", False)
        return self._scm if self._scm else SCM(self.root_dir, no_scm=no_scm)

    @property
    def tree(self):
        """Tree object currently used by the repo."""
        return self._tree

    @tree.setter
    def tree(self, tree):
        self._tree = tree
        # Our graph cache is no longer valid, as it was based on the previous
        # tree.
        self._reset()

    def __repr__(self):
        return f"{self.__class__.__name__}: '{self.root_dir}'"

    @classmethod
    def find_root(cls, root=None, tree=None):
        """Locate the repo root (the directory that contains ``DVC_DIR``).

        Args:
            root: directory to start from; defaults to ``os.curdir``.
            tree: optional tree object. When given, only ``root`` itself is
                checked (via ``tree.isdir``) and parents are not searched.

        Returns:
            The real path of the root directory.

        Raises:
            NotDvcRepoError: ``root`` does not exist, does not contain the
                DVC directory (tree case), or no parent up to the mount
                point contains it.
        """
        root_dir = os.path.realpath(root or os.curdir)

        if tree:
            if tree.isdir(os.path.join(root_dir, cls.DVC_DIR)):
                return root_dir
            raise NotDvcRepoError(f"'{root}' does not contain DVC directory")

        if not os.path.isdir(root_dir):
            raise NotDvcRepoError(f"directory '{root}' does not exist")

        # Walk up the directory hierarchy until a repo or a mount point.
        while True:
            dvc_dir = os.path.join(root_dir, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root_dir
            if os.path.ismount(root_dir):
                break
            root_dir = os.path.dirname(root_dir)

        message = ("you are not inside of a DVC repository "
                   "(checked up to mount point '{}')").format(root_dir)
        raise NotDvcRepoError(message)

    @classmethod
    def find_dvc_dir(cls, root=None):
        """Return the path of the ``DVC_DIR`` directory found from ``root``."""
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
        """Run ``dvc.repo.init.init`` and return a ``Repo`` for ``root_dir``."""
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force, subdir=subdir)
        return Repo(root_dir)

    def unprotect(self, target):
        """Delegate to the local cache tree's ``unprotect`` for ``target``."""
        return self.cache.local.tree.unprotect(PathInfo(target))

    def _ignore(self):
        """Pass repo-internal paths to ``scm.ignore_list``.

        The list holds the local config file, the tmp dir, the experiments
        dir (when experiments are available) and the local cache dir when
        it lives inside the repo root.
        """
        flist = [
            self.config.files["local"],
            self.tmp_dir,
        ]
        if self.experiments:
            flist.append(self.experiments.exp_dir)

        if path_isin(self.cache.local.cache_dir, self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def get_stage(self, path=None, name=None):
        """Return the stage ``name`` from the dvcfile at ``path``.

        ``path`` defaults to ``PIPELINE_FILE``.
        """
        if not path:
            path = PIPELINE_FILE
            logger.debug("Assuming '%s' to be a stage inside '%s'", name, path)

        dvcfile = Dvcfile(self, path)
        return dvcfile.stages[name]

    def get_stages(self, path=None, name=None):
        """Return a list of stages from the dvcfile at ``path``.

        With ``name`` the list contains only that stage; otherwise it
        contains every stage of the file. ``path`` defaults to
        ``PIPELINE_FILE``.
        """
        if not path:
            path = PIPELINE_FILE
            logger.debug("Assuming '%s' to be a stage inside '%s'", name, path)

        if name:
            return [self.get_stage(path, name)]

        dvcfile = Dvcfile(self, path)
        return list(dvcfile.stages.values())

    def check_modified_graph(self, new_stages):
        """Generate graph including the new stage to check for errors"""
        # Building graph might be costly for the ones with many DVC-files,
        # so we provide this undocumented hack to skip it. See [1] for
        # more details. The hack can be used as:
        #
        #     repo = Repo(...)
        #     repo._skip_graph_checks = True
        #     repo.add(...)
        #
        # A user should care about not duplicating outs and not adding cycles,
        # otherwise DVC might have an undefined behaviour.
        #
        # [1] https://github.com/iterative/dvc/issues/2671
        if not getattr(self, "_skip_graph_checks", False):
            self._collect_graph(self.stages + new_stages)

    def _collect_inside(self, path, graph):
        """Return the stages of ``graph`` whose path lies inside ``path``.

        Stages are visited in DFS post-order.
        """
        import networkx as nx

        stages = nx.dfs_postorder_nodes(graph)
        return [stage for stage in stages if path_isin(stage.path, path)]

    def collect(self, target=None, with_deps=False, recursive=False, graph=None):
        """Collect the stages that ``target`` refers to.

        Args:
            target: stage/file target; when falsy all stages are returned
                (taken from ``graph`` if one is given).
            with_deps: also include each stage's pipeline, as produced by
                ``_collect_pipeline``.
            recursive: when ``target`` is a directory, return the stages
                located inside it.
            graph: optional graph to use instead of ``self.graph``.
        """
        if not target:
            return list(graph) if graph else self.stages

        if recursive and os.path.isdir(target):
            return self._collect_inside(os.path.abspath(target), graph or self.graph)

        path, name = parse_target(target)
        stages = self.get_stages(path, name)
        if not with_deps:
            return stages

        res = set()
        for stage in stages:
            res.update(self._collect_pipeline(stage, graph=graph))
        return res

    def _collect_pipeline(self, stage, graph=None):
        """Return a DFS post-order iteration from ``stage`` over its pipeline."""
        import networkx as nx

        pipeline = get_pipeline(get_pipelines(graph or self.graph), stage)
        return nx.dfs_postorder_nodes(pipeline, stage)

    def _collect_from_default_dvcfile(self, target):
        """Look up stage ``target`` in ``PIPELINE_FILE``.

        Returns ``None`` (implicitly) when that file does not exist.
        """
        dvcfile = Dvcfile(self, PIPELINE_FILE)
        if dvcfile.exists():
            return dvcfile.stages.get(target)

    def collect_granular(self, target=None, with_deps=False, recursive=False, graph=None):
        """Collect ``(stage, filter_info)`` pairs for ``target``.

        Priority is in the order of following in case of ambiguity:
            - .dvc file or .yaml file
            - dir if recursive and directory exists
            - stage_name
            - output file

        ``filter_info`` is ``None`` except when ``target`` matched an
        output path, in which case it is the absolute ``PathInfo`` of
        ``target``.

        Raises:
            NoOutputOrStageError: ``target`` is neither a stage nor an
                output.
        """
        if not target:
            return [(stage, None) for stage in self.stages]

        file, name = parse_target(target)
        stages = []

        # Optimization: do not collect the graph for a specific target
        if not file:
            # parsing is ambiguous when it does not have a colon
            # or if it's not a dvcfile, as it can be a stage name
            # in `dvc.yaml` or, an output in a stage.
            logger.debug("Checking if stage '%s' is in '%s'", target, PIPELINE_FILE)
            if not (recursive and os.path.isdir(target)):
                stage = self._collect_from_default_dvcfile(target)
                if stage:
                    stages = (self._collect_pipeline(stage) if with_deps else [stage])
        elif not with_deps and is_valid_filename(file):
            stages = self.get_stages(file, name)

        if not stages:
            if not (recursive and os.path.isdir(target)):
                try:
                    (out, ) = self.find_outs_by_path(target, strict=False)
                    filter_info = PathInfo(os.path.abspath(target))
                    return [(out.stage, filter_info)]
                except OutputNotFoundError:
                    pass

            try:
                stages = self.collect(target, with_deps, recursive, graph)
            except StageFileDoesNotExistError as exc:
                # collect() might try to use `target` as a stage name
                # and throw error that dvc.yaml does not exist, whereas it
                # should say that both stage name and file does not exist.
                if file and is_valid_filename(file):
                    raise
                raise NoOutputOrStageError(target, exc.file) from exc
            except StageNotFound as exc:
                raise NoOutputOrStageError(target, exc.file) from exc

        return [(stage, None) for stage in stages]

    def used_cache(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
        used_run_cache=None,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches`/`all_tags`/`all_commits` to expand the scope.

        Returns:
            A dictionary with Schemes (representing output's location) mapped
            to items containing the output's `dumpd` names and the output's
            children (if the given output is a directory).
        """
        from dvc.cache import NamedCache

        cache = NamedCache()

        for branch in self.brancher(
            all_branches=all_branches,
            all_tags=all_tags,
            all_commits=all_commits,
        ):
            # `[None]` makes collect_granular() return every stage.
            targets = targets or [None]

            pairs = cat(
                self.collect_granular(
                    target, recursive=recursive, with_deps=with_deps)
                for target in targets)

            suffix = f"({branch})" if branch else ""
            for stage, filter_info in pairs:
                used_cache = stage.get_used_cache(
                    remote=remote,
                    force=force,
                    jobs=jobs,
                    filter_info=filter_info,
                )
                cache.update(used_cache, suffix=suffix)

        if used_run_cache:
            used_cache = self.stage_cache.get_used_cache(
                used_run_cache,
                remote=remote,
                force=force,
                jobs=jobs,
            )
            cache.update(used_cache)

        return cache

    def _collect_graph(self, stages):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, collect stages
                in the repository.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from pygtrie import Trie

        from dvc.exceptions import (
            OutputDuplicationError,
            OverlappingOutputPathsError,
            StagePathAsOutputError,
        )

        G = nx.DiGraph()
        stages = stages or self.stages
        outs = Trie()  # Use trie to efficiently find overlapping outs and deps

        for stage in filter(bool, stages):  # bug? not using it later
            for out in stage.outs:
                out_key = out.path_info.parts

                # Check for dup outs
                if out_key in outs:
                    dup_stages = [stage, outs[out_key].stage]
                    raise OutputDuplicationError(str(out), dup_stages)

                # Check for overlapping outs
                if outs.has_subtrie(out_key):
                    # An already registered out lives below this one.
                    parent = out
                    overlapping = first(outs.values(prefix=out_key))
                else:
                    # An already registered out may be a prefix of this one.
                    parent = outs.shortest_prefix(out_key).value
                    overlapping = out
                if parent and overlapping:
                    msg = ("Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                           "overlap. To avoid unpredictable behaviour, "
                           "rerun command with non overlapping outs paths."
                           ).format(
                               str(parent),
                               parent.stage.addressing,
                               str(overlapping),
                               overlapping.stage.addressing,
                           )
                    raise OverlappingOutputPathsError(parent, overlapping, msg)

                outs[out_key] = out

        # A stage file must not be located inside a registered output.
        for stage in stages:
            out = outs.shortest_prefix(PathInfo(stage.path).parts).value
            if out:
                raise StagePathAsOutputError(stage, str(out))

        # Building graph
        G.add_nodes_from(stages)
        for stage in stages:
            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                dep_key = dep.path_info.parts
                # Outs that are a prefix of the dep, plus outs below it.
                overlapping = [n.value for n in outs.prefixes(dep_key)]
                if outs.has_subtrie(dep_key):
                    overlapping.extend(outs.values(prefix=dep_key))

                G.add_edges_from((stage, out.stage) for out in overlapping)
        check_acyclic(G)

        return G

    @cached_property
    def graph(self):
        """Graph built by ``_collect_graph`` from ``self.stages``."""
        return self._collect_graph(self.stages)

    @cached_property
    def pipelines(self):
        """Result of ``get_pipelines`` applied to ``self.graph``."""
        return get_pipelines(self.graph)

    @cached_property
    def stages(self):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related with
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        return self._collect_stages()

    def _collect_stages(self):
        """Walk ``self.tree`` from the root and load every valid dvcfile.

        Directories that are local outputs of already collected stages are
        pruned from the walk.
        """
        stages = []
        outs = set()

        for root, dirs, files in self.tree.walk(self.root_dir):
            for file_name in filter(is_valid_filename, files):
                new_stages = self.get_stages(os.path.join(root, file_name))
                stages.extend(new_stages)
                outs.update(out.fspath for stage in new_stages for out in stage.outs if out.scheme == "local")
            # In-place edit so the walk does not descend into output dirs.
            dirs[:] = [d for d in dirs if os.path.join(root, d) not in outs]
        return stages

    def find_outs_by_path(self, path, outs=None, recursive=False, strict=True):
        """Return the outputs that match ``path``.

        Args:
            path: path to look for; made absolute before matching.
            outs: outputs to search; defaults to all outs of all stages.
            recursive: also match outputs located inside ``path``.
            strict: require an exact path match; otherwise ``path`` may
                also be inside an output (``isin_or_eq``).

        Raises:
            OutputNotFoundError: nothing matched.
        """
        if not outs:
            outs = [out for stage in self.stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        path_info = PathInfo(abs_path)
        match = path_info.__eq__ if strict else path_info.isin_or_eq

        def func(out):
            if out.scheme == "local" and match(out.path_info):
                return True

            if recursive and out.path_info.isin(path_info):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path, self)

        return matched

    def find_out_by_relpath(self, relpath):
        """Return the single output at ``relpath`` (relative to the root).

        The tuple unpacking raises ``ValueError`` if more than one output
        matches.
        """
        path = os.path.join(self.root_dir, relpath)
        (out, ) = self.find_outs_by_path(path)
        return out

    def is_dvc_internal(self, path):
        """Tell whether any component of ``path`` equals ``DVC_DIR``."""
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    @cached_property
    def repo_tree(self):
        """``RepoTree`` for this repo, created with ``fetch=True``."""
        return RepoTree(self, subrepos=self.subrepos, fetch=True)

    @contextmanager
    def open_by_relpath(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor"""

        tree = RepoTree(self, stream=True, subrepos=True)
        path = PathInfo(self.root_dir) / path
        try:
            with self.state:
                with tree.open(
                    path,
                    mode=mode,
                    encoding=encoding,
                    remote=remote,
                ) as fobj:
                    yield fobj
        except FileNotFoundError as exc:
            raise FileMissingError(path) from exc
        except IsADirectoryError as exc:
            raise DvcIsADirectoryError(f"'{path}' is a directory") from exc

    def close(self):
        """Close the underlying SCM object."""
        self.scm.close()

    def _reset(self):
        """Drop the ``graph``/``stages``/``pipelines`` entries of ``__dict__``.

        Presumably this makes the corresponding cached properties recompute
        on next access -- depends on the ``cached_property`` implementation
        imported by the file.
        """
        self.__dict__.pop("graph", None)
        self.__dict__.pop("stages", None)
        self.__dict__.pop("pipelines", None)
class Repo:
    """A DVC repository: its directories, config, tree, cache and stages.

    The functions imported in the class body (``add``, ``checkout``,
    ``push`` ...) become attributes of the class. ``ls``, ``get`` and
    ``get_url`` are wrapped in ``staticmethod``.
    """

    # Name of the directory that marks a repo root.
    DVC_DIR = ".dvc"

    from dvc.repo.add import add
    from dvc.repo.brancher import brancher
    from dvc.repo.checkout import checkout
    from dvc.repo.commit import commit
    from dvc.repo.destroy import destroy
    from dvc.repo.diff import diff
    from dvc.repo.fetch import fetch
    from dvc.repo.freeze import freeze, unfreeze
    from dvc.repo.gc import gc
    from dvc.repo.get import get as _get
    from dvc.repo.get_url import get_url as _get_url
    from dvc.repo.imp import imp
    from dvc.repo.imp_url import imp_url
    from dvc.repo.install import install
    from dvc.repo.ls import ls as _ls
    from dvc.repo.move import move
    from dvc.repo.pull import pull
    from dvc.repo.push import push
    from dvc.repo.remove import remove
    from dvc.repo.reproduce import reproduce
    from dvc.repo.run import run
    from dvc.repo.status import status
    from dvc.repo.update import update

    # Exposed as static methods, so no repo instance is bound when called.
    ls = staticmethod(_ls)
    get = staticmethod(_get)
    get_url = staticmethod(_get_url)

    def _get_repo_dirs(
        self,
        root_dir: str = None,
        scm: Base = None,
        rev: str = None,
        uninitialized: bool = False,
    ):
        """Resolve the ``(root_dir, dvc_dir, tmp_dir)`` triple for the repo.

        Args:
            root_dir: starting directory handed to ``find_root``.
            scm: optional SCM object; must be given together with ``rev``.
            rev: revision; its tree is only used when ``scm`` is a ``Git``.
            uninitialized: tolerate ``NotDvcRepoError``. The root then comes
                from an ``SCM`` instance (falling back to a ``no_scm`` one
                for ``os.curdir``) and ``dvc_dir``/``tmp_dir`` stay ``None``.

        Raises:
            NotDvcRepoError: no repo found and ``uninitialized`` is false.
        """
        assert bool(scm) == bool(rev)

        from dvc.scm import SCM
        from dvc.scm.git import Git
        from dvc.utils.fs import makedirs

        dvc_dir = None
        tmp_dir = None
        try:
            tree = scm.get_tree(rev) if isinstance(scm, Git) and rev else None
            root_dir = self.find_root(root_dir, tree)
            dvc_dir = os.path.join(root_dir, self.DVC_DIR)
            tmp_dir = os.path.join(dvc_dir, "tmp")
            makedirs(tmp_dir, exist_ok=True)
        except NotDvcRepoError:
            if not uninitialized:
                raise

            try:
                scm = SCM(root_dir or os.curdir)
            except (SCMError, InvalidGitRepositoryError):
                scm = SCM(os.curdir, no_scm=True)

            assert isinstance(scm, Base)
            root_dir = scm.root_dir

        return root_dir, dvc_dir, tmp_dir

    def __init__(
        self,
        root_dir=None,
        scm=None,
        rev=None,
        subrepos=False,
        uninitialized=False,
    ):
        """Initialise the repo, optionally on top of an SCM revision.

        Args:
            root_dir, scm, rev, uninitialized: forwarded to
                ``_get_repo_dirs``; ``scm``/``rev`` also select the tree.
            subrepos: stored as ``self.subrepos``.
        """
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.lock import LockNoop, make_lock
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.repo.plots import Plots
        from dvc.repo.stage import StageLoad
        from dvc.stage.cache import StageCache
        from dvc.state import State, StateNoop
        from dvc.tree.local import LocalTree

        self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
            root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized
        )

        tree_kwargs = {"use_dvcignore": True, "dvcignore_root": self.root_dir}
        if scm:
            self.tree = scm.get_tree(rev, **tree_kwargs)
        else:
            self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs)

        self.config = Config(self.dvc_dir, tree=self.tree)
        self._scm = scm

        # used by RepoTree to determine if it should traverse subrepos
        self.subrepos = subrepos

        self.cache = Cache(self)
        self.cloud = DataCloud(self)
        self.stage = StageLoad(self)

        # NOTE(review): SOURCE lost its indentation; `self.stage_cache` and
        # `self._ignore()` are taken to belong to the `else` branch --
        # confirm against the original file.
        if scm or not self.dvc_dir:
            self.lock = LockNoop()
            self.state = StateNoop()
        else:
            self.lock = make_lock(
                os.path.join(self.tmp_dir, "lock"),
                tmp_dir=self.tmp_dir,
                hardlink_lock=self.config["core"].get("hardlink_lock", False),
                friendly=True,
            )

            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self)
            self.stage_cache = StageCache(self)

            self._ignore()

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)
        # Passed as `onerror` to `stage.collect_repo` by the `stages` property.
        self.stage_collection_error_handler = None
        self._lock_depth = 0

    @cached_property
    def scm(self):
        """Return the SCM passed to ``__init__``, or build one from config.

        A new ``SCM`` is created for ``root_dir`` with the ``core.no_scm``
        config value when no SCM was supplied.
        """
        from dvc.scm import SCM

        no_scm = self.config["core"].get("no_scm", False)
        return self._scm if self._scm else SCM(self.root_dir, no_scm=no_scm)

    @cached_property
    def experiments(self):
        """``Experiments`` helper bound to this repo, imported lazily."""
        from dvc.repo.experiments import Experiments

        return Experiments(self)

    @property
    def tree(self) -> "BaseTree":
        """Tree object currently used by the repo."""
        return self._tree

    @tree.setter
    def tree(self, tree: "BaseTree"):
        self._tree = tree
        # Our graph cache is no longer valid, as it was based on the previous
        # tree.
        self._reset()

    def __repr__(self):
        return f"{self.__class__.__name__}: '{self.root_dir}'"

    @classmethod
    def find_root(cls, root=None, tree=None) -> str:
        """Locate the repo root (the directory that contains ``DVC_DIR``).

        Args:
            root: directory to start from; defaults to ``os.curdir``.
            tree: optional tree object. When given, only ``root`` itself is
                checked (via ``tree.isdir``) and parents are not searched.

        Returns:
            The real path of the root directory.

        Raises:
            NotDvcRepoError: ``root`` does not exist, does not contain the
                DVC directory (tree case), or no parent up to the mount
                point contains it.
        """
        root_dir = os.path.realpath(root or os.curdir)

        if tree:
            if tree.isdir(os.path.join(root_dir, cls.DVC_DIR)):
                return root_dir
            raise NotDvcRepoError(f"'{root}' does not contain DVC directory")

        if not os.path.isdir(root_dir):
            raise NotDvcRepoError(f"directory '{root}' does not exist")

        # Walk up the directory hierarchy until a repo or a mount point.
        while True:
            dvc_dir = os.path.join(root_dir, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root_dir
            if os.path.ismount(root_dir):
                break
            root_dir = os.path.dirname(root_dir)

        message = (
            "you are not inside of a DVC repository "
            "(checked up to mount point '{}')"
        ).format(root_dir)
        raise NotDvcRepoError(message)

    @classmethod
    def find_dvc_dir(cls, root=None):
        """Return the path of the ``DVC_DIR`` directory found from ``root``."""
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
        """Run ``dvc.repo.init.init`` and return a ``Repo`` for ``root_dir``."""
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force, subdir=subdir)
        return Repo(root_dir)

    def unprotect(self, target):
        """Delegate to the local cache tree's ``unprotect`` for ``target``."""
        return self.cache.local.tree.unprotect(PathInfo(target))

    def _ignore(self):
        """Pass repo-internal paths to ``scm.ignore_list``.

        The list holds the local config file, the tmp dir and the local
        cache dir when it lives inside the repo root.
        """
        flist = [
            self.config.files["local"],
            self.tmp_dir,
        ]

        if path_isin(self.cache.local.cache_dir, self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_modified_graph(self, new_stages):
        """Generate graph including the new stage to check for errors"""
        # Building graph might be costly for the ones with many DVC-files,
        # so we provide this undocumented hack to skip it. See [1] for
        # more details. The hack can be used as:
        #
        #     repo = Repo(...)
        #     repo._skip_graph_checks = True
        #     repo.add(...)
        #
        # A user should care about not duplicating outs and not adding cycles,
        # otherwise DVC might have an undefined behaviour.
        #
        # [1] https://github.com/iterative/dvc/issues/2671
        if not getattr(self, "_skip_graph_checks", False):
            build_graph(self.stages + new_stages)

    def used_cache(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
        used_run_cache=None,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches`/`all_tags`/`all_commits` to expand the scope.

        Returns:
            A dictionary with Schemes (representing output's location) mapped
            to items containing the output's `dumpd` names and the output's
            children (if the given output is a directory).
        """
        from dvc.cache import NamedCache

        cache = NamedCache()

        for branch in self.brancher(
            all_branches=all_branches,
            all_tags=all_tags,
            all_commits=all_commits,
        ):
            # NOTE(review): `[None]` presumably selects every stage in
            # `stage.collect_granular` -- confirm against StageLoad.
            targets = targets or [None]

            pairs = cat(
                self.stage.collect_granular(
                    target, recursive=recursive, with_deps=with_deps
                )
                for target in targets
            )

            suffix = f"({branch})" if branch else ""
            for stage, filter_info in pairs:
                used_cache = stage.get_used_cache(
                    remote=remote,
                    force=force,
                    jobs=jobs,
                    filter_info=filter_info,
                )
                cache.update(used_cache, suffix=suffix)

        if used_run_cache:
            used_cache = self.stage_cache.get_used_cache(
                used_run_cache,
                remote=remote,
                force=force,
                jobs=jobs,
            )
            cache.update(used_cache)

        return cache

    @cached_property
    def outs_trie(self):
        """Result of ``build_outs_trie`` applied to ``self.stages``."""
        return build_outs_trie(self.stages)

    @cached_property
    def graph(self):
        """Result of ``build_graph`` for ``self.stages`` and ``outs_trie``."""
        return build_graph(self.stages, self.outs_trie)

    @cached_property
    def outs_graph(self):
        """Result of ``build_outs_graph`` for ``graph`` and ``outs_trie``."""
        return build_outs_graph(self.graph, self.outs_trie)

    @cached_property
    def pipelines(self):
        """Result of ``get_pipelines`` applied to ``self.graph``."""
        return get_pipelines(self.graph)

    @cached_property
    def stages(self):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related with
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        error_handler = self.stage_collection_error_handler
        return self.stage.collect_repo(onerror=error_handler)

    def find_outs_by_path(self, path, outs=None, recursive=False, strict=True):
        """Return the outputs that match ``path``.

        Args:
            path: path to look for; made absolute before matching.
            outs: outputs to search; defaults to all outs of all stages.
            recursive: also match outputs located inside ``path``.
            strict: require an exact path match; otherwise ``path`` may
                also be inside an output (``isin_or_eq``).

        Raises:
            OutputNotFoundError: nothing matched.
        """
        if not outs:
            outs = [out for stage in self.stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        path_info = PathInfo(abs_path)
        match = path_info.__eq__ if strict else path_info.isin_or_eq

        def func(out):
            if out.scheme == "local" and match(out.path_info):
                return True

            if recursive and out.path_info.isin(path_info):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path, self)

        return matched

    def find_out_by_relpath(self, relpath):
        """Return the single output at ``relpath`` (relative to the root).

        The tuple unpacking raises ``ValueError`` if more than one output
        matches.
        """
        path = os.path.join(self.root_dir, relpath)
        (out,) = self.find_outs_by_path(path)
        return out

    def is_dvc_internal(self, path):
        """Tell whether any component of ``path`` equals ``DVC_DIR``."""
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    @cached_property
    def repo_tree(self):
        """``RepoTree`` for this repo, created with ``fetch=True``."""
        return RepoTree(self, subrepos=self.subrepos, fetch=True)

    @contextmanager
    def open_by_relpath(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor"""

        tree = RepoTree(self, stream=True, subrepos=True)
        path = PathInfo(self.root_dir) / path
        try:
            with self.state:
                with tree.open(
                    path,
                    mode=mode,
                    encoding=encoding,
                    remote=remote,
                ) as fobj:
                    yield fobj
        except FileNotFoundError as exc:
            raise FileMissingError(path) from exc
        except IsADirectoryError as exc:
            raise DvcIsADirectoryError(f"'{path}' is a directory") from exc

    def close(self):
        """Close the underlying SCM object."""
        self.scm.close()

    def _reset(self):
        """Drop the stage/graph related entries of ``__dict__``.

        Presumably this makes the corresponding cached properties recompute
        on next access -- depends on the ``cached_property`` implementation
        imported by the file.
        """
        self.__dict__.pop("outs_trie", None)
        self.__dict__.pop("outs_graph", None)
        self.__dict__.pop("graph", None)
        self.__dict__.pop("stages", None)
        self.__dict__.pop("pipelines", None)
def __init__(
    self,
    root_dir=None,
    scm=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
):
    """Initialise the repo, optionally on top of an SCM revision.

    Args:
        root_dir, scm, rev, uninitialized: forwarded to
            ``_get_repo_dirs`` to resolve the root, DVC and tmp
            directories; ``scm``/``rev`` also select the repo tree.
        subrepos: stored as ``self.subrepos``.
    """
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.lock import LockNoop, make_lock
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.repo.stage import StageLoad
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop
    from dvc.tree.local import LocalTree

    repo_dirs = self._get_repo_dirs(
        root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized
    )
    self.root_dir, self.dvc_dir, self.tmp_dir = repo_dirs

    tree_options = dict(use_dvcignore=True, dvcignore_root=self.root_dir)
    if not scm:
        self.tree = LocalTree(self, {"url": self.root_dir}, **tree_options)
    else:
        self.tree = scm.get_tree(rev, **tree_options)

    self.config = Config(self.dvc_dir, tree=self.tree)
    self._scm = scm

    # used by RepoTree to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cache = Cache(self)
    self.cloud = DataCloud(self)
    self.stage = StageLoad(self)

    # NOTE(review): SOURCE lost its indentation; `self.stage_cache` and
    # `self._ignore()` are taken to belong to the "real lock" branch --
    # confirm against the original file.
    if not scm and self.dvc_dir:
        lock_path = os.path.join(self.tmp_dir, "lock")
        self.lock = make_lock(
            lock_path,
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self)
        self.stage_cache = StageCache(self)

        self._ignore()
    else:
        self.lock = LockNoop()
        self.state = StateNoop()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)
    self.stage_collection_error_handler = None
    self._lock_depth = 0
class Repo:
    """A DVC project: its directories, config, filesystem, SCM, lock and state.

    The command implementations (`add`, `checkout`, `push`, ...) are imported
    into the class namespace so they become methods; `ls`, `get` and `get_url`
    are exposed as static methods because they do not need an instance.
    """

    DVC_DIR = ".dvc"

    from dvc.repo.add import add
    from dvc.repo.checkout import checkout
    from dvc.repo.commit import commit
    from dvc.repo.destroy import destroy
    from dvc.repo.diff import diff
    from dvc.repo.fetch import fetch
    from dvc.repo.freeze import freeze, unfreeze
    from dvc.repo.gc import gc
    from dvc.repo.get import get as _get
    from dvc.repo.get_url import get_url as _get_url
    from dvc.repo.imp import imp
    from dvc.repo.imp_url import imp_url
    from dvc.repo.install import install
    from dvc.repo.ls import ls as _ls
    from dvc.repo.move import move
    from dvc.repo.pull import pull
    from dvc.repo.push import push
    from dvc.repo.remove import remove
    from dvc.repo.reproduce import reproduce
    from dvc.repo.run import run
    from dvc.repo.status import status
    from dvc.repo.update import update

    ls = staticmethod(_ls)
    get = staticmethod(_get)
    get_url = staticmethod(_get_url)

    def _get_repo_dirs(
        self,
        root_dir: str = None,
        scm: "Base" = None,
        rev: str = None,
        uninitialized: bool = False,
    ):
        """Resolve `(root_dir, dvc_dir, tmp_dir)` for this repo.

        When no DVC directory is found and `uninitialized` is true, the root
        falls back to the SCM root (or a no-SCM object rooted at the current
        directory) and `dvc_dir`/`tmp_dir` are returned as `None`.

        Raises:
            NotDvcRepoError: no DVC directory found and `uninitialized` is
                false.
        """
        assert bool(scm) == bool(rev)

        from dvc.scm import SCM
        from dvc.scm.base import SCMError
        from dvc.scm.git import Git
        from dvc.utils.fs import makedirs

        dvc_dir = None
        tmp_dir = None
        try:
            # Only a Git SCM with a revision gets a revision-bound filesystem.
            fs = scm.get_fs(rev) if isinstance(scm, Git) and rev else None
            root_dir = self.find_root(root_dir, fs)
            dvc_dir = os.path.join(root_dir, self.DVC_DIR)
            tmp_dir = os.path.join(dvc_dir, "tmp")
            makedirs(tmp_dir, exist_ok=True)
        except NotDvcRepoError:
            if not uninitialized:
                raise

            try:
                scm = SCM(root_dir or os.curdir)
            except SCMError:
                scm = SCM(os.curdir, no_scm=True)

            from dvc.scm import Base

            assert isinstance(scm, Base)
            root_dir = scm.root_dir

        return root_dir, dvc_dir, tmp_dir

    def __init__(
        self,
        root_dir=None,
        scm=None,
        rev=None,
        subrepos=False,
        uninitialized=False,
        config=None,
        url=None,
        repo_factory=None,
    ):
        """Set up the repo's directories, filesystem, config and services.

        Args:
            root_dir: starting directory for the repo lookup.
            scm: SCM object; when given (or created because `rev` is set) the
                filesystem comes from `scm.get_fs(rev)` and lock/state are
                no-op objects.
            rev: revision passed to `scm.get_fs`.
            subrepos: stored on the instance; used by RepoFileSystem.
            uninitialized: tolerate a missing DVC directory.
            config: forwarded to `Config`.
            url: stored on the instance; preferred by `__str__`.
            repo_factory: stored in `_fs_conf` and forwarded to
                `RepoFileSystem` by `repo_fs`.
        """
        from dvc.config import Config
        from dvc.data_cloud import DataCloud
        from dvc.fs.local import LocalFileSystem
        from dvc.lock import LockNoop, make_lock
        from dvc.machine import MachineManager
        from dvc.objects.db import ODBManager
        from dvc.repo.live import Live
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.repo.plots import Plots
        from dvc.repo.stage import StageLoad
        from dvc.scm import SCM
        from dvc.stage.cache import StageCache
        from dvc.state import State, StateNoop

        self.url = url
        self._fs_conf = {"repo_factory": repo_factory}

        if rev and not scm:
            scm = SCM(root_dir or os.curdir)

        self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
            root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized
        )

        # Assign `_fs` directly: the `fs` setter calls `_reset()`, which needs
        # attributes that do not exist yet.
        if scm:
            self._fs = scm.get_fs(rev)
        else:
            self._fs = LocalFileSystem(url=self.root_dir)

        self.config = Config(self.dvc_dir, fs=self.fs, config=config)
        self._uninitialized = uninitialized
        self._scm = scm

        # used by RepoFileSystem to determine if it should traverse subrepos
        self.subrepos = subrepos

        self.cloud = DataCloud(self)
        self.stage = StageLoad(self)

        if scm or not self.dvc_dir:
            self.lock = LockNoop()
            self.state = StateNoop()
            self.odb = ODBManager(self)
        else:
            self.lock = make_lock(
                os.path.join(self.tmp_dir, "lock"),
                tmp_dir=self.tmp_dir,
                hardlink_lock=self.config["core"].get("hardlink_lock", False),
                friendly=True,
            )

            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self.root_dir, self.tmp_dir, self.dvcignore)
            self.odb = ODBManager(self)

            self.stage_cache = StageCache(self)

            self._ignore()

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)
        self.live = Live(self)

        if self.tmp_dir and (
            self.config["feature"].get("machine", False)
            or env2bool("DVC_TEST")
        ):
            self.machine = MachineManager(self)
        else:
            self.machine = None

        self.stage_collection_error_handler: Optional[
            Callable[[str, Exception], None]
        ] = None
        self._lock_depth = 0

    def __str__(self):
        """Return the repo URL when set, otherwise its root directory."""
        return self.url or self.root_dir

    @cached_property
    def index(self):
        """`Index` built for this repo; dropped from the cache by `_reset`."""
        from dvc.repo.index import Index

        return Index(self)

    @staticmethod
    def open(url, *args, **kwargs):
        """Open `url` as a local `Repo`, else hand it to `external_repo`.

        A `None` url means the current working directory. An existing path
        that is not a DVC repo also falls through to `external_repo`.
        """
        if url is None:
            url = os.getcwd()

        if os.path.exists(url):
            try:
                return Repo(url, *args, **kwargs)
            except NotDvcRepoError:
                pass  # fallthrough to external_repo

        from dvc.external_repo import external_repo

        return external_repo(url, *args, **kwargs)

    @cached_property
    def scm(self):
        """SCM object: the one given at construction, or one created lazily.

        Raises:
            SCMError: creation failed and the repo was not opened with
                `uninitialized=True`.
        """
        from dvc.scm import SCM
        from dvc.scm.base import SCMError

        if self._scm:
            return self._scm

        no_scm = self.config["core"].get("no_scm", False)
        try:
            return SCM(self.root_dir, no_scm=no_scm)
        except SCMError:
            if self._uninitialized:
                # might not be a git/dvc repo at all
                # used in `params/metrics/plots/live` targets
                return SCM(self.root_dir, no_scm=True)
            raise

    @cached_property
    def dvcignore(self) -> DvcIgnoreFilter:
        """Ignore filter bound to the current fs and root directory."""
        return DvcIgnoreFilter(self.fs, self.root_dir)

    def get_rev(self):
        """Return the SCM revision for a local fs, else the fs's own `rev`."""
        from dvc.fs.local import LocalFileSystem

        assert self.scm
        if isinstance(self.fs, LocalFileSystem):
            return self.scm.get_rev()
        return self.fs.rev

    @cached_property
    def experiments(self):
        """`Experiments` helper for this repo, created on first access."""
        from dvc.repo.experiments import Experiments

        return Experiments(self)

    @property
    def fs(self) -> "BaseFileSystem":
        """Filesystem the repo currently reads from."""
        return self._fs

    @fs.setter
    def fs(self, fs: "BaseFileSystem"):
        self._fs = fs
        # Our graph cache is no longer valid, as it was based on the previous
        # fs.
        self._reset()

    def __repr__(self):
        return f"{self.__class__.__name__}: '{self.root_dir}'"

    @classmethod
    def find_root(cls, root=None, fs=None) -> str:
        """Locate the directory that contains the DVC directory.

        With `fs`, only the resolved `root` itself is checked on that
        filesystem. Without it, parent directories are walked on the local
        filesystem until a mount point is reached.

        Raises:
            NotDvcRepoError: no DVC directory was found, or the starting
                directory does not exist.
        """
        root_dir = os.path.realpath(root or os.curdir)
        # Without an explicit root the messages used to read "'None' ...";
        # show the resolved path instead.
        shown = root if root else root_dir

        if fs:
            if fs.isdir(os.path.join(root_dir, cls.DVC_DIR)):
                return root_dir
            raise NotDvcRepoError(f"'{shown}' does not contain DVC directory")

        if not os.path.isdir(root_dir):
            raise NotDvcRepoError(f"directory '{shown}' does not exist")

        while True:
            dvc_dir = os.path.join(root_dir, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root_dir
            if os.path.ismount(root_dir):
                break
            root_dir = os.path.dirname(root_dir)

        message = (
            "you are not inside of a DVC repository "
            "(checked up to mount point '{}')"
        ).format(root_dir)
        raise NotDvcRepoError(message)

    @classmethod
    def find_dvc_dir(cls, root=None):
        """Return the path of the DVC directory found via `find_root`."""
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
        """Delegate to `dvc.repo.init.init` and return its result."""
        from dvc.repo.init import init

        return init(
            root_dir=root_dir, no_scm=no_scm, force=force, subdir=subdir
        )

    def unprotect(self, target):
        """Unprotect `target` through the local object database."""
        return self.odb.local.unprotect(PathInfo(target))

    def _ignore(self):
        """Ask the SCM to ignore the local config, tmp dir and in-repo cache."""
        flist = [self.config.files["local"], self.tmp_dir]

        # The cache dir only needs ignoring when it lives inside the repo.
        if path_isin(self.odb.local.cache_dir, self.root_dir):
            flist += [self.odb.local.cache_dir]

        self.scm.ignore_list(flist)

    def brancher(self, *args, **kwargs):
        """Delegate to `dvc.repo.brancher.brancher` for this repo."""
        from dvc.repo.brancher import brancher

        return brancher(self, *args, **kwargs)

    def used_objs(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        all_experiments=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
        used_run_cache=None,
        revs=None,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches`/`all_tags`/`all_commits`/`all_experiments` to expand
        the scope.

        Returns:
            A dict mapping (remote) ODB instances to sets of objects that
            belong to each ODB. If the ODB instance is None, the objects
            are naive and do not belong to a specific remote ODB.
        """
        used = defaultdict(set)

        def _add_suffix(objs: Set["HashFile"], suffix: str) -> None:
            # Tag every named object (including nested ones yielded by
            # `iterobjs`) with the branch it was collected from.
            from itertools import chain

            from dvc.objects import iterobjs

            for obj in chain.from_iterable(map(iterobjs, objs)):
                if obj.name is not None:
                    obj.name += suffix

        for branch in self.brancher(
            revs=revs,
            all_branches=all_branches,
            all_tags=all_tags,
            all_commits=all_commits,
            all_experiments=all_experiments,
        ):
            for odb, objs in self.index.used_objs(
                targets,
                remote=remote,
                force=force,
                jobs=jobs,
                recursive=recursive,
                with_deps=with_deps,
            ).items():
                if branch:
                    _add_suffix(objs, f" ({branch})")
                used[odb].update(objs)

        if used_run_cache:
            for odb, objs in self.stage_cache.get_used_objs(
                used_run_cache, remote=remote, force=force, jobs=jobs
            ).items():
                used[odb].update(objs)

        return used

    @property
    def stages(self):  # obsolete, only for backward-compatibility
        return self.index.stages

    def find_outs_by_path(self, path, outs=None, recursive=False, strict=True):
        """Return the outputs matching `path`.

        Args:
            path: path to look up; made absolute before matching.
            outs: candidate outputs; a falsy value means the index's
                `outs_graph`.
            recursive: also match outputs located inside `path`.
            strict: match on equality only; otherwise `isin_or_eq` is used.

        Raises:
            OutputNotFoundError: nothing matched.
        """
        # using `outs_graph` to ensure graph checks are run
        outs = outs or self.index.outs_graph

        abs_path = os.path.abspath(path)
        path_info = PathInfo(abs_path)
        match = path_info.__eq__ if strict else path_info.isin_or_eq

        def func(out):
            if out.scheme == "local" and match(out.path_info):
                return True

            if recursive and out.path_info.isin(path_info):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path, self)

        return matched

    def is_dvc_internal(self, path):
        """Tell whether any component of `path` is the DVC directory name."""
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    @cached_property
    def dvcfs(self):
        """`DvcFileSystem` bound to this repo, created on first access."""
        from dvc.fs.dvc import DvcFileSystem

        return DvcFileSystem(repo=self)

    @cached_property
    def repo_fs(self):
        """`RepoFileSystem` built with this repo's `subrepos` and `_fs_conf`."""
        from dvc.fs.repo import RepoFileSystem

        return RepoFileSystem(self, subrepos=self.subrepos, **self._fs_conf)

    @contextmanager
    def open_by_relpath(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor

        `path` is joined onto the repo root and opened through a
        `RepoFileSystem` with `subrepos=True`.

        Raises:
            FileMissingError: the underlying open raised FileNotFoundError.
            DvcIsADirectoryError: the underlying open raised
                IsADirectoryError.
        """
        from dvc.fs.repo import RepoFileSystem

        fs = RepoFileSystem(self, subrepos=True)
        path = PathInfo(self.root_dir) / path
        try:
            with fs.open(
                path, mode=mode, encoding=encoding, remote=remote
            ) as fobj:
                yield fobj
        except FileNotFoundError as exc:
            raise FileMissingError(path) from exc
        except IsADirectoryError as exc:
            raise DvcIsADirectoryError(f"'{path}' is a directory") from exc

    def close(self):
        """Close the SCM and the state; the state is closed even on error."""
        try:
            self.scm.close()
        finally:
            # Must run even if obtaining or closing the SCM raised, otherwise
            # the state handle stays open.
            self.state.close()

    def _reset(self):
        """Close the state, reset the SCM and drop cached index/dvcignore."""
        self.state.close()
        self.scm._reset()  # pylint: disable=protected-access
        self.__dict__.pop("index", None)
        self.__dict__.pop("dvcignore", None)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._reset()
        self.scm.close()
def __init__(
    self,
    root_dir=None,
    scm=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
    config=None,
    url=None,
    repo_factory=None,
):
    """Set up the repo's directories, filesystem, config and services.

    Args:
        root_dir: starting directory for the repo lookup.
        scm: SCM object; when given (or created because `rev` is set) the
            filesystem comes from `scm.get_fs(rev)` and lock/state are
            no-op objects.
        rev: revision passed to `scm.get_fs`.
        subrepos: stored on the instance; used by RepoFileSystem.
        uninitialized: forwarded to `_get_repo_dirs` and stored on the
            instance.
        config: forwarded to `Config`.
        url: stored on the instance.
        repo_factory: stored in `_fs_conf`.
    """
    from dvc.config import Config
    from dvc.data_cloud import DataCloud
    from dvc.fs.local import LocalFileSystem
    from dvc.lock import LockNoop, make_lock
    from dvc.machine import MachineManager
    from dvc.objects.db import ODBManager
    from dvc.repo.live import Live
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.repo.stage import StageLoad
    from dvc.scm import SCM
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop

    self.url = url
    self._fs_conf = {"repo_factory": repo_factory}

    # A revision needs an SCM to resolve it; create one if none was given.
    if rev and not scm:
        scm = SCM(root_dir or os.curdir)

    self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
        root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized
    )

    # With an SCM the repo reads through the SCM-provided filesystem,
    # otherwise through a local filesystem rooted at the repo.
    if scm:
        self._fs = scm.get_fs(rev)
    else:
        self._fs = LocalFileSystem(url=self.root_dir)

    self.config = Config(self.dvc_dir, fs=self.fs, config=config)
    self._uninitialized = uninitialized
    self._scm = scm

    # used by RepoFileSystem to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cloud = DataCloud(self)
    self.stage = StageLoad(self)

    # SCM-backed or uninitialized repos get no-op lock/state and skip the
    # run-cache and SCM-ignore setup done in the other branch.
    if scm or not self.dvc_dir:
        self.lock = LockNoop()
        self.state = StateNoop()
        self.odb = ODBManager(self)
    else:
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self.root_dir, self.tmp_dir, self.dvcignore)
        self.odb = ODBManager(self)

        self.stage_cache = StageCache(self)

        self._ignore()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)
    self.live = Live(self)

    # The machine manager exists only with a tmp dir and when either the
    # `feature.machine` config flag or the DVC_TEST env flag is set.
    if self.tmp_dir and (
        self.config["feature"].get("machine", False)
        or env2bool("DVC_TEST")
    ):
        self.machine = MachineManager(self)
    else:
        self.machine = None

    self.stage_collection_error_handler: Optional[
        Callable[[str, Exception], None]
    ] = None
    self._lock_depth = 0
def __init__(self, root_dir=None, scm=None, rev=None):
    """Set up the repo's tree, config, SCM, lock, cache and services.

    Args:
        root_dir: starting directory for the repo lookup.
        scm: SCM object; when given, the tree comes from `scm.get_tree(rev)`
            and the state is a no-op object.
        rev: revision passed to `scm.get_tree`.
    """
    from dvc.state import State, StateNoop
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.experiments import Experiments
    from dvc.repo.metrics import Metrics
    from dvc.repo.plots import Plots
    from dvc.repo.params import Params
    from dvc.tree.local import LocalRemoteTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    if scm:
        # A plain tree is needed first to locate the root; the final tree
        # is then requested again with dvcignore rooted there.
        tree = scm.get_tree(rev)
        self.root_dir = self.find_root(root_dir, tree)
        self.scm = scm
        self.tree = scm.get_tree(rev, use_dvcignore=True, dvcignore_root=self.root_dir)
        self.state = StateNoop()
    else:
        root_dir = self.find_root(root_dir)
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.tree = LocalRemoteTree(
            self,
            {"url": self.root_dir},
            use_dvcignore=True,
            dvcignore_root=self.root_dir,
        )

    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
    self.config = Config(self.dvc_dir, tree=self.tree)

    # Without a caller-supplied SCM, create one; this has to wait for the
    # config because `core.no_scm` is read from it.
    if not scm:
        no_scm = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=no_scm)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    hardlink_lock = self.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.tmp_dir, "lock"),
        tmp_dir=self.tmp_dir,
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    if not scm:
        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self.cache.local)

    self.stage_cache = StageCache(self)

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)

    # `Experiments` may raise NotImplementedError; the attribute is then
    # left as None instead of failing repo construction.
    try:
        self.experiments = Experiments(self)
    except NotImplementedError:
        self.experiments = None

    self._ignore()
def __init__(
    self,
    root_dir=None,
    fs=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
    config=None,
    url=None,
    repo_factory=None,
):
    """Set up the repo's directories, filesystem, config and services.

    Args:
        root_dir: starting directory for the repo lookup.
        fs: filesystem to read the repo through; defaults to `localfs`.
        rev: when set and no `fs` is given, an SCM is created and the repo
            reads through a `GitFileSystem` at that revision.
        subrepos: stored on the instance; used by RepoFileSystem.
        uninitialized: forwarded to `_get_repo_dirs` and stored on the
            instance.
        config: forwarded to `Config`.
        url: stored on the instance.
        repo_factory: stored in `_fs_conf`.
    """
    from dvc.config import Config
    from dvc.data.db import ODBManager
    from dvc.data_cloud import DataCloud
    from dvc.fs.git import GitFileSystem
    from dvc.fs.local import localfs
    from dvc.lock import LockNoop, make_lock
    from dvc.repo.live import Live
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.repo.stage import StageLoad
    from dvc.scm import SCM
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop

    self.url = url
    self._fs_conf = {"repo_factory": repo_factory}
    self._fs = fs or localfs
    self._scm = None

    # A revision without an explicit fs is served from a git filesystem.
    if rev and not fs:
        self._scm = SCM(root_dir or os.curdir)
        self._fs = GitFileSystem(scm=self._scm, rev=rev)

    self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
        root_dir=root_dir, fs=self.fs, uninitialized=uninitialized)

    self.config = Config(self.dvc_dir, fs=self.fs, config=config)
    self._uninitialized = uninitialized

    # used by RepoFileSystem to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cloud = DataCloud(self)
    self.stage = StageLoad(self)

    # Git-backed or uninitialized repos get no-op lock/state and skip the
    # run-cache and SCM-ignore setup done in the other branch.
    if isinstance(self.fs, GitFileSystem) or not self.dvc_dir:
        self.lock = LockNoop()
        self.state = StateNoop()
        self.odb = ODBManager(self)
    else:
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # The state database directory is resolved by `_get_database_dir`.
        state_db_dir = self._get_database_dir("state")
        self.state = State(self.root_dir, state_db_dir, self.dvcignore)
        self.odb = ODBManager(self)

        self.stage_cache = StageCache(self)

        self._ignore()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)
    self.live = Live(self)

    self.stage_collection_error_handler: Optional[Callable[
        [str, Exception], None]] = None
    self._lock_depth = 0
class Repo:
    """A DVC project: its directories, config, filesystem, SCM, lock and state.

    The command implementations (`add`, `checkout`, `push`, ...) are imported
    into the class namespace so they become methods; `ls`, `get` and `get_url`
    are exposed as static methods because they do not need an instance.
    """

    DVC_DIR = ".dvc"

    from dvc.repo.add import add  # type: ignore[misc]
    from dvc.repo.checkout import checkout  # type: ignore[misc]
    from dvc.repo.commit import commit  # type: ignore[misc]
    from dvc.repo.destroy import destroy  # type: ignore[misc]
    from dvc.repo.diff import diff  # type: ignore[misc]
    from dvc.repo.fetch import fetch  # type: ignore[misc]
    from dvc.repo.freeze import freeze, unfreeze  # type: ignore[misc]
    from dvc.repo.gc import gc  # type: ignore[misc]
    from dvc.repo.get import get as _get  # type: ignore[misc]
    from dvc.repo.get_url import get_url as _get_url  # type: ignore[misc]
    from dvc.repo.imp import imp  # type: ignore[misc]
    from dvc.repo.imp_url import imp_url  # type: ignore[misc]
    from dvc.repo.install import install  # type: ignore[misc]
    from dvc.repo.ls import ls as _ls  # type: ignore[misc]
    from dvc.repo.move import move  # type: ignore[misc]
    from dvc.repo.pull import pull  # type: ignore[misc]
    from dvc.repo.push import push  # type: ignore[misc]
    from dvc.repo.remove import remove  # type: ignore[misc]
    from dvc.repo.reproduce import reproduce  # type: ignore[misc]
    from dvc.repo.run import run  # type: ignore[misc]
    from dvc.repo.status import status  # type: ignore[misc]
    from dvc.repo.update import update  # type: ignore[misc]

    ls = staticmethod(_ls)
    get = staticmethod(_get)
    get_url = staticmethod(_get_url)

    def _get_repo_dirs(
        self,
        root_dir: str = None,
        fs: "FileSystem" = None,
        uninitialized: bool = False,
        scm: "Base" = None,
    ):
        """Resolve `(root_dir, dvc_dir, tmp_dir)` for this repo.

        When no DVC directory is found and `uninitialized` is true,
        `dvc_dir`/`tmp_dir` are returned as `None`, and the root falls back
        to the SCM root if no `fs` or no `root_dir` was given.

        Raises:
            NotDvcRepoError: no DVC directory found and `uninitialized` is
                false.
        """
        from dvc.fs import localfs
        from dvc.scm import SCM, SCMError

        dvc_dir = None
        tmp_dir = None
        try:
            root_dir = self.find_root(root_dir, fs)
            fs = fs or localfs
            dvc_dir = fs.path.join(root_dir, self.DVC_DIR)
            tmp_dir = fs.path.join(dvc_dir, "tmp")
        except NotDvcRepoError:
            if not uninitialized:
                raise

            if not scm:
                try:
                    scm = SCM(root_dir or os.curdir)
                except SCMError:
                    scm = SCM(os.curdir, no_scm=True)

            if not fs or not root_dir:
                root_dir = scm.root_dir

        assert root_dir
        return root_dir, dvc_dir, tmp_dir

    def _get_database_dir(self, db_name):
        """Return the directory holding the `db_name` database.

        Uses `tmp_dir` unless the `db_name` config section sets `dir`; in
        that case a per-repo subdirectory is created under it and returned.
        """
        # NOTE: by default, store SQLite-based remote indexes and state's
        # `links` and `md5s` caches in the repository itself to avoid any
        # possible state corruption in 'shared cache dir' scenario, but allow
        # user to override this through config when, say, the repository is
        # located on a mounted volume — see
        # https://github.com/iterative/dvc/issues/4420
        base_db_dir = self.config.get(db_name, {}).get("dir", None)
        if not base_db_dir:
            return self.tmp_dir

        import hashlib

        from dvc.utils.fs import makedirs

        # A short hash of the full root path keeps same-named repos apart.
        root_dir_hash = hashlib.sha224(
            self.root_dir.encode("utf-8")).hexdigest()

        db_dir = os.path.join(
            base_db_dir,
            self.DVC_DIR,
            f"{os.path.basename(self.root_dir)}-{root_dir_hash[0:7]}",
        )

        makedirs(db_dir, exist_ok=True)
        return db_dir

    def __init__(
        self,
        root_dir=None,
        fs=None,
        rev=None,
        subrepos=False,
        uninitialized=False,
        config=None,
        url=None,
        repo_factory=None,
        scm=None,
    ):
        """Set up the repo's directories, filesystem, config and services.

        Args:
            root_dir: starting directory for the repo lookup.
            fs: filesystem to read the repo through; defaults to `localfs`.
            rev: when set and no `fs` is given, an SCM is created from
                `root_dir`, the lookup root becomes "/" and the repo reads
                through a `GitFileSystem` at that revision.
            subrepos: stored on the instance; used by DvcFileSystem.
            uninitialized: tolerate a missing DVC directory.
            config: forwarded to `Config`.
            url: stored on the instance; preferred by `__str__`.
            repo_factory: stored in `_fs_conf` and forwarded to
                `DvcFileSystem` by `dvcfs`.
            scm: SCM object to use instead of creating one lazily.
        """
        from dvc.config import Config
        from dvc.data_cloud import DataCloud
        from dvc.fs import GitFileSystem, localfs
        from dvc.lock import LockNoop, make_lock
        from dvc.odbmgr import ODBManager
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.repo.plots import Plots
        from dvc.repo.stage import StageLoad
        from dvc.scm import SCM
        from dvc.stage.cache import StageCache
        from dvc_data.hashfile.state import State, StateNoop

        self.url = url
        self._fs_conf = {"repo_factory": repo_factory}
        self._fs = fs or localfs
        self._scm = scm

        if rev and not fs:
            self._scm = scm = SCM(root_dir or os.curdir)
            root_dir = "/"
            self._fs = GitFileSystem(scm=self._scm, rev=rev)

        self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
            root_dir=root_dir,
            fs=self.fs,
            uninitialized=uninitialized,
            scm=scm,
        )

        self.config = Config(self.dvc_dir, fs=self.fs, config=config)
        self._uninitialized = uninitialized

        # used by DvcFileSystem to determine if it should traverse subrepos
        self.subrepos = subrepos

        self.cloud = DataCloud(self)
        self.stage = StageLoad(self)

        if isinstance(self.fs, GitFileSystem) or not self.dvc_dir:
            self.lock = LockNoop()
            self.state = StateNoop()
            self.odb = ODBManager(self)
            self.tmp_dir = None
        else:
            from dvc.utils.fs import makedirs

            # The tmp dir is only created for the branch that uses it.
            makedirs(self.tmp_dir, exist_ok=True)
            self.lock = make_lock(
                os.path.join(self.tmp_dir, "lock"),
                tmp_dir=self.tmp_dir,
                hardlink_lock=self.config["core"].get("hardlink_lock", False),
                friendly=True,
            )

            state_db_dir = self._get_database_dir("state")
            self.state = State(self.root_dir, state_db_dir, self.dvcignore)
            self.odb = ODBManager(self)

            self.stage_cache = StageCache(self)

            self._ignore()

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)

        self.stage_collection_error_handler: Optional[Callable[
            [str, Exception], None]] = None
        self._lock_depth = 0

    def __str__(self):
        """Return the repo URL when set, otherwise its root directory."""
        return self.url or self.root_dir

    @cached_property
    def index(self):
        """`Index` built for this repo; dropped from the cache by `_reset`."""
        from dvc.repo.index import Index

        return Index(self)

    @staticmethod
    def open(url, *args, **kwargs):
        """Open `url` as a local `Repo`, else hand it to `external_repo`.

        A `None` url means the current working directory. An existing path
        that is not a DVC repo also falls through to `external_repo`.
        """
        if url is None:
            url = os.getcwd()

        if os.path.exists(url):
            try:
                return Repo(url, *args, **kwargs)
            except NotDvcRepoError:
                pass  # fallthrough to external_repo

        from dvc.external_repo import external_repo

        return external_repo(url, *args, **kwargs)

    @cached_property
    def scm(self):
        """SCM object: the one given at construction, or one created lazily.

        Raises:
            SCMError: creation failed and the repo was not opened with
                `uninitialized=True`.
        """
        from dvc.scm import SCM, SCMError

        if self._scm:
            return self._scm

        no_scm = self.config["core"].get("no_scm", False)
        try:
            return SCM(self.root_dir, no_scm=no_scm)
        except SCMError:
            if self._uninitialized:
                # might not be a git/dvc repo at all
                # used in `params/metrics/plots/live` targets
                return SCM(self.root_dir, no_scm=True)
            raise

    @cached_property
    def scm_context(self) -> "SCMContext":
        """`SCMContext` built from this repo's SCM and config."""
        from dvc.repo.scm_context import SCMContext

        return SCMContext(self.scm, self.config)

    @cached_property
    def dvcignore(self) -> DvcIgnoreFilter:
        """Ignore filter bound to the current fs and root directory."""
        return DvcIgnoreFilter(self.fs, self.root_dir)

    def get_rev(self):
        """Return the SCM revision for a local fs, else the fs's own `rev`."""
        from dvc.fs import LocalFileSystem

        assert self.scm
        if isinstance(self.fs, LocalFileSystem):
            from dvc.scm import map_scm_exception

            with map_scm_exception():
                return self.scm.get_rev()
        return self.fs.rev

    @cached_property
    def experiments(self):
        """`Experiments` helper for this repo, created on first access."""
        from dvc.repo.experiments import Experiments

        return Experiments(self)

    @cached_property
    def machine(self):
        """`MachineManager`, or `None` when the feature is not enabled.

        Requires a tmp dir plus either the `feature.machine` config flag or
        the DVC_TEST env flag.
        """
        from dvc.machine import MachineManager

        if self.tmp_dir and (self.config["feature"].get("machine", False)
                             or env2bool("DVC_TEST")):
            return MachineManager(self)
        return None

    @property
    def fs(self) -> "FileSystem":
        """Filesystem the repo currently reads from."""
        return self._fs

    @fs.setter
    def fs(self, fs: "FileSystem"):
        self._fs = fs
        # Our graph cache is no longer valid, as it was based on the previous
        # fs.
        self._reset()

    def __repr__(self):
        return f"{self.__class__.__name__}: '{self.root_dir}'"

    @classmethod
    def find_root(cls, root=None, fs=None) -> str:
        """Walk up from `root` on `fs` to the directory with the DVC dir.

        The walk stops at the filesystem root, or at a mount point on a
        local filesystem.

        Raises:
            NotDvcRepoError: no DVC directory was found, or the starting
                directory does not exist.
        """
        from dvc.fs import LocalFileSystem, localfs

        fs = fs or localfs
        root = root or os.curdir
        root_dir = fs.path.realpath(root)

        if not fs.isdir(root_dir):
            raise NotDvcRepoError(f"directory '{root}' does not exist")

        while True:
            dvc_dir = fs.path.join(root_dir, cls.DVC_DIR)
            if fs.isdir(dvc_dir):
                return root_dir
            if isinstance(fs, LocalFileSystem) and os.path.ismount(root_dir):
                break
            parent = fs.path.parent(root_dir)
            if parent == root_dir:
                # Reached the top of the filesystem.
                break
            root_dir = parent

        msg = "you are not inside of a DVC repository"

        if isinstance(fs, LocalFileSystem):
            msg = f"{msg} (checked up to mount point '{root_dir}')"

        raise NotDvcRepoError(msg)

    @classmethod
    def find_dvc_dir(cls, root=None):
        """Return the path of the DVC directory found via `find_root`."""
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
        """Delegate to `dvc.repo.init.init` and return its result."""
        from dvc.repo.init import init

        return init(root_dir=root_dir,
                    no_scm=no_scm,
                    force=force,
                    subdir=subdir)

    def unprotect(self, target):
        """Unprotect `target` through the local object database."""
        return self.odb.local.unprotect(target)

    def _ignore(self):
        """Ask the SCM context to ignore local config, tmp dir and cache."""
        flist = [self.config.files["local"], self.tmp_dir]

        # The cache dir only needs ignoring when it lives inside the repo.
        if path_isin(self.odb.local.cache_dir, self.root_dir):
            flist += [self.odb.local.cache_dir]

        for file in flist:
            self.scm_context.ignore(file)

    def brancher(self, *args, **kwargs):
        """Delegate to `dvc.repo.brancher.brancher` for this repo."""
        from dvc.repo.brancher import brancher

        return brancher(self, *args, **kwargs)

    def used_objs(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        all_experiments=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
        used_run_cache=None,
        revs=None,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches`/`all_tags`/`all_commits`/`all_experiments` to expand
        the scope.

        Returns:
            A dict mapping (remote) ODB instances to sets of objects that
            belong to each ODB. If the ODB instance is None, the objects
            are naive and do not belong to a specific remote ODB.
        """
        used = defaultdict(set)

        # The brancher's yielded value is unused; the loop body runs once
        # per revision it produces.
        for _ in self.brancher(
            revs=revs,
            all_branches=all_branches,
            all_tags=all_tags,
            all_commits=all_commits,
            all_experiments=all_experiments,
        ):
            for odb, objs in self.index.used_objs(
                targets,
                remote=remote,
                force=force,
                jobs=jobs,
                recursive=recursive,
                with_deps=with_deps,
            ).items():
                used[odb].update(objs)

        if used_run_cache:
            for odb, objs in self.stage_cache.get_used_objs(
                used_run_cache, remote=remote, force=force, jobs=jobs
            ).items():
                used[odb].update(objs)

        return used

    @property
    def stages(self):  # obsolete, only for backward-compatibility
        return self.index.stages

    def find_outs_by_path(self, path, outs=None, recursive=False, strict=True):
        """Return the outputs matching `path`.

        Args:
            path: path to look up; made absolute on the repo fs before
                matching.
            outs: candidate outputs; a falsy value means the index's
                `outs_graph`.
            recursive: also match outputs located inside `path`.
            strict: match on equality only; otherwise the output fs's
                `isin_or_eq` is used.

        Raises:
            OutputNotFoundError: nothing matched.
        """
        # using `outs_graph` to ensure graph checks are run
        outs = outs or self.index.outs_graph

        fs_path = self.fs.path.abspath(path)

        # Defined once here rather than once per candidate output.
        def eq(one, two):
            return one == two

        def func(out):
            match = eq if strict else out.fs.path.isin_or_eq

            if out.protocol == "local" and match(fs_path, out.fs_path):
                return True

            if recursive and out.fs.path.isin(out.fs_path, fs_path):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path, self)

        return matched

    def is_dvc_internal(self, path):
        """Tell whether any component of `path` is the DVC directory name."""
        path_parts = self.fs.path.normpath(path).split(self.fs.sep)
        return self.DVC_DIR in path_parts

    @cached_property
    def datafs(self):
        """`DataFileSystem` bound to this repo, created on first access."""
        from dvc.fs.data import DataFileSystem

        return DataFileSystem(repo=self)

    @cached_property
    def dvcfs(self):
        """`DvcFileSystem` built with this repo's `subrepos` and `_fs_conf`."""
        from dvc.fs.dvc import DvcFileSystem

        return DvcFileSystem(repo=self,
                             subrepos=self.subrepos,
                             **self._fs_conf)

    @cached_property
    def index_db_dir(self):
        """Directory of the "index" database (see `_get_database_dir`)."""
        return self._get_database_dir("index")

    @contextmanager
    def open_by_relpath(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor

        Absolute paths are opened through a `DataFileSystem` with
        `workspace="local"`; other paths go through a `DvcFileSystem` with
        `subrepos=True` after conversion by `from_os_path`.

        Raises:
            FileMissingError: the underlying open raised FileNotFoundError.
            DvcIsADirectoryError: the underlying open raised
                IsADirectoryError.
        """
        from dvc.fs.data import DataFileSystem
        from dvc.fs.dvc import DvcFileSystem

        if os.path.isabs(path):
            fs = DataFileSystem(repo=self, workspace="local")
            fs_path = path
        else:
            fs = DvcFileSystem(repo=self, subrepos=True)
            fs_path = fs.from_os_path(path)

        try:
            with fs.open(
                fs_path,
                mode=mode,
                encoding=encoding,
                remote=remote,
            ) as fobj:
                yield fobj
        except FileNotFoundError as exc:
            raise FileMissingError(path) from exc
        except IsADirectoryError as exc:
            raise DvcIsADirectoryError(f"'{path}' is a directory") from exc

    def close(self):
        """Close the SCM and the state; the state is closed even on error."""
        try:
            self.scm.close()
        finally:
            # Must run even if obtaining or closing the SCM raised, otherwise
            # the state handle stays open.
            self.state.close()

    def _reset(self):
        """Close the state, reset the SCM and drop cached index/dvcignore."""
        self.state.close()
        self.scm._reset()  # pylint: disable=protected-access
        self.__dict__.pop("index", None)
        self.__dict__.pop("dvcignore", None)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._reset()
        self.scm.close()
def __init__(
    self,
    root_dir=None,
    scm=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
    config=None,
    url=None,
    repo_factory=None,
):
    """Set up the repo's directories, filesystem, config and services.

    Args:
        root_dir: starting directory for the repo lookup.
        scm: SCM object; when given, the filesystem comes from
            `scm.get_fs(rev, ...)` and lock/state are no-op objects.
        rev: revision passed to `_get_repo_dirs` and `scm.get_fs`.
        subrepos: stored on the instance; used by RepoFileSystem.
        uninitialized: forwarded to `_get_repo_dirs` and stored on the
            instance.
        config: forwarded to `Config`.
        url: stored on the instance.
        repo_factory: stored in `_fs_conf`.
    """
    from dvc.cache import Cache
    from dvc.config import Config
    from dvc.data_cloud import DataCloud
    from dvc.fs.local import LocalFileSystem
    from dvc.lock import LockNoop, make_lock
    from dvc.repo.live import Live
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.repo.stage import StageLoad
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop

    self.url = url
    self._fs_conf = {
        "repo_factory": repo_factory,
    }

    self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
        root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized)

    # Both filesystem flavours are built with dvcignore rooted at the repo.
    fs_kwargs = {"use_dvcignore": True, "dvcignore_root": self.root_dir}
    if scm:
        self._fs = scm.get_fs(rev, **fs_kwargs)
    else:
        self._fs = LocalFileSystem(self, {"url": self.root_dir}, **fs_kwargs)

    self.config = Config(self.dvc_dir, fs=self.fs, config=config)
    self._uninitialized = uninitialized
    self._scm = scm

    # used by RepoFileSystem to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cache = Cache(self)
    self.cloud = DataCloud(self)
    self.stage = StageLoad(self)

    # SCM-backed or uninitialized repos get no-op lock/state and skip the
    # run-cache and SCM-ignore setup done in the other branch.
    if scm or not self.dvc_dir:
        self.lock = LockNoop()
        self.state = StateNoop()
    else:
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self)
        self.stage_cache = StageCache(self)

        self._ignore()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)
    self.live = Live(self)

    self.stage_collection_error_handler: Optional[Callable[
        [str, Exception], None]] = None
    self._lock_depth = 0
class Repo(object):
    """Object representing a DVC repository rooted at ``root_dir``.

    The repository root is the closest directory, walking upwards from the
    requested one, that contains a ``DVC_DIR`` sub-directory. Command
    implementations are imported in the class body so that they become
    attributes of the class.
    """

    DVC_DIR = ".dvc"

    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.ls import ls
    from dvc.repo.lock import lock as lock_stage
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.imp_url import imp_url
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import _checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import _fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.diff import diff
    from dvc.repo.brancher import brancher
    from dvc.repo.get import get
    from dvc.repo.get_url import get_url
    from dvc.repo.update import update
    from dvc.repo.plot import plot

    def __init__(self, root_dir=None):
        """Locate the repository and create its helper objects.

        Args:
            root_dir: directory to start the search for the repository root
                from; passed to ``find_root``.

        Raises:
            NotDvcRepoError: propagated from ``find_root``.
        """
        from dvc.state import State
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.scm.tree import WorkingTree
        from dvc.utils.fs import makedirs
        from dvc.stage.cache import StageCache

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        no_scm = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=no_scm)

        # Goes through the ``tree`` setter, which wraps the tree in a
        # CleanTree and calls ``_reset()``.
        self.tree = WorkingTree(self.root_dir)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        self.index_dir = os.path.join(self.tmp_dir, "index")
        makedirs(self.index_dir, exist_ok=True)

        hardlink_lock = self.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self)

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.stage_cache = StageCache(self)

        self.metrics = Metrics(self)
        self.params = Params(self)

        self._ignore()

    @property
    def tree(self):
        """The tree object currently used by the repo."""
        return self._tree

    @tree.setter
    def tree(self, tree):
        """Set the tree, wrapping it in ``CleanTree`` unless it already is one."""
        self._tree = tree if isinstance(tree, CleanTree) else CleanTree(tree)
        # Our graph cache is no longer valid, as it was based on the previous
        # tree.
        self._reset()

    def __repr__(self):
        return "{}: '{}'".format(self.__class__.__name__, self.root_dir)

    @classmethod
    def find_root(cls, root=None):
        """Return the closest ancestor of ``root`` containing ``DVC_DIR``.

        Args:
            root: directory to start from; the current directory when falsy.

        Raises:
            NotDvcRepoError: if the start directory does not exist, or if a
                mount point is reached without finding a ``DVC_DIR``
                directory.
        """
        root_dir = os.path.realpath(root or os.curdir)

        if not os.path.isdir(root_dir):
            raise NotDvcRepoError("directory '{}' does not exist".format(root))

        while True:
            dvc_dir = os.path.join(root_dir, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root_dir
            # Stop climbing once a mount point has been checked.
            if os.path.ismount(root_dir):
                break
            root_dir = os.path.dirname(root_dir)

        message = (
            "you are not inside of a DVC repository "
            "(checked up to mount point '{}')"
        ).format(root_dir)
        raise NotDvcRepoError(message)

    @classmethod
    def find_dvc_dir(cls, root=None):
        """Return the path of the ``DVC_DIR`` directory of the found root."""
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
        """Run ``dvc.repo.init.init`` and return a ``Repo`` for ``root_dir``."""
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force, subdir=subdir)
        return Repo(root_dir)

    def unprotect(self, target):
        """Delegate to the local cache's ``unprotect`` for ``target``."""
        return self.cache.local.unprotect(PathInfo(target))

    def _ignore(self):
        """Pass the repo's internal paths to ``scm.ignore_list``.

        The list holds the local config file and the tmp dir, plus the local
        cache dir when ``path_isin`` reports it to be inside the repo root.
        """
        flist = [self.config.files["local"], self.tmp_dir]

        if path_isin(self.cache.local.cache_dir, self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_modified_graph(self, new_stages):
        """Generate graph including the new stage to check for errors"""
        # Building graph might be costly for the ones with many DVC-files,
        # so we provide this undocumented hack to skip it. See [1] for
        # more details. The hack can be used as:
        #
        #     repo = Repo(...)
        #     repo._skip_graph_checks = True
        #     repo.add(...)
        #
        # A user should care about not duplicating outs and not adding cycles,
        # otherwise DVC might have an undefined behaviour.
        #
        # [1] https://github.com/iterative/dvc/issues/2671
        if not getattr(self, "_skip_graph_checks", False):
            self._collect_graph(self.stages + new_stages)

    def _collect_inside(self, path, graph):
        """Return the stages of ``graph`` (DFS post-order) located in ``path``."""
        import networkx as nx

        stages = nx.dfs_postorder_nodes(graph)
        return [stage for stage in stages if path_isin(stage.path, path)]

    def collect(self, target, with_deps=False, recursive=False, graph=None):
        """Collect the stages addressed by ``target``.

        Args:
            target: stage target; when falsy, all stages are returned (the
                nodes of ``graph`` if one is given, else ``self.stages``).
            with_deps: also include everything reachable from each matched
                stage in its pipeline (returned as a set).
            recursive: when ``target`` is a directory, return the stages
                located inside it.
            graph: graph to use instead of ``self.graph``.
        """
        import networkx as nx
        from ..dvcfile import Dvcfile

        if not target:
            return list(graph) if graph else self.stages

        if recursive and os.path.isdir(target):
            return self._collect_inside(
                os.path.abspath(target), graph or self.graph
            )

        path, name = parse_target(target)
        dvcfile = Dvcfile(self, path)
        stages = list(dvcfile.stages.filter(name).values())
        if not with_deps:
            return stages

        res = set()
        for stage in stages:
            pipeline = get_pipeline(get_pipelines(graph or self.graph), stage)
            res.update(nx.dfs_postorder_nodes(pipeline, stage))
        return res

    def collect_granular(self, target, *args, **kwargs):
        """Collect ``(stage, filter_info)`` pairs for ``target``.

        ``filter_info`` is ``None`` unless ``target`` resolved to a path at
        or inside a single output, in which case it is the ``PathInfo`` of
        the absolute target path. Extra arguments are forwarded to
        ``collect`` when that fallback is used.
        """
        from ..dvcfile import Dvcfile, is_valid_filename

        if not target:
            return [(stage, None) for stage in self.stages]

        file, name = parse_target(target)
        if is_valid_filename(file) and not kwargs.get("with_deps"):
            # Optimization: do not collect the graph for a specific .dvc target
            stages = Dvcfile(self, file).stages.filter(name)
            return [(stage, None) for stage in stages.values()]

        try:
            (out,) = self.find_outs_by_path(file, strict=False)
            filter_info = PathInfo(os.path.abspath(file))
            return [(out.stage, filter_info)]
        except OutputNotFoundError:
            stages = self.collect(target, *args, **kwargs)
            return [(stage, None) for stage in stages]

    def used_cache(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
        used_run_cache=None,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches`/`all_tags`/`all_commits` to expand the scope.

        Returns:
            A dictionary with Schemes (representing output's location) mapped
            to items containing the output's `dumpd` names and the output's
            children (if the given output is a directory).
        """
        from dvc.cache import NamedCache

        cache = NamedCache()

        for branch in self.brancher(
            all_branches=all_branches,
            all_tags=all_tags,
            all_commits=all_commits,
        ):
            targets = targets or [None]

            pairs = cat(
                self.collect_granular(
                    target, recursive=recursive, with_deps=with_deps
                )
                for target in targets
            )

            suffix = "({})".format(branch) if branch else ""
            for stage, filter_info in pairs:
                used_cache = stage.get_used_cache(
                    remote=remote,
                    force=force,
                    jobs=jobs,
                    filter_info=filter_info,
                )
                cache.update(used_cache, suffix=suffix)

        if used_run_cache:
            used_cache = self.stage_cache.get_used_cache(
                used_run_cache, remote=remote, force=force, jobs=jobs,
            )
            cache.update(used_cache)

        return cache

    def _collect_graph(self, stages):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, collect stages
                in the repository.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from pygtrie import Trie
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        stages = stages or self.stages
        outs = Trie()  # Use trie to efficiently find overlapping outs and deps

        for stage in filter(bool, stages):
            for out in stage.outs:
                out_key = out.path_info.parts

                # Check for dup outs
                if out_key in outs:
                    dup_stages = [stage, outs[out_key].stage]
                    raise OutputDuplicationError(str(out), dup_stages)

                # Check for overlapping outs
                if outs.has_subtrie(out_key):
                    # An already registered out lives under this one.
                    parent = out
                    overlapping = first(outs.values(prefix=out_key))
                else:
                    # This out may live under an already registered one.
                    parent = outs.shortest_prefix(out_key).value
                    overlapping = out
                if parent and overlapping:
                    msg = (
                        "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                        "overlap. To avoid unpredictable behaviour, "
                        "rerun command with non overlapping outs paths."
                    ).format(
                        str(parent),
                        parent.stage.addressing,
                        str(overlapping),
                        overlapping.stage.addressing,
                    )
                    raise OverlappingOutputPathsError(parent, overlapping, msg)

                outs[out_key] = out

        # A stage file must not be located inside any output.
        for stage in stages:
            out = outs.shortest_prefix(PathInfo(stage.path).parts).value
            if out:
                raise StagePathAsOutputError(stage, str(out))

        # Building graph
        G.add_nodes_from(stages)
        for stage in stages:
            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                # A dep is linked to outs that are prefixes of its path and
                # to outs located under its path.
                dep_key = dep.path_info.parts
                overlapping = [n.value for n in outs.prefixes(dep_key)]
                if outs.has_subtrie(dep_key):
                    overlapping.extend(outs.values(prefix=dep_key))

                G.add_edges_from((stage, out.stage) for out in overlapping)

        check_acyclic(G)

        return G

    @cached_property
    def graph(self):
        """Graph built by ``_collect_graph`` from ``self.stages``."""
        return self._collect_graph(self.stages)

    @cached_property
    def pipelines(self):
        """Result of ``get_pipelines`` applied to ``self.graph``."""
        return get_pipelines(self.graph)

    @cached_property
    def stages(self):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related with
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        return self._collect_stages()

    @cached_property
    def plot_templates(self):
        """``PlotTemplates`` object created for ``self.dvc_dir``."""
        from dvc.repo.plot.template import PlotTemplates

        return PlotTemplates(self.dvc_dir)

    def _collect_stages(self):
        """Walk ``self.tree`` from the repo root and load every stage found.

        Directories whose path equals the ``fspath`` of a local output
        collected so far are pruned from the walk.

        Returns:
            list: all stages loaded from the valid Dvcfiles encountered.
        """
        from dvc.dvcfile import Dvcfile, is_valid_filename

        stages = []
        outs = set()

        for root, dirs, files in self.tree.walk(self.root_dir):
            for file_name in filter(is_valid_filename, files):
                path = os.path.join(root, file_name)
                new_stages = list(Dvcfile(self, path).stages.values())
                stages.extend(new_stages)
                # ``outs`` is cumulative, so only the stages just loaded
                # need scanning; re-walking every stage per file is
                # quadratic in the number of stages.
                outs.update(
                    out.fspath
                    for stage in new_stages
                    for out in stage.outs
                    if out.scheme == "local"
                )

            # In-place edit of ``dirs`` so the walk skips output directories.
            dirs[:] = [d for d in dirs if os.path.join(root, d) not in outs]

        return stages

    def find_outs_by_path(self, path, outs=None, recursive=False, strict=True):
        """Return the outputs matching ``path``.

        Args:
            path: path to look for; made absolute before matching.
            outs: outputs to search in; all outputs of ``self.stages`` when
                falsy.
            recursive: also match outputs located inside ``path``.
            strict: match only local outputs whose path equals ``path``;
                when false, the ``isin_or_eq`` relation is used instead.

        Raises:
            OutputNotFoundError: if nothing matched.
        """
        if not outs:
            outs = [out for stage in self.stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        path_info = PathInfo(abs_path)
        match = path_info.__eq__ if strict else path_info.isin_or_eq

        def func(out):
            if out.scheme == "local" and match(out.path_info):
                return True

            if recursive and out.path_info.isin(path_info):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path, self)

        return matched

    def find_out_by_relpath(self, relpath):
        """Return the single output matching ``relpath`` joined to the root.

        Unpacking fails with ``ValueError`` if more than one output matches.
        """
        path = os.path.join(self.root_dir, relpath)
        (out,) = self.find_outs_by_path(path)
        return out

    def is_dvc_internal(self, path):
        """Tell whether ``DVC_DIR`` is one of the components of ``path``."""
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    @contextmanager
    def open_by_relpath(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor

        A cached output matching ``path`` is opened through
        ``_open_cached``; otherwise the file at ``path`` under the repo root
        is opened directly if it exists.

        Raises:
            FileMissingError: if the cached file cannot be found, or if
                ``path`` is neither a cached output nor an existing file.
        """
        cause = None
        try:
            out = self.find_out_by_relpath(path)
        except OutputNotFoundError as exc:
            out = None
            # Remembered so the final error can be chained from it.
            cause = exc

        if out and out.use_cache:
            try:
                with self._open_cached(out, remote, mode, encoding) as fd:
                    yield fd
                return
            except FileNotFoundError as exc:
                raise FileMissingError(path) from exc

        abs_path = os.path.join(self.root_dir, path)
        if os.path.exists(abs_path):
            with open(abs_path, mode=mode, encoding=encoding) as fd:
                yield fd
            return

        raise FileMissingError(path) from cause

    def _open_cached(self, out, remote=None, mode="r", encoding=None):
        """Open the cached file of ``out``.

        The local cache file is used when it exists. Otherwise the file is
        opened through the remote object; if that raises
        ``RemoteActionNotImplemented``, the data is pulled first and the
        local cache file is opened.

        Raises:
            IsADirectoryError: if ``out`` is a directory.
        """
        if out.isdir():
            raise IsADirectoryError("Can't open a dir")

        cache_file = self.cache.local.checksum_to_path_info(out.checksum)
        cache_file = fspath_py35(cache_file)

        if os.path.exists(cache_file):
            return open(cache_file, mode=mode, encoding=encoding)

        try:
            remote_obj = self.cloud.get_remote(remote)
            remote_info = remote_obj.checksum_to_path_info(out.checksum)
            return remote_obj.open(remote_info, mode=mode, encoding=encoding)
        except RemoteActionNotImplemented:
            with self.state:
                cache_info = out.get_used_cache(remote=remote)
                self.cloud.pull(cache_info, remote=remote)

            return open(cache_file, mode=mode, encoding=encoding)

    def close(self):
        """Close the SCM object."""
        self.scm.close()

    @locked
    def checkout(self, *args, **kwargs):
        """Call ``_checkout`` with the given arguments, wrapped by ``locked``."""
        return self._checkout(*args, **kwargs)

    @locked
    def fetch(self, *args, **kwargs):
        """Call ``_fetch`` with the given arguments, wrapped by ``locked``."""
        return self._fetch(*args, **kwargs)

    def _reset(self):
        """Drop the cached ``graph``, ``stages``, ``pipelines`` and
        ``dvcignore`` values from the instance ``__dict__``, if present."""
        self.__dict__.pop("graph", None)
        self.__dict__.pop("stages", None)
        self.__dict__.pop("pipelines", None)
        self.__dict__.pop("dvcignore", None)