def get_used_cache(self, *args, **kwargs):
    from dvc.objects.db import NamedCache

    cache = NamedCache()
    for out in self.filter_outs(kwargs.get("filter_info")):
        cache.update(out.get_used_cache(*args, **kwargs))

    return cache
def get_used_cache(self, used_run_cache, *args, **kwargs):
    from dvc.objects.db import NamedCache

    cache = NamedCache()

    for key, value in used_run_cache:
        # Load the run-cache entry; skip it if it is missing or unreadable.
        entry = self._load_cache(key, value)
        if not entry:
            continue
        # Recreate the stage described by the entry and collect the cache
        # used by its outputs.
        stage = self._create_stage(entry)
        cache.update(stage.get_used_cache(*args, **kwargs))

    return cache
def used_cache(
    self,
    targets=None,
    all_branches=False,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    all_experiments=False,
    remote=None,
    force=False,
    jobs=None,
    recursive=False,
    used_run_cache=None,
    revs=None,
):
    """Get the stages related to the given target and collect
    the `info` of its outputs.

    This is useful to know what files from the cache are _in use_
    (namely, a file described as an output on a stage).

    The scope is, by default, the working directory, but you can use
    `all_branches`/`all_tags`/`all_commits`/`all_experiments` to expand
    the scope.

    Returns:
        A dictionary with Schemes (representing output's location) mapped
        to items containing the output's `dumpd` names and the output's
        children (if the given output is a directory).
    """
    from dvc.objects.db import NamedCache

    cache = NamedCache()

    for branch in self.brancher(
        revs=revs,
        all_branches=all_branches,
        all_tags=all_tags,
        all_commits=all_commits,
        all_experiments=all_experiments,
    ):
        targets = targets or [None]

        # `cat` (imported from `funcy` at module scope) flattens the
        # per-target lists of (stage, filter_info) pairs.
        pairs = cat(
            self.stage.collect_granular(
                target, recursive=recursive, with_deps=with_deps
            )
            for target in targets
        )

        suffix = f"({branch})" if branch else ""
        for stage, filter_info in pairs:
            used_cache = stage.get_used_cache(
                remote=remote,
                force=force,
                jobs=jobs,
                filter_info=filter_info,
            )
            cache.update(used_cache, suffix=suffix)

    if used_run_cache:
        used_cache = self.stage_cache.get_used_cache(
            used_run_cache,
            remote=remote,
            force=force,
            jobs=jobs,
        )
        cache.update(used_cache)

    return cache
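# A minimal usage sketch for `used_cache` (illustrative only, not part of the
# original source): open the repository and ask which cache entries are still
# referenced by stage outputs across branches and tags.
from dvc.repo import Repo

repo = Repo(".")
used = repo.used_cache(all_branches=True, all_tags=True)
# `used` is a NamedCache; `scheme_keys` yields the hashes recorded for a
# given output scheme (the same accessor used by `gc` below).
for checksum in used.scheme_keys("local"):
    print(checksum)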
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    all_experiments=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # `all_experiments` or `all_branches` are enabled.
    # (`_raise_error_if_all_disabled`, `logger` and `Schemes` are defined at
    # module scope.)
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
        all_experiments=all_experiments,
    )

    from contextlib import ExitStack

    from dvc.objects.db import NamedCache
    from dvc.repo import Repo

    if not repos:
        repos = []
    all_repos = [Repo(path) for path in repos]

    with ExitStack() as stack:
        # Hold the lock and state of every external repo while collecting
        # their used cache.
        for repo in all_repos:
            stack.enter_context(repo.lock)
            stack.enter_context(repo.state)

        used = NamedCache()
        for repo in all_repos + [self]:
            used.update(
                repo.used_cache(
                    all_branches=all_branches,
                    with_deps=with_deps,
                    all_tags=all_tags,
                    all_commits=all_commits,
                    all_experiments=all_experiments,
                    remote=remote,
                    force=force,
                    jobs=jobs,
                )
            )

    # Remove everything from the local object databases that is not in use.
    for scheme, odb in self.odb.by_scheme():
        if not odb:
            continue

        removed = odb.gc(set(used.scheme_keys(scheme)), jobs=jobs)
        if not removed:
            logger.info(f"No unused '{scheme}' cache to remove.")

    if not cloud:
        return

    # With `cloud=True`, also garbage-collect the remote storage.
    remote = self.cloud.get_remote(remote, "gc -c")
    removed = remote.gc(set(used.scheme_keys(Schemes.LOCAL)), jobs=jobs)
    if not removed:
        logger.info("No unused cache to remove from remote.")
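# A minimal sketch of invoking `gc` directly (hypothetical call site; the same
# code path is reached through the `dvc gc` CLI command). At least one of the
# scope flags must be set, otherwise `_raise_error_if_all_disabled` aborts.
from dvc.repo import Repo

repo = Repo(".")
# Keep everything referenced from any branch or tag, remove the rest from the
# local cache.
repo.gc(all_branches=True, all_tags=True)
# Pass cloud=True to also collect garbage on the default remote.
repo.gc(all_branches=True, all_tags=True, cloud=True)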