Example #1
0
    def get_used_cache(self, *args, **kwargs):
        """Merge the used cache of every selected output into one object.

        Outputs are selected by ``self.filter_outs`` using the optional
        ``filter_info`` keyword argument. All positional and keyword
        arguments are forwarded unchanged to each output's
        ``get_used_cache``.

        Returns:
            A ``NamedCache`` updated with the result of each selected output.
        """
        from dvc.objects.db import NamedCache

        merged = NamedCache()
        selected_outs = self.filter_outs(kwargs.get("filter_info"))
        for output in selected_outs:
            used = output.get_used_cache(*args, **kwargs)
            merged.update(used)
        return merged
Example #2
0
    def collect_used_dir_cache(
        self, remote=None, force=False, jobs=None, filter_info=None
    ):
        """Collect the cache entries of the files inside this directory.

        The directory object is first fetched through ``get_dir_cache``
        (a ``DvcException`` raised there is only logged at debug level).
        If the directory object is still missing or malformed afterwards,
        the user is asked to confirm unless ``force`` is set; declining
        raises ``CollectCacheError`` and accepting yields an empty cache.

        Example:

            After running:

            $ echo "foo" > directory/foo
            $ echo "bar" > directory/bar
            $ dvc add directory

            the result is equivalent to:

            nc = NamedCache()
            nc.add(self.scheme, 'c157a79031e1', 'directory/foo')
            nc.add(self.scheme, 'd3b07384d113', 'directory/bar')

        Args:
            remote: forwarded to ``get_dir_cache``.
            force: skip the confirmation prompt when the cache is missing.
            jobs: forwarded to ``get_dir_cache``.
            filter_info: when truthy, only the entry equal to it or the
                entries located below it are collected.

        Returns:
            A ``NamedCache`` with one entry per collected file.

        Raises:
            CollectCacheError: the directory cache is missing, ``force`` is
                false and the user did not confirm.
        """

        used = NamedCache()

        try:
            self.get_dir_cache(jobs=jobs, remote=remote)
        except DvcException:
            logger.debug(f"failed to pull cache for '{self}'")

        try:
            objects.check(self.odb, self.odb.get(self.hash_info))
        except (FileNotFoundError, ObjectFormatError):
            msg = (
                "Missing cache for directory '{}'. "
                "Cache for files inside will be lost. "
                "Would you like to continue? Use '-f' to force."
            )
            # Either an explicit force or an interactive "yes" lets the
            # caller continue with an empty result.
            if force or prompt.confirm(msg.format(self.path_info)):
                return used
            raise CollectCacheError(
                "unable to fully collect used cache"
                " without cache for directory '{}'".format(self)
            )

        root = str(self.path_info)
        wanted = str(filter_info) if filter_info else None
        for key, obj in self.obj:
            full_path = os.path.join(root, *key)
            # Skip entries that are neither the filter target itself nor
            # located underneath it.
            if (
                wanted
                and full_path != wanted
                and not full_path.startswith(wanted + os.sep)
            ):
                continue
            used.add(self.scheme, obj.hash_info.value, full_path)

        return used
Example #3
0
    def get_used_cache(self, used_run_cache, *args, **kwargs):
        """Collect the used cache of the stages stored in the run cache.

        Args:
            used_run_cache: iterable of ``(key, value)`` pairs, each passed
                to ``self._load_cache``.
            *args, **kwargs: forwarded to every stage's ``get_used_cache``.

        Returns:
            A ``NamedCache`` updated with the used cache of each stage whose
            run-cache entry could be loaded; falsy entries are skipped.
        """
        from dvc.objects.db import NamedCache

        merged = NamedCache()
        for key, value in used_run_cache:
            loaded = self._load_cache(key, value)
            if loaded:
                restored_stage = self._create_stage(loaded)
                merged.update(
                    restored_stage.get_used_cache(*args, **kwargs)
                )
        return merged
Example #4
0
    def get_used_cache(self, **kwargs):
        """Return the cache entries this output refers to.

        - An output that does not use the cache yields an empty
          ``NamedCache``.
        - For a repo import, the single dependency's ``def_path`` is
          recorded under ``external[dep.repo_pair]`` instead.
        - An output without hash info logs a warning and yields an empty
          ``NamedCache``.
        - Otherwise the output's own hash is recorded and, when the hash is
          a directory checksum, the entries gathered by
          ``collect_used_dir_cache`` are attached as its children.

        Args:
            **kwargs: forwarded to ``collect_used_dir_cache``.

        Returns:
            A ``NamedCache`` as described above.
        """

        if not self.use_cache:
            return NamedCache()

        if self.stage.is_repo_import:
            imported = NamedCache()
            # A repo import is expected to have exactly one dependency;
            # the tuple-unpacking raises otherwise.
            (dep,) = self.stage.deps
            imported.external[dep.repo_pair].add(dep.def_path)
            return imported

        if not self.hash_info:
            warning = (
                "Output '{}'({}) is missing version info. "
                "Cache for it will not be collected. "
                "Use `dvc repro` to get your pipeline up to date.".format(
                    self, self.stage
                )
            )
            if self.exists:
                warning += (
                    "\n"
                    "You can also use `dvc commit {stage.addressing}` "
                    "to associate existing '{out}' with {stage}.".format(
                        out=self, stage=self.stage
                    )
                )
            logger.warning(warning)
            return NamedCache()

        own_entry = NamedCache.make(
            self.scheme, self.hash_info.value, str(self)
        )

        if self.is_dir_checksum:
            children = self.collect_used_dir_cache(**kwargs)
            own_entry.add_child_cache(self.hash_info.value, children)

        return own_entry
Example #5
0
def test_used_cache(tmp_dir, dvc, path):
    """`dvc.used_cache([path])` yields the dir entry plus `subdir/file`."""
    from dvc.objects.db import NamedCache

    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}, "other": "other"}})

    dir_hash = "70922d6bf66eb073053a82f77d58c536.dir"
    expected = NamedCache.make("local", dir_hash, "dir")
    nested_file = NamedCache.make(
        "local",
        "8c7dd922ad47494fc02c388e12c00eac",
        os.path.join("dir", "subdir", "file"),
    )
    expected.add_child_cache(dir_hash, nested_file)

    used_cache = dvc.used_cache([path])

    assert used_cache._items == expected._items
    assert used_cache.external == expected.external
Example #6
0
def test_status_download_optimization(mocker, dvc):
    """Do not query the remote for hashes already in the local cache.

    When computing the status for a download and every requested file is
    already present locally, the remote's `hashes_exist` must not be called.
    """
    odb = LocalObjectDB(LocalFileSystem())

    infos = NamedCache()
    for checksum, name in (
        ("acbd18db4cc2f85cedef654fccc4a4d8", "foo"),
        ("37b51d194a7513e45b56f6524f2d51f2", "bar"),
    ):
        infos.add("local", checksum, name)

    # Pretend every requested hash is already in the local object db.
    mocker.patch.object(
        odb, "hashes_exist", return_value=list(infos["local"])
    )

    other_remote = mocker.Mock()
    other_remote.url = "other_remote"
    other_remote.hashes_exist.return_value = []
    other_remote.index = RemoteIndexNoop()

    # NOTE(review): `other_remote` is a Mock, so `status` below appears to
    # be an auto-created mock attribute rather than a real implementation;
    # the final assertion may therefore be vacuous -- confirm the intent.
    other_remote.status(odb, infos, download=True)

    assert other_remote.hashes_exist.call_count == 0
Example #7
0
    def get_dir_cache(self, **kwargs):
        """Load the directory object for this output, pulling it if needed.

        If the directory object is missing or malformed in ``self.odb``, it
        is pulled through ``self.repo.cloud.pull`` first. The loaded object
        (or ``None`` if it still cannot be loaded) is stored on ``self.obj``.

        Args:
            **kwargs: forwarded to ``self.repo.cloud.pull``.

        Returns:
            The value stored on ``self.obj``.

        Raises:
            DvcException: this output's hash is not a directory checksum.
        """

        if not self.is_dir_checksum:
            raise DvcException("cannot get dir cache for file checksum")

        try:
            objects.check(self.odb, self.odb.get(self.hash_info))
        except (FileNotFoundError, ObjectFormatError):
            pull = self.repo.cloud.pull
            pull(
                NamedCache.make("local", self.hash_info.value, str(self)),
                show_checksums=False,
                **kwargs,
            )

        try:
            loaded = objects.load(self.odb, self.hash_info)
        except (FileNotFoundError, ObjectFormatError):
            # Still unavailable after the pull attempt.
            loaded = None
        self.obj = loaded

        return self.obj
Example #8
0
    def used_cache(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        all_experiments=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
        used_run_cache=None,
        revs=None,
    ):
        """Collect the cache entries used by the stages of the given targets.

        This tells which files from the cache are _in use_, i.e. described
        as an output of some stage.

        Every revision yielded by ``self.brancher`` is inspected; the
        `revs`/`all_branches`/`all_tags`/`all_commits`/`all_experiments`
        arguments are passed to it to select the scope. With no ``targets``
        a single ``None`` target is used.

        If ``used_run_cache`` is truthy, the entries reported by
        ``self.stage_cache.get_used_cache`` are merged in as well.

        Returns:
            A ``NamedCache`` merging the used cache of every collected
            stage; entries coming from a named revision are merged with a
            ``"(<revision>)"`` suffix.
        """
        from dvc.objects.db import NamedCache

        cache = NamedCache()

        # Options shared by every `get_used_cache` call below.
        collect_opts = {"remote": remote, "force": force, "jobs": jobs}
        targets = targets or [None]

        revisions = self.brancher(
            revs=revs,
            all_branches=all_branches,
            all_tags=all_tags,
            all_commits=all_commits,
            all_experiments=all_experiments,
        )
        for branch in revisions:
            pairs = cat(
                self.stage.collect_granular(
                    target, recursive=recursive, with_deps=with_deps
                )
                for target in targets
            )

            suffix = f"({branch})" if branch else ""
            for stage, filter_info in pairs:
                cache.update(
                    stage.get_used_cache(
                        **collect_opts, filter_info=filter_info
                    ),
                    suffix=suffix,
                )

        if used_run_cache:
            cache.update(
                self.stage_cache.get_used_cache(
                    used_run_cache, **collect_opts
                )
            )

        return cache
Example #9
0
def test_cloud(tmp_dir, dvc, remote):  # pylint:disable=unused-argument
    """Walk a file and a directory output through status/push/pull.

    After each step the status reported by `dvc.cloud.status` is checked
    for both the file entry and the directory entry.
    """
    (stage,) = tmp_dir.dvc_gen("foo", "foo")
    out = stage.outs[0]
    cache = out.cache_path
    md5 = out.hash_info.value
    info = out.get_used_cache()

    dir_contents = {
        "data_dir": {
            "data_sub_dir": {"data_sub": "data_sub"},
            "data": "data",
            "empty": "",
        }
    }
    (stage_dir,) = tmp_dir.dvc_gen(dir_contents)
    out_dir = stage_dir.outs[0]
    cache_dir = out_dir.cache_path
    name_dir = str(out_dir)
    md5_dir = out_dir.hash_info.value
    info_dir = NamedCache.make(out_dir.scheme, md5_dir, name_dir)

    def assert_statuses(expected_status):
        # File entry first, then the directory entry.
        for named_cache, checksum in ((info, md5), (info_dir, md5_dir)):
            status = dvc.cloud.status(named_cache, show_checksums=True)
            expected = {
                checksum: {"name": checksum, "status": expected_status}
            }
            assert status == expected

    # Check status
    assert_statuses(STATUS_NEW)

    # Move cache and check status
    # See issue https://github.com/iterative/dvc/issues/4383 for details
    backup_dir = dvc.odb.local.cache_dir + ".backup"
    move(dvc.odb.local.cache_dir, backup_dir)
    assert_statuses(STATUS_MISSING)

    # Restore original cache:
    remove(dvc.odb.local.cache_dir)
    move(backup_dir, dvc.odb.local.cache_dir)

    # Push and check status
    dvc.cloud.push(info)
    assert os.path.exists(cache)
    assert os.path.isfile(cache)

    dvc.cloud.push(info_dir)
    assert os.path.isfile(cache_dir)

    assert_statuses(STATUS_OK)

    # Remove and check status
    remove(dvc.odb.local.cache_dir)
    assert_statuses(STATUS_DELETED)

    # Pull and check status
    dvc.cloud.pull(info)
    assert os.path.exists(cache)
    assert os.path.isfile(cache)
    with open(cache) as fd:
        assert fd.read() == "foo"

    dvc.cloud.pull(info_dir)
    assert os.path.isfile(cache_dir)

    assert_statuses(STATUS_OK)
Example #10
0
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    all_experiments=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    """Remove cache entries that are not used by this repo or `repos`.

    The used cache is gathered from every repository listed in `repos`
    (each entered through its `lock` and `state` contexts) and from `self`.
    Each object database returned by `self.odb.by_scheme()` is then asked
    to garbage-collect everything outside that set. With `cloud` set, the
    remote resolved by `self.cloud.get_remote` is collected as well, using
    the keys of the local scheme.

    Args:
        all_branches, all_tags, all_commits, all_experiments, with_deps,
        remote, force, jobs: forwarded to each repo's `used_cache`
            (`jobs` is also passed to the `gc` calls).
        cloud: also garbage-collect the remote.
        repos: optional paths of additional repositories whose used cache
            must be kept.
        workspace: scope flag checked by `_raise_error_if_all_disabled`.
    """

    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # `all_experiments` or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
        all_experiments=all_experiments,
    )

    from contextlib import ExitStack

    from dvc.repo import Repo

    all_repos = [Repo(path) for path in repos or []]

    # Options shared by every `used_cache` call below.
    used_cache_opts = {
        "all_branches": all_branches,
        "with_deps": with_deps,
        "all_tags": all_tags,
        "all_commits": all_commits,
        "all_experiments": all_experiments,
        "remote": remote,
        "force": force,
        "jobs": jobs,
    }

    with ExitStack() as stack:
        for other_repo in all_repos:
            stack.enter_context(other_repo.lock)
            stack.enter_context(other_repo.state)

        used = NamedCache()
        for repo in [*all_repos, self]:
            used.update(repo.used_cache(**used_cache_opts))

    for scheme, odb in self.odb.by_scheme():
        if odb:
            removed = odb.gc(set(used.scheme_keys(scheme)), jobs=jobs)
            if not removed:
                logger.info(f"No unused '{scheme}' cache to remove.")

    if cloud:
        cloud_remote = self.cloud.get_remote(remote, "gc -c")
        local_keys = set(used.scheme_keys(Schemes.LOCAL))
        if not cloud_remote.gc(local_keys, jobs=jobs):
            logger.info("No unused cache to remove from remote.")