Example #1
0
 def tearDown(self):
     """Leave the test directory and delete it, tolerating Windows locks."""
     self._popd()
     try:
         remove(self._root_dir)
     except OSError as exc:
         # Under Windows, tracing every not-properly-closed file proved
         # really hard. Best guess so far is that gitpython is the culprit:
         # it opens files and closes them from __del__, which can run late
         # in current pythons. TestGitFixture and TestDvcFixture try to
         # close those handles and it works for most tests, but not all —
         # repos (and thus git repos) are created all over dvc. So a
         # sharing-violation (winerror 32) is downgraded to a warning.
         # NOTE: the os.name check must come first — `winerror` only
         # exists on Windows OSError instances.
         if os.name != "nt" or exc.winerror != 32:
             raise
         warnings.warn("Failed to remove test dir: " + str(exc))
Example #2
0
File: get.py Project: woodshop/dvc
def get(url, path, out=None, rev=None):
    """Download a single tracked path from the repo at *url* into *out*.

    Tries the DVC cache first; for non-cached outputs (or non-DVC repos)
    falls back to copying the file straight from a git clone.

    Raises GetDVCFileError if *out* would itself be a .dvc file, and
    PathMissingError when the path exists in neither DVC nor git.
    """
    out = resolve_output(path, out)

    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
    try:
        try:
            with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
                # Try any links possible to avoid data duplication.
                #
                # Not using symlink, because we need to remove cache after we
                # are done, and to make that work we would have to copy data
                # over anyway before removing the cache, so we might just copy
                # it right away.
                #
                # Also, we can't use theoretical "move" link type here, because
                # the same cache file might be used a few times in a directory.
                repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]
                output = repo.find_out_by_relpath(path)
                if output.use_cache:
                    _get_cached(repo, output, out)
                    return
                # Non-cached output, fall through and try to copy from git.
        except (NotDvcRepoError, NoOutputInExternalRepoError):
            # Not a DVC repository or, possibly, path is not tracked by DVC.
            # Fall through and try to copy from git.
            pass

        # The git fallback only makes sense for repo-relative paths.
        if os.path.isabs(path):
            raise FileNotFoundError

        repo_dir = cached_clone(url, rev=rev)

        fs_copy(os.path.join(repo_dir, path), out)
    except (OutputNotFoundError, FileNotFoundError):
        raise PathMissingError(path, url)
    finally:
        # Always drop the temporary cache dir created next to the output.
        remove(tmp_dir)
Example #3
0
def test_pull_no_rev_lock(erepo_dir, tmp_dir, dvc):
    """Pulling an import whose dep has no rev_lock should restore the file."""
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("foo", "contents", commit="create foo")

    stage = dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported")
    # The import was made without an explicit revision...
    assert "rev" not in stage.deps[0].def_repo
    # ...and we also drop the pinned one so pull must re-resolve it.
    stage.deps[0].def_repo.pop("rev_lock")
    Dvcfile(dvc, stage.path).dump(stage)

    # Wipe both the cached blob and the workspace copy.
    remove(stage.outs[0].cache_path)
    (tmp_dir / "foo_imported").unlink()

    dvc.pull([stage.path])

    imported = tmp_dir / "foo_imported"
    assert imported.is_file()
    assert imported.read_text() == "contents"
Example #4
0
File: get.py Project: ptrcklv/dvc
def get(url, path, out=None, rev=None):
    """Download *path* from the DVC repo at *url* into *out*.

    Cached outputs are checked out via the local cache; anything else is
    copied straight from the repo work tree.

    Raises GetDVCFileError if *out* would be a .dvc file,
    UrlNotDvcRepoError for non-DVC urls, and PathMissingError when the
    path cannot be found at all.
    """
    out = resolve_output(path, out)

    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # Try any links possible to avoid data duplication.
            #
            # Not using symlink, because we need to remove cache after we are
            # done, and to make that work we would have to copy data over
            # anyway before removing the cache, so we might just copy it
            # right away.
            #
            # Also, we can't use theoretical "move" link type here, because
            # the same cache file might be used a few times in a directory.
            repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            # A missing output here is not fatal yet — it may still be a
            # plain git-tracked file, handled below.
            try:
                output = repo.find_out_by_relpath(path)
            except OutputNotFoundError:
                output = None

            if output and output.use_cache:
                _get_cached(repo, output, out)
            else:
                # Either an uncached out with absolute path or a user error
                if os.path.isabs(path):
                    raise FileNotFoundError

                _copy(os.path.join(repo.root_dir, path), out)

    except (OutputNotFoundError, FileNotFoundError):
        raise PathMissingError(path, url)
    except NotDvcRepoError:
        raise UrlNotDvcRepoError(url)
    finally:
        # Always drop the temporary cache dir created next to the output.
        remove(tmp_dir)
Example #5
0
def test_push_incomplete_dir(tmp_dir, dvc, mocker, local_remote):
    """A dir with a missing file-level cache entry must not be pushed."""
    (stage, ) = tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    remote = dvc.cloud.get_remote("upstream")

    cache = dvc.cache.local
    dir_hash = stage.outs[0].checksum
    used = stage.get_used_cache(remote=remote)

    # remove one of the cache files for directory
    file_hashes = list(used.child_keys(cache.tree.scheme, dir_hash))
    missing = file_hashes[0]
    remove(cache.tree.hash_to_path_info(missing))

    dvc.push()

    tree = remote.tree
    # Neither the .dir entry nor the missing file may land upstream...
    assert not tree.exists(tree.hash_to_path_info(dir_hash))
    assert not tree.exists(tree.hash_to_path_info(missing))
    # ...while the intact sibling file still gets pushed.
    assert tree.exists(tree.hash_to_path_info(file_hashes[1]))
Example #6
0
def test_pull_non_workspace(tmp_dir, scm, dvc, erepo_dir):
    """fetch --all-tags must also restore imports pinned only by a tag."""
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("foo", "master content", commit="create foo")
        with erepo_dir.branch("branch", new=True):
            erepo_dir.dvc_gen("foo", "branch content", commit="modify foo")

    stage = dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported", rev="branch")
    tmp_dir.scm_add([stage.relpath], commit="imported branch")
    scm.tag("ref-to-branch")

    # Overwrite via import
    dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported", rev="master")

    # The branch version is now only reachable through the tag.
    cached = stage.outs[0].cache_path
    remove(cached)
    dvc.fetch(all_tags=True)
    assert os.path.exists(cached)
Example #7
0
def test_git_ssh(tmp_dir, scm, server):
    """Round-trip a git ref over SSH: push to a bare remote, then wipe the
    local repo and fetch it back.
    """
    from dulwich.repo import Repo as DulwichRepo
    from sshfs import SSHFileSystem

    from dvc.utils.fs import remove
    from tests.remotes.ssh import TEST_SSH_KEY_PATH, TEST_SSH_USER

    # Filesystem view of the server, used only to inspect refs remotely.
    fs = SSHFileSystem(
        host=server.host,
        port=server.port,
        username=TEST_SSH_USER,
        client_keys=[TEST_SSH_KEY_PATH],
    )
    server._ssh.execute("git init --bare test-repo.git")
    url = f"ssh://{TEST_SSH_USER}@{server.host}:{server.port}/~/test-repo.git"

    tmp_dir.scm_gen("foo", "foo", commit="init")
    rev = scm.get_rev()

    scm.push_refspec(
        url,
        "refs/heads/master",
        "refs/heads/master",
        force=True,
        key_filename=TEST_SSH_KEY_PATH,
    )

    # The pushed ref on the server must point at our local HEAD.
    assert (
        rev.encode("ascii")
        == fs.open("test-repo.git/refs/heads/master").read().strip()
    )

    # Reset the local repo to a fresh empty one and fetch everything back.
    remove(tmp_dir / ".git")
    remove(tmp_dir / "foo")
    DulwichRepo.init(str(tmp_dir))

    scm.fetch_refspecs(
        url,
        ["refs/heads/master"],
        force=True,
        key_filename=TEST_SSH_KEY_PATH,
    )
    assert rev == scm.get_ref("refs/heads/master")
    scm.checkout("master")
    assert "foo" == (tmp_dir / "foo").read_text()
Example #8
0
def test_subrepos_are_ignored(tmp_dir, erepo_dir):
    """Contents of a nested subrepo must be excluded both when downloading
    the parent's directory and when staging/transferring it into the cache.
    """
    subrepo = erepo_dir / "dir" / "subrepo"
    make_subrepo(subrepo, erepo_dir.scm)
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("dir/foo", "foo", commit="foo")
        erepo_dir.scm_gen("dir/bar", "bar", commit="bar")

    with subrepo.chdir():
        subrepo.dvc_gen({"file": "file"}, commit="add files on subrepo")

    with external_repo(os.fspath(erepo_dir)) as repo:
        repo.dvcfs.get(
            "dir",
            os.fspath(tmp_dir / "out"),
        )
        # "subrepo" and its "file" must be absent from the download.
        expected_files = {"foo": "foo", "bar": "bar", ".gitignore": "/foo\n"}
        assert (tmp_dir / "out").read_text() == expected_files

        # clear cache to test saving to cache
        cache_dir = tmp_dir / repo.odb.local.cache_dir
        remove(cache_dir)
        clean_staging()
        makedirs(cache_dir)

        staging, _, obj = stage(
            repo.odb.local,
            "dir",
            repo.dvcfs,
            "md5",
            ignore=repo.dvcignore,
        )
        transfer(
            staging,
            repo.odb.local,
            {obj.hash_info},
            shallow=False,
            hardlink=True,
        )
        # Exactly these five entries (files plus .dir manifests) may land in
        # the cache — nothing from the subrepo.
        assert set(cache_dir.glob("??/*")) == {
            cache_dir / "e1" / "d9e8eae5374860ae025ec84cfd85c7",
            cache_dir / "e1" / "d9e8eae5374860ae025ec84cfd85c7.dir",
            cache_dir / "37" / "b51d194a7513e45b56f6524f2d51f2",
            cache_dir / "94" / "7d2b84e5aa88170e80dff467a5bfb6",
            cache_dir / "ac" / "bd18db4cc2f85cedef654fccc4a4d8",
        }
Example #9
0
    def checkout_exp(self, rev, **kwargs):
        """Checkout an experiment to the user's workspace.

        rev: revision of the experiment to check out.
        Extra kwargs are forwarded to dvc checkout.

        Raises DvcException when git fails to apply the experiment patch.
        """
        from git.exc import GitCommandError

        from dvc.repo.checkout import checkout as dvc_checkout

        # Validate rev against a known baseline before moving HEAD.
        baseline_rev = self._check_baseline(rev)
        self._scm_checkout(rev)

        # Checkpoint experiments tolerate missing outputs on checkout.
        branch = self._get_branch_containing(rev)
        m = self.BRANCH_RE.match(branch) if branch else None
        if m and m.group("checkpoint"):
            kwargs.update({"allow_missing": True, "quiet": True})

        # Dump the diff between the experiment and its baseline to a temp
        # patch file (delete=False so git can reopen it by name).
        tmp = tempfile.NamedTemporaryFile(delete=False).name
        self.scm.repo.head.commit.diff(baseline_rev,
                                       patch=True,
                                       full_index=True,
                                       binary=True,
                                       output=tmp)

        # Preserve any local modifications before patching the workspace.
        dirty = self.repo.scm.is_dirty(untracked_files=True)
        if dirty:
            logger.debug("Stashing workspace changes.")
            self.repo.scm.repo.git.stash("push", "--include-untracked")

        try:
            if os.path.getsize(tmp):
                logger.debug("Patching local workspace")
                self.repo.scm.repo.git.apply(tmp, reverse=True)
                need_checkout = True
            else:
                # Empty patch: the workspace already matches the experiment.
                need_checkout = False
        except GitCommandError:
            raise DvcException("failed to apply experiment changes.")
        finally:
            remove(tmp)
            if dirty:
                self._unstash_workspace()
            # Drop stale packed executor args, if present.
            args_file = os.path.join(self.repo.tmp_dir, self.PACKED_ARGS_FILE)
            if os.path.exists(args_file):
                remove(args_file)

        # need_checkout is always bound here: any exception above propagates
        # past this point instead of reaching it.
        if need_checkout:
            dvc_checkout(self.repo, **kwargs)
Example #10
0
    def _pull_cached(self, out, path_info, dest):
        """Pull the cached data of *out* under *path_info* and place it at
        *dest*, going through a temporary checkout directory.

        Raises FileNotFoundError if the checkout could not produce the file.
        """
        with self.state:
            # Check out into a temp dir so only the requested sub-path ends
            # up moved to *dest*.
            tmp = PathInfo(tmp_fname(dest))
            src = tmp / path_info.relative_to(out.path_info)

            # Re-root the output at the temp location for the checkout below.
            out.path_info = tmp

            # Only pull unless all needed cache is present
            if out.changed_cache(filter_info=src):
                self.cloud.pull(out.get_used_cache(filter_info=src))

            failed = out.checkout(filter_info=src)

            # Order matters: move the wanted sub-path out first, then drop
            # the remaining temp tree.
            move(src, dest)
            remove(tmp)

            if failed:
                raise FileNotFoundError
Example #11
0
 def _stash_exp(self, *args, **kwargs):
     """Stash changes from the current (parent) workspace as an experiment.

     Returns the revision of the created stash entry.
     Raises UnchangedExperimentError when the workspace has no changes.
     """
     rev = self.scm.get_rev()
     # delete=False so git can reopen the patch file by name.
     tmp = tempfile.NamedTemporaryFile(delete=False).name
     try:
         self.repo.scm.repo.git.diff(patch=True, output=tmp)
         if os.path.getsize(tmp):
             logger.debug("Patching experiment workspace")
             self.scm.repo.git.apply(tmp)
         else:
             # Nothing to experiment on — workspace matches HEAD.
             raise UnchangedExperimentError(rev)
     finally:
         remove(tmp)
     # Pack the run arguments, then stash everything under a marker message
     # so the stash entry can later be recognized as this experiment.
     self._pack_args(*args, **kwargs)
     msg = f"{self.STASH_MSG_PREFIX}{rev}"
     self.scm.repo.git.stash("push", "-m", msg)
     return self.scm.resolve_rev("stash@{0}")
Example #12
0
def test_missing_cache(tmp_dir, dvc, run_copy_metrics):
    """metrics show must skip outputs whose file and cache are both gone."""
    tmp_dir.gen("metrics_t.yaml", "1.1")
    run_copy_metrics(
        "metrics_t.yaml",
        "metrics.yaml",
        metrics=["metrics.yaml"],
    )

    # This one should be skipped
    stage = run_copy_metrics(
        "metrics_t.yaml",
        "metrics2.yaml",
        metrics=["metrics2.yaml"],
    )
    missing = stage.outs[0]
    remove(missing.fspath)
    remove(missing.cache_path)

    assert dvc.metrics.show() == {"": {"metrics.yaml": 1.1}}
Example #13
0
def test_diff_no_cache(tmp_dir, scm, dvc):
    """diff between two revs must still work after the cache is wiped."""
    tmp_dir.dvc_gen({"dir": {"file": "file content"}}, commit="first")
    scm.tag("v1")
    tmp_dir.dvc_gen(
        {"dir": {"file": "modified file content"}}, commit="second"
    )
    scm.tag("v2")

    remove(dvc.cache.local.cache_dir)
    # invalidate_dir_info to force cache loading
    dvc.cache.local._dir_info = {}

    result = dvc.diff("v1", "v2")
    assert result["added"] == []
    assert result["deleted"] == []
    # Only the directory itself shows up as modified.
    assert first(result["modified"])["path"] == os.path.join("dir", "")
Example #14
0
def test_pipeline_file_target_ops(tmp_dir, dvc, run_copy, local_remote):
    """Push/pull targeted at a dvc.yaml file and at individual stages.

    Fix: the ``outs`` list was assigned twice with the identical value
    (once before and once after ``dvc.push()``); the duplicate is removed.
    """
    path = local_remote.url
    tmp_dir.dvc_gen("foo", "foo")
    run_copy("foo", "bar", single_stage=True)

    tmp_dir.dvc_gen("lorem", "lorem")
    run_copy("lorem", "lorem2", name="copy-lorem-lorem2")

    tmp_dir.dvc_gen("ipsum", "ipsum")
    run_copy("ipsum", "baz", name="copy-ipsum-baz")

    outs = ["foo", "bar", "lorem", "ipsum", "baz", "lorem2"]

    # Drop the run-cache so push only uploads data files.
    remove(dvc.stage_cache.cache_dir)

    dvc.push()

    # each one's a copy of other, hence 3
    assert len(recurse_list_dir(path)) == 3

    clean(outs, dvc)
    assert set(dvc.pull(["dvc.yaml"])["added"]) == {"lorem2", "baz"}

    clean(outs, dvc)
    assert set(dvc.pull()["added"]) == set(outs)

    # clean everything in remote and push
    from tests.dir_helpers import TmpDir

    clean(TmpDir(path).iterdir())
    dvc.push(["dvc.yaml:copy-ipsum-baz"])
    assert len(recurse_list_dir(path)) == 1

    clean(TmpDir(path).iterdir())
    dvc.push(["dvc.yaml"])
    assert len(recurse_list_dir(path)) == 2

    with pytest.raises(StageNotFound):
        dvc.push(["dvc.yaml:StageThatDoesNotExist"])

    with pytest.raises(StageNotFound):
        dvc.pull(["dvc.yaml:StageThatDoesNotExist"])
Example #15
0
def get(url, path, out=None, rev=None):
    """Fetch the tracked output *path* from the DVC repo at *url* and
    check it out at *out*.

    Raises GetDVCFileError if *out* would be a .dvc file,
    UrlNotDvcRepoError for non-DVC urls, and OutputNotFoundError when
    *path* is not a tracked output.
    """
    out = resolve_output(path, out)

    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # Note: we need to replace state, because in case of getting DVC
            # dependency on CIFS or NFS filesystems, sqlite-based state
            # will be unable to obtain lock
            repo.state = StateNoop()

            # Try any links possible to avoid data duplication.
            #
            # Not using symlink, because we need to remove cache after we are
            # done, and to make that work we would have to copy data over
            # anyway before removing the cache, so we might just copy it
            # right away.
            #
            # Also, we can't use theoretical "move" link type here, because
            # the same cache file might be used a few times in a directory.
            repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            o = repo.find_out_by_relpath(path)
            # Pull the needed cache, then check the output out at *out*.
            with repo.state:
                repo.cloud.pull(o.get_used_cache())
            o.path_info = PathInfo(os.path.abspath(out))
            with o.repo.state:
                o.checkout()

    except NotDvcRepoError:
        raise UrlNotDvcRepoError(url)
    except OutputNotFoundError:
        # Re-raise with the user-supplied path for a cleaner message.
        raise OutputNotFoundError(path)
    finally:
        # Always clean up the temporary cache directory.
        remove(tmp_dir)
Example #16
0
def test_rename_multiple_files_same_hashes(tmp_dir, scm, dvc):
    """Test diff by renaming >=2 instances of file with same hashes.

    DVC should be able to detect that they are renames, and should not include
    them in either of the `added` or the `deleted` section.
    """
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "subdir": {"foo": "foo"}}}, commit="commit #1"
    )
    # Drop the whole tree so regenerating under new names is a pure rename.
    remove(tmp_dir / "dir")
    # changing foo and subdir/foo to bar and subdir/bar respectively
    tmp_dir.dvc_gen(
        {"dir": {"bar": "foo", "subdir": {"bar": "foo"}}}, commit="commit #2"
    )
    # Only the directory manifest changes; the identical-content files are
    # reported exclusively under "renamed".
    assert dvc.diff("HEAD~") == {
        "added": [],
        "deleted": [],
        "modified": [
            {
                "hash": {
                    "new": "31b36b3ea5f4485e27f10578c47183b0.dir",
                    "old": "c7684c8b3b0d28cf80d5305e2d856bfc.dir",
                },
                "path": os.path.join("dir", ""),
            }
        ],
        "not in cache": [],
        "renamed": [
            {
                "hash": "acbd18db4cc2f85cedef654fccc4a4d8",
                "path": {
                    "new": os.path.join("dir", "bar"),
                    "old": os.path.join("dir", "foo"),
                },
            },
            {
                "hash": "acbd18db4cc2f85cedef654fccc4a4d8",
                "path": {
                    "new": os.path.join("dir", "subdir", "bar"),
                    "old": os.path.join("dir", "subdir", "foo"),
                },
            },
        ],
    }
Example #17
0
def test_subrepos_are_ignored(tmp_dir, erepo_dir):
    """Contents of a nested subrepo must be excluded when downloading and
    when saving the parent repo's directory with follow_subrepos=False.
    """
    subrepo = erepo_dir / "dir" / "subrepo"
    make_subrepo(subrepo, erepo_dir.scm)
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("dir/foo", "foo", commit="foo")
        erepo_dir.scm_gen("dir/bar", "bar", commit="bar")

    with subrepo.chdir():
        subrepo.dvc_gen({"file": "file"}, commit="add files on subrepo")

    with external_repo(os.fspath(erepo_dir)) as repo:
        repo.repo_tree.download(
            PathInfo(repo.root_dir) / "dir",
            PathInfo(tmp_dir / "out"),
            follow_subrepos=False,
        )
        # "subrepo" and its "file" must be absent from the download.
        expected_files = {"foo": "foo", "bar": "bar", ".gitignore": "/foo\n"}
        assert (tmp_dir / "out").read_text() == expected_files

        # clear cache to test saving to cache
        cache_dir = tmp_dir / repo.cache.local.cache_dir
        remove(cache_dir)
        makedirs(cache_dir)

        # The directory hash must also ignore the subrepo's contents.
        expected_hash = HashInfo("md5", "e1d9e8eae5374860ae025ec84cfd85c7.dir")
        assert (
            repo.repo_tree.get_hash(
                os.path.join(repo.root_dir, "dir"), follow_subrepos=False
            )
            == expected_hash
        )

        repo.cache.local.save(
            PathInfo(repo.root_dir) / "dir",
            repo.repo_tree,
            expected_hash,
            link=False,
        )
        # Exactly these entries may land in the cache — nothing from the
        # subrepo.
        assert set(cache_dir.glob("*/*")) == {
            cache_dir / "e1" / "d9e8eae5374860ae025ec84cfd85c7.dir",
            cache_dir / "37" / "b51d194a7513e45b56f6524f2d51f2",
            cache_dir / "94" / "7d2b84e5aa88170e80dff467a5bfb6",
            cache_dir / "ac" / "bd18db4cc2f85cedef654fccc4a4d8",
        }
Example #18
0
def test_checkout_targets_deps(tmp_dir, scm, dvc, exp_stage):
    """An experiment run must restore deleted deps but not unrelated files."""
    from dvc.utils.fs import remove

    tmp_dir.dvc_gen({"foo": "foo", "bar": "bar"}, commit="add files")
    stage = dvc.stage.add(
        cmd="python copy.py params.yaml metrics.yaml",
        metrics_no_cache=["metrics.yaml"],
        params=["foo"],
        name="copy-file",
        deps=["copy.py", "foo"],
        force=True,
    )
    # Delete a dep ("foo") and a file that is not a dep ("bar").
    for fname in ("foo", "bar"):
        remove(fname)

    dvc.experiments.run(stage.addressing, params=["foo=2"])

    foo = tmp_dir / "foo"
    assert foo.exists()
    assert foo.read_text() == "foo"
    # "bar" is not a dep of the stage, so it must stay deleted.
    assert not (tmp_dir / "bar").exists()
Example #19
0
    def _unprotect_file(path):
        """Turn *path* into a standalone, user-writable file.

        If *path* is a sym/hardlink into the cache, replace it with a real
        copy so that editing it cannot corrupt the shared cache entry;
        in any case make the file writable at the end.
        """
        if System.is_symlink(path) or System.is_hardlink(path):
            logger.debug("Unprotecting '{}'".format(path))
            # Hidden temp name in the same directory, so the final rename
            # stays on one filesystem.
            tmp = os.path.join(os.path.dirname(path), "." + uuid())

            # The operations order is important here - if some application
            # would access the file during the process of copyfile then it
            # would get only the part of file. So, at first, the file should be
            # copied with the temporary name, and then original file should be
            # replaced by new.
            copyfile(path, tmp, name="Unprotecting '{}'".format(relpath(path)))
            remove(path)
            os.rename(tmp, path)

        else:
            logger.debug("Skipping copying for '{}', since it is not "
                         "a symlink or a hardlink.".format(path))

        os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)
Example #20
0
    def _reproduce(self, executors: dict, jobs: Optional[int] = 1) -> dict:
        """Run dvc repro for the specified ExperimentExecutors in parallel.

        executors: mapping of experiment rev -> executor to run.
        jobs: maximum number of parallel worker processes.

        Returns dict containing successfully executed experiments.
        """
        result = {}

        with ProcessPoolExecutor(max_workers=jobs) as workers:
            futures = {}
            for rev, executor in executors.items():
                future = workers.submit(
                    executor.reproduce,
                    executor.dvc_dir,
                    cwd=executor.dvc.root_dir,
                    **executor.repro_kwargs,
                )
                futures[future] = (rev, executor)
            # Collect results as each run finishes, in completion order.
            for future in as_completed(futures):
                rev, executor = futures[future]
                exc = future.exception()
                if exc is None:
                    exp_hash = future.result()
                    logger.debug(f"ran exp based on {executor.baseline_rev}")
                    # Commit the collected output on top of the baseline.
                    self._scm_checkout(executor.baseline_rev)
                    self._collect_output(executor.baseline_rev, executor)
                    remove(self.args_file)
                    try:
                        exp_rev = self._commit(exp_hash)
                    except UnchangedExperimentError:
                        # Identical to baseline: report the baseline itself.
                        logger.debug(
                            "Experiment '%s' identical to baseline '%s'",
                            rev,
                            executor.baseline_rev,
                        )
                        exp_rev = executor.baseline_rev
                    logger.info("Reproduced experiment '%s'.", exp_rev[:7])
                    result[rev] = {exp_rev: exp_hash}
                else:
                    # Failed runs are logged and skipped; cleanup still runs.
                    logger.exception("Failed to reproduce experiment '%s'",
                                     rev)
                executor.cleanup()

        return result
Example #21
0
def test_api_missing_local_cache_exists_on_remote(
    tmp_dir,
    scm,
    dvc,
    as_external,
    remote,
    files,
    to_read,
):
    """api.read must fall back to the remote when the local cache is gone."""
    tmp_dir.dvc_gen(files, commit="DVC track files")
    dvc.push()

    # Remove cache to make foo missing
    remove(dvc.cache.local.cache_dir)
    remove(first(files))

    if as_external:
        repo_url = f"file://{tmp_dir}"
    else:
        repo_url = None

    expected = get_in(files, to_read.split(os.sep))
    assert api.read(to_read, repo=repo_url) == expected
Example #22
0
File: apply.py Project: jear/dvc
def apply(repo: "Repo", rev: str, force: bool = True, **kwargs):
    from scmrepo.exceptions import SCMError as _SCMError

    from dvc.repo.checkout import checkout as dvc_checkout
    from dvc.scm import GitMergeError, RevError, resolve_rev

    exps = repo.experiments

    try:
        exp_rev = resolve_rev(repo.scm, rev)
        exps.check_baseline(exp_rev)
    except (RevError, BaselineMismatchError) as exc:
        raise InvalidExpRevError(rev) from exc

    stash_rev = exp_rev in exps.stash_revs
    if not stash_rev and not exps.get_branch_by_rev(exp_rev,
                                                    allow_multiple=True):
        raise InvalidExpRevError(exp_rev)

    # NOTE: we don't use scmrepo's stash_workspace() here since we need
    # finer control over the merge behavior when we unstash everything
    with _apply_workspace(repo, rev, force):
        try:
            repo.scm.merge(exp_rev, commit=False, squash=True)
        except _SCMError as exc:
            raise GitMergeError(str(exc), scm=repo.scm)

    repo.scm.reset()

    if stash_rev:
        args_path = os.path.join(repo.tmp_dir, BaseExecutor.PACKED_ARGS_FILE)
        if os.path.exists(args_path):
            remove(args_path)

    dvc_checkout(repo, **kwargs)

    repo.scm.set_ref(EXEC_APPLY, exp_rev)
    logger.info(
        "Changes for experiment '%s' have been applied to your current "
        "workspace.",
        rev,
    )
Example #23
0
File: get.py Project: baajur/dvc
def get(url, path, out=None, rev=None, jobs=None):
    """Download *path* from the repo at *url* into *out*.

    rev: optional git revision to fetch from.
    jobs: parallelism forwarded to the download.

    Raises GetDVCFileError if *out* would itself be a .dvc file.
    """
    import shortuuid

    from dvc.dvcfile import is_valid_filename
    from dvc.external_repo import external_repo

    out = resolve_output(path, out)

    if is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))

    # Try any links possible to avoid data duplication.
    #
    # Not using symlink, because we need to remove cache after we
    # are done, and to make that work we would have to copy data
    # over anyway before removing the cache, so we might just copy
    # it right away.
    #
    # Also, we can't use theoretical "move" link type here, because
    # the same cache file might be used a few times in a directory.
    cache_types = ["reflink", "hardlink", "copy"]
    try:
        with external_repo(url=url,
                           rev=rev,
                           cache_dir=tmp_dir,
                           cache_types=cache_types) as repo:
            from_info = PathInfo(repo.root_dir) / path
            to_info = PathInfo(out)
            repo.repo_tree.download(from_info,
                                    to_info,
                                    jobs=jobs,
                                    follow_subrepos=False)
    finally:
        # Always drop the temporary cache dir created next to the output.
        remove(tmp_dir)
Example #24
0
def test_show_dir_plots(tmp_dir, dvc, run_copy_metrics):
    """plots show should collect plots from a directory target."""
    subdir = tmp_dir / "subdir"
    subdir.mkdir()
    metric = [
        {"first_val": 100, "val": 2},
        {"first_val": 200, "val": 3},
    ]

    fname = "file.json"
    _write_json(tmp_dir, metric, fname)

    p1 = os.path.join("subdir", "p1.json")
    p2 = os.path.join("subdir", "p2.json")
    tmp_dir.dvc.run(
        cmd=(f"mkdir subdir && python copy.py {fname} {p1} && "
             f"python copy.py {fname} {p2}"),
        deps=[fname],
        single_stage=False,
        plots=["subdir"],
        name="copy_double",
    )

    # Both copies inside the directory are picked up and identical.
    result = dvc.plots.show(targets=["subdir"])
    assert json.loads(result[p1]) == json.loads(result[p2])

    # Targeting one file narrows the result to that file only.
    result = dvc.plots.show(targets=[p1])
    assert set(result.keys()) == {p1}

    # With both the cache and the workspace copies gone, nothing parses.
    remove(dvc.odb.local.cache_dir)
    remove(subdir)
    with pytest.raises(NoMetricsParsedError):
        dvc.plots.show()
Example #25
0
def test_open_external(tmp_dir, erepo_dir, cloud):
    """api.open/api.read must fetch from an external repo's remote.

    Fix: ``branch(..., new="True")`` passed a string for a boolean flag —
    it only worked because any non-empty string is truthy; use ``new=True``.
    """
    erepo_dir.add_remote(config=cloud.config)

    with erepo_dir.chdir():
        erepo_dir.dvc_gen("version", "master", commit="add version")

        with erepo_dir.branch("branch", new=True):
            # NOTE: need file to be other size for Mac
            erepo_dir.dvc_gen("version", "branchver", commit="add version")

    erepo_dir.dvc.push(all_branches=True)

    # Remove cache to force download
    remove(erepo_dir.dvc.cache.local.cache_dir)

    # Using file url to force clone to tmp repo
    repo_url = f"file://{erepo_dir}"
    with api.open("version", repo=repo_url) as fd:
        assert fd.read() == "master"

    assert api.read("version", repo=repo_url, rev="branch") == "branchver"
Example #26
0
def test_open_external(remote_url, erepo_dir):
    """api.open/api.read must fetch from an external repo's remote.

    Fix: ``branch(..., new="True")`` passed a string for a boolean flag —
    it only worked because any non-empty string is truthy; use ``new=True``.
    """
    _set_remote_url_and_commit(erepo_dir.dvc, remote_url)

    with erepo_dir.chdir():
        erepo_dir.dvc_gen("version", "master", commit="add version")

        with erepo_dir.branch("branch", new=True):
            # NOTE: need file to be other size for Mac
            erepo_dir.dvc_gen("version", "branchver", commit="add version")

    erepo_dir.dvc.push(all_branches=True)

    # Remove cache to force download
    remove(erepo_dir.dvc.cache.local.cache_dir)

    # Using file url to force clone to tmp repo
    repo_url = "file://{}".format(erepo_dir)
    with api.open("version", repo=repo_url) as fd:
        assert fd.read() == "master"

    assert api.read("version", repo=repo_url, rev="branch") == "branchver"
Example #27
0
def test_no_cache_entry(tmp_dir, scm, dvc):
    """diff must still report changes when both the cache and the state db
    have been wiped.
    """
    tmp_dir.dvc_gen("file", "first", commit="add a file")

    tmp_dir.dvc_gen({"dir": {"1": "1", "2": "2"}})
    tmp_dir.dvc_gen("file", "second")

    # Drop the cache and the state database to force hash re-computation.
    remove(tmp_dir / ".dvc" / "cache")
    (tmp_dir / ".dvc" / "tmp" / "state").unlink()

    dir_checksum = "5fb6b29836c388e093ca0715c872fe2a.dir"

    assert dvc.diff() == {
        "added": [{"path": os.path.join("dir", ""), "hash": dir_checksum}],
        "deleted": [],
        "modified": [
            {
                "path": "file",
                "hash": {"old": digest("first"), "new": digest("second")},
            }
        ],
    }
Example #28
0
def test_collect_non_existing_dir(tmp_dir, dvc, run_copy_metrics):
    """plots show must report an error for a missing plots directory while
    still rendering the remaining intact plots.
    """
    subdir = tmp_dir / "subdir"
    subdir.mkdir()

    metric = [{"first_val": 100, "val": 2}, {"first_val": 200, "val": 3}]
    subdir_metric = [{"y": 101, "x": 3}, {"y": 202, "x": 4}]

    pname = "source.json"
    (tmp_dir / pname).dump_json(metric, sort_keys=True)

    sname = "subdir_source.json"
    (tmp_dir / sname).dump_json(subdir_metric, sort_keys=True)

    p1 = os.path.join("subdir", "p1.json")
    p2 = os.path.join("subdir", "p2.json")
    subdir_stage = tmp_dir.dvc.run(
        cmd=(
            f"mkdir subdir && python copy.py {sname} {p1} && "
            f"python copy.py {sname} {p2}"
        ),
        deps=[sname],
        single_stage=False,
        plots=["subdir"],
        name="copy_double",
    )

    run_copy_metrics(
        pname,
        "plot.json",
        plots=["plot.json"],
        commit="there is metric",
    )

    # Wipe the subdir plots from both the cache and the workspace.
    remove(subdir_stage.outs[0].cache_path)
    remove(subdir_stage.outs[0].fs_path)

    result = dvc.plots.show()
    # The missing directory is reported as an error entry...
    assert "error" in result["workspace"]["data"]["subdir"]
    # make sure others gets loaded
    assert result["workspace"]["data"]["plot.json"]["data"] == metric
Example #29
0
def test_pull_imported_stage_from_subrepos(tmp_dir, dvc, erepo_dir, is_dvc,
                                           files):
    """Pulling an import that targets a path inside a subrepo must work."""
    subrepo = erepo_dir / "subrepo"
    make_subrepo(subrepo, erepo_dir.scm)
    gen = subrepo.dvc_gen if is_dvc else subrepo.scm_gen
    with subrepo.chdir():
        gen(files, commit="files in subrepo")

    key = first(files)
    dvc.imp(os.fspath(erepo_dir), os.path.join("subrepo", key), out="out")

    # clean everything
    cache_dir = dvc.odb.local.cache_dir
    remove(cache_dir)
    remove("out")
    makedirs(cache_dir)

    stats = dvc.pull(["out.dvc"])

    # Directory outs are reported with a trailing path separator.
    if isinstance(files[key], dict):
        expected = [f"out{os.sep}"]
    else:
        expected = ["out"]
    assert stats["added"] == expected
    assert (tmp_dir / "out").read_text() == files[key]
Example #30
0
def test_branch_config(tmp_dir, scm):
    """Repo(rev=...) must read config from that revision, not the worktree."""
    tmp_dir.scm_gen("foo", "foo", commit="init")

    # Initialize DVC only on a side branch, with a branch-local remote.
    scm.checkout("branch", create_new=True)
    dvc = Repo.init()
    with dvc.config.edit() as conf:
        conf["remote"]["branch"] = {"url": "/some/path"}
    scm.add([".dvc"])
    scm.commit("init dvc")
    scm.checkout("master")

    # The .dvc directory now only exists in the "branch" revision.
    remove(".dvc")

    # sanity check
    with pytest.raises(NotDvcRepoError):
        Repo()
    with pytest.raises(NotDvcRepoError):
        Repo(scm=scm, rev="master")

    branch_repo = Repo(scm=scm, rev="branch")
    assert branch_repo.config["remote"]["branch"]["url"] == "/some/path"