Example #1
def _open(path, repo=None, rev=None, remote=None, mode="r", encoding=None):
    with Repo.open(repo, rev=rev, subrepos=True, uninitialized=True) as _repo:
        with _repo.open_by_relpath(
            path, remote=remote, mode=mode, encoding=encoding
        ) as fd:
            yield fd
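
The `_open` helper above is a generator that yields an open file object, so callers consume it as a context manager (in dvc.api it is wrapped before being exposed). A minimal usage sketch, assuming the generator is wired up with `contextlib.contextmanager`; the wrapper name and the repository URL below are illustrative, not part of dvc:

from contextlib import contextmanager

# Hypothetical wrapper: turn the `_open` generator into a context manager.
open_tracked = contextmanager(_open)

# Read a tracked file from a DVC repository at a specific revision.
with open_tracked(
    "data/model.pkl",
    repo="https://github.com/example/project",  # assumed URL, for illustration
    rev="v1.0",
    mode="rb",
) as fd:
    payload = fd.read()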
Example #2
    def setUp(self):
        super().setUp()
        ret = main(["config", "cache.type", "hardlink"])
        self.assertEqual(ret, 0)
        self.dvc = DvcRepo(".")
Example #3
def test_absolute_file_outside_git_repo(tmp_dir, erepo_dir):
    erepo_dir.scm.repo.index.remove([erepo_dir.dvc.dvc_dir], r=True)
    erepo_dir.scm.commit("remove dvc")

    with pytest.raises(PathMissingError):
        Repo.get(fspath(erepo_dir), "/root/")
Example #4
    def exp_dvc(self):
        """Return clone dvc Repo instance."""
        from dvc.repo import Repo

        return Repo(self.exp_dvc_dir)
Example #5
def stages():
    return set(stage.relpath for stage in Repo(fspath(tmp_dir)).stages)
Example #6
def test_api_init(scm):
    DvcRepo.init().close()
    assert os.path.isdir(DvcRepo.DVC_DIR)
Example #7
def test_init_no_scm_api(tmp_dir):
    repo = DvcRepo.init(no_scm=True)

    assert (tmp_dir / DvcRepo.DVC_DIR).is_dir()
    assert repo.config["core"]["no_scm"]
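
Together with Examples #6 and #8, the pattern is: `DvcRepo.init()` expects an existing Git repository and raises `InitError` otherwise, while `no_scm=True` initializes DVC without any SCM. A minimal sketch of both paths, assuming dvc is installed and that `InitError` lives in `dvc.exceptions` (directory handling below is illustrative):

import os
import tempfile

from dvc.exceptions import InitError
from dvc.repo import Repo as DvcRepo

os.chdir(tempfile.mkdtemp())

# Without a Git repository and without no_scm, init is expected to fail.
try:
    DvcRepo.init()
except InitError:
    pass

# Initializing without SCM creates .dvc/ and records core.no_scm in the config.
repo = DvcRepo.init(no_scm=True)
assert os.path.isdir(DvcRepo.DVC_DIR)
assert repo.config["core"]["no_scm"]
repo.close()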
Example #8
    def test_api(self):
        with self.assertRaises(InitError):
            DvcRepo.init()
Example #9
    def reproduce(
        cls,
        dvc_dir: str,
        queue: "Queue",
        rev: str,
        cwd: Optional[str] = None,
        name: Optional[str] = None,
    ) -> Tuple[Optional[str], bool]:
        """Run dvc repro and return the result.

        Returns tuple of (exp_hash, force) where exp_hash is the experiment
            hash (or None on error) and force is a bool specifying whether or
            not this experiment should force overwrite any existing duplicates.
        """
        unchanged = []

        queue.put((rev, os.getpid()))

        def filter_pipeline(stages):
            unchanged.extend([
                stage for stage in stages if isinstance(stage, PipelineStage)
            ])

        result = None
        force = False
        # Initialized up front so the `finally` block below never references
        # an unbound name if Repo() or os.getcwd() raises before they are set.
        scm = None
        old_cwd = None

        try:
            dvc = Repo(dvc_dir)
            old_cwd = os.getcwd()
            new_cwd = cwd if cwd else dvc.root_dir
            os.chdir(new_cwd)
            logger.debug("Running repro in '%s'", cwd)

            args_path = os.path.join(dvc.tmp_dir,
                                     BaseExecutor.PACKED_ARGS_FILE)
            if os.path.exists(args_path):
                args, kwargs = BaseExecutor.unpack_repro_args(args_path)
                remove(args_path)
            else:
                args = []
                kwargs = {}

            force = kwargs.get("force", False)

            # NOTE: for checkpoint experiments we handle persist outs slightly
            # differently than normal:
            #
            # - checkpoint out may not yet exist if this is the first time this
            #   experiment has been run, this is not an error condition for
            #   experiments
            # - at the start of a repro run, we need to remove the persist out
            #   and restore it to its last known (committed) state (which may
            #   be removed/does not yet exist) so that our executor workspace
            #   is not polluted with the (persistent) out from an unrelated
            #   experiment run
            dvc.checkout(force=True, quiet=True)

            # We cannot use dvc.scm to make commits inside the executor since
            # cached props are not picklable.
            scm = Git()
            checkpoint_func = partial(cls.checkpoint_callback, scm, name)
            stages = dvc.reproduce(
                *args,
                on_unchanged=filter_pipeline,
                checkpoint_func=checkpoint_func,
                **kwargs,
            )

            exp_hash = cls.hash_exp(stages)
            exp_rev = cls.commit(scm, exp_hash, exp_name=name)
            if scm.get_ref(EXEC_CHECKPOINT):
                scm.set_ref(EXEC_CHECKPOINT, exp_rev)
            result = exp_hash  # matches the (exp_hash, force) contract in the docstring
        except UnchangedExperimentError:
            pass
        finally:
            if scm:
                scm.close()
                del scm
            if old_cwd:
                os.chdir(old_cwd)

        # ideally we would return stages here like a normal repro() call, but
        # stages is not currently picklable and cannot be returned across
        # multiprocessing calls
        return result, force
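
The `queue.put((rev, os.getpid()))` handshake at the top of `reproduce` implies it runs in a worker process while the parent waits for the `(rev, pid)` pair and then for the `(exp_hash, force)` result. A rough driver sketch under that assumption; the function names are hypothetical and this is not dvc's own scheduling code:

import multiprocessing as mp


def _worker(executor_cls, dvc_dir, queue, rev, name, result_queue):
    # Child process: run the repro and hand back the (exp_hash, force) pair.
    result_queue.put(executor_cls.reproduce(dvc_dir, queue, rev, name=name))


def run_experiment(executor_cls, dvc_dir, rev, name=None):
    queue = mp.Queue()        # used by reproduce() for the (rev, pid) handshake
    result_queue = mp.Queue()
    proc = mp.Process(
        target=_worker,
        args=(executor_cls, dvc_dir, queue, rev, name, result_queue),
    )
    proc.start()
    started_rev, worker_pid = queue.get()  # worker announces which rev it runs
    exp_hash, force = result_queue.get()   # matches reproduce()'s return value
    proc.join()
    return exp_hash, force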
Example #10
def _list_files(repo, path=None):
    return set(map(itemgetter("path"), Repo.ls(os.fspath(repo), path)))
Example #11
    def test_api(self):
        DvcRepo.init()

        self._test_init()
Example #12
def _ls(path):
    return Repo.ls(os.fspath(erepo_dir), path)
Example #13
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):

    # `gc` requires an explicit scope: `workspace` must be enabled for it to
    # take effect, and it is treated as enabled whenever any of `all_tags`,
    # `all_commits`, or `all_branches` is set.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
    )

    from contextlib import ExitStack
    from dvc.repo import Repo

    all_repos = []

    if repos:
        all_repos = [Repo(path) for path in repos]

    with ExitStack() as stack:
        for repo in all_repos:
            stack.enter_context(repo.lock)
            stack.enter_context(repo.state)

        used = NamedCache()
        for repo in all_repos + [self]:
            used.update(
                repo.used_cache(
                    all_branches=all_branches,
                    with_deps=with_deps,
                    all_tags=all_tags,
                    all_commits=all_commits,
                    remote=remote,
                    force=force,
                    jobs=jobs,
                ))

    _do_gc("local", self.cache.local.gc, used, jobs)

    if self.cache.s3:
        _do_gc("s3", self.cache.s3.gc, used, jobs)

    if self.cache.gs:
        _do_gc("gs", self.cache.gs.gc, used, jobs)

    if self.cache.ssh:
        _do_gc("ssh", self.cache.ssh.gc, used, jobs)

    if self.cache.hdfs:
        _do_gc("hdfs", self.cache.hdfs.gc, used, jobs)

    if self.cache.azure:
        _do_gc("azure", self.cache.azure.gc, used, jobs)

    if cloud:
        _do_gc("remote", self.cloud.get_remote(remote, "gc -c").gc, used, jobs)
Example #14
    def setUp(self):
        super(TestShouldNotCheckoutUponCorruptedLocalHardlinkCache,
              self).setUp()
        ret = main(["config", "cache.type", "hardlink"])
        self.assertEqual(ret, 0)
        self.dvc = DvcRepo(".")
Example #15
File: root.py Project: ye-man/dvc
    def run(self):
        logger.info(relpath(Repo.find_root()))
        return 0
Example #16
def test_run_without_cmd(kwargs):
    with pytest.raises(InvalidArgumentError) as exc:
        Repo().run(**kwargs)
    assert "command is not specified" == str(exc.value)
Example #17
File: test_gc.py Project: yk/dvc
class TestGCMultipleDvcRepos(TestDvcGit):
    def _check_cache(self, num):
        total = 0
        for root, dirs, files in os.walk(os.path.join(".dvc", "cache")):
            total += len(files)
        self.assertEqual(total, num)

    def setUp(self):
        super(TestGCMultipleDvcRepos, self).setUp()
        self.additional_path = TestDir.mkdtemp()
        self.additional_git = Repo.init(self.additional_path)
        self.additional_dvc = DvcRepo.init(self.additional_path)

        cache_path = os.path.join(self._root_dir, ".dvc", "cache")
        config_path = os.path.join(self.additional_path, ".dvc",
                                   "config.local")
        cfg = configobj.ConfigObj()
        cfg.filename = config_path
        cfg["cache"] = {"dir": cache_path}
        cfg.write()

        self.additional_dvc = DvcRepo(self.additional_path)

    def test(self):

        # ADD FILE ONLY IN MAIN PROJECT
        fname = "only_in_first"
        with open(fname, "w+") as fobj:
            fobj.write("only in main repo")

        stages = self.dvc.add(fname)
        self.assertEqual(len(stages), 1)

        # ADD FILE IN MAIN PROJECT THAT IS ALSO IN SECOND PROJECT
        fname = "in_both"
        with open(fname, "w+") as fobj:
            fobj.write("in both repos")

        stages = self.dvc.add(fname)
        self.assertEqual(len(stages), 1)

        cwd = os.getcwd()
        os.chdir(self.additional_path)
        # ADD FILE ONLY IN SECOND PROJECT
        fname = os.path.join(self.additional_path, "only_in_second")
        with open(fname, "w+") as fobj:
            fobj.write("only in additional repo")

        stages = self.additional_dvc.add(fname)
        self.assertEqual(len(stages), 1)

        # ADD FILE IN SECOND PROJECT THAT IS ALSO IN MAIN PROJECT
        fname = os.path.join(self.additional_path, "in_both")
        with open(fname, "w+") as fobj:
            fobj.write("in both repos")

        stages = self.additional_dvc.add(fname)
        self.assertEqual(len(stages), 1)

        os.chdir(cwd)

        self._check_cache(3)

        self.dvc.gc(repos=[self.additional_path])
        self._check_cache(3)

        self.dvc.gc()
        self._check_cache(2)
Example #18
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    all_experiments=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):

    # `gc` requires an explicit scope: `workspace` must be enabled for it to
    # take effect, and it is treated as enabled whenever any of `all_tags`,
    # `all_commits`, `all_experiments`, or `all_branches` is set.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
        all_experiments=all_experiments,
    )

    from contextlib import ExitStack

    from dvc.data.db import get_index
    from dvc.data.gc import gc as ogc
    from dvc.repo import Repo

    if not repos:
        repos = []
    all_repos = [Repo(path) for path in repos]

    used_obj_ids = set()
    with ExitStack() as stack:
        for repo in all_repos:
            stack.enter_context(repo.lock)

        for repo in all_repos + [self]:
            for obj_ids in repo.used_objs(
                    all_branches=all_branches,
                    with_deps=with_deps,
                    all_tags=all_tags,
                    all_commits=all_commits,
                    all_experiments=all_experiments,
                    remote=remote,
                    force=force,
                    jobs=jobs,
            ).values():
                used_obj_ids.update(obj_ids)

    for scheme, odb in self.odb.by_scheme():
        if not odb:
            continue

        removed = ogc(odb, used_obj_ids, jobs=jobs)
        if not removed:
            logger.info(f"No unused '{scheme}' cache to remove.")

    if not cloud:
        return

    odb = self.cloud.get_remote_odb(remote, "gc -c")
    removed = ogc(odb, used_obj_ids, jobs=jobs)
    if removed:
        get_index(odb).clear()
    else:
        logger.info("No unused cache to remove from remote.")
Example #19
def test_init_no_scm_fail_api(tmp_dir):
    with pytest.raises(InitError):
        DvcRepo.init()
Example #20
def test_ls_repo_with_removed_dvc_dir_with_path_file(tmp_dir, dvc, scm):
    create_dvc_pipeline(tmp_dir, dvc)

    path = os.path.join("out", "file")
    files = Repo.ls(os.fspath(tmp_dir), path)
    match_files(files, ((("file", ), True), ))
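
`Repo.ls` returns a list of dicts describing each entry, which is why Example #10 extracts just the "path" key and why `match_files` here checks a name/is-output pair. A small sketch of inspecting that structure; the repository URL is illustrative, and keys other than "path" and "isout" are assumptions that may vary across dvc versions:

from dvc.repo import Repo

# List the contents of the "out" directory of a repository (URL or local path).
entries = Repo.ls("https://github.com/example/project", "out")
for entry in entries:
    # Each entry is a dict; "path" holds the name and "isout" marks
    # DVC-tracked outputs.
    print(entry["path"], entry.get("isout", False))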
Example #21
def test_gen_dvcignore(tmp_dir):
    DvcRepo.init(no_scm=True)
    text = ("# Add patterns of files dvc should ignore, which could improve\n"
            "# the performance. Learn more at\n"
            "# https://dvc.org/doc/user-guide/dvcignore\n")
    assert text == (tmp_dir / ".dvcignore").read_text()
Example #22
def test_ls_not_existed_url():
    from time import time

    dirname = "__{}_{}".format("not_existed", time())
    with pytest.raises(CloneError):
        Repo.ls(dirname, recursive=True)
Example #23
def _scm_in_use():
    try:
        scm = SCM(root_dir=Repo.find_root())
        return type(scm).__name__
    except NotDvcRepoError:
        pass
Example #24
class TestReproExternalHTTP(TestReproExternalBase):
    _external_cache_id = None

    @staticmethod
    def get_remote(port):
        return "http://localhost:{}/".format(port)

    @property
    def local_cache(self):
        return os.path.join(self.dvc.dvc_dir, "cache")

    def test(self):
        # Import
        with StaticFileServer() as httpd:
            import_url = urljoin(self.get_remote(httpd.server_port), self.FOO)
            import_output = "imported_file"
            import_stage = self.dvc.imp_url(import_url, import_output)

        self.assertTrue(os.path.exists(import_output))
        self.assertTrue(filecmp.cmp(import_output, self.FOO, shallow=False))

        self.dvc.remove("imported_file.dvc")

        with StaticFileServer(handler_class=ContentMD5Handler) as httpd:
            import_url = urljoin(self.get_remote(httpd.server_port), self.FOO)
            import_output = "imported_file"
            import_stage = self.dvc.imp_url(import_url, import_output)

        self.assertTrue(os.path.exists(import_output))
        self.assertTrue(filecmp.cmp(import_output, self.FOO, shallow=False))

        # Run --deps
        with StaticFileServer() as httpd:
            remote = self.get_remote(httpd.server_port)

            cache_id = str(uuid.uuid4())
            cache = urljoin(remote, cache_id)

            ret1 = main(["remote", "add", "mycache", cache])
            ret2 = main(["remote", "add", "myremote", remote])
            self.assertEqual(ret1, 0)
            self.assertEqual(ret2, 0)

            self.dvc = DvcRepo(".")

            run_dependency = urljoin(remote, self.BAR)
            run_output = "remote_file"
            cmd = 'open("{}", "w+")'.format(run_output)

            with open("create-output.py", "w") as fd:
                fd.write(cmd)

            run_stage = self.dvc.run(
                deps=[run_dependency],
                outs=[run_output],
                cmd="python create-output.py",
            )
            self.assertTrue(run_stage is not None)

            self.assertTrue(os.path.exists(run_output))

            # Pull
            self.dvc.remove(import_stage.path, outs_only=True)
            self.assertFalse(os.path.exists(import_output))

            shutil.move(self.local_cache, cache_id)
            self.assertFalse(os.path.exists(self.local_cache))

            self.dvc.pull([import_stage.path], remote="mycache")

            self.assertTrue(os.path.exists(import_output))
Example #25
    def setUp(self):
        super(TestDvcFixture, self).setUp()
        self.dvc = DvcRepo.init(self._root_dir)
        self.dvc.scm.commit("init dvc")
Example #26
class TestReproExternalBase(TestDvc):
    @staticmethod
    def should_test():
        return False

    @property
    def cache_scheme(self):
        return self.scheme

    @property
    def cache_type(self):
        return "copy"

    @property
    def scheme(self):
        return None

    @property
    def scheme_sep(self):
        return "://"

    @property
    def sep(self):
        return "/"

    def check_already_cached(self, stage):
        stage.outs[0].remove()

        patch_download = patch.object(stage.deps[0],
                                      "download",
                                      wraps=stage.deps[0].download)

        patch_checkout = patch.object(stage.outs[0],
                                      "checkout",
                                      wraps=stage.outs[0].checkout)

        patch_run = patch.object(stage, "_run", wraps=stage._run)

        with self.dvc.lock, self.dvc.state:
            with patch_download as mock_download:
                with patch_checkout as mock_checkout:
                    with patch_run as mock_run:
                        stage.locked = False
                        stage.run()
                        stage.locked = True

                        mock_run.assert_not_called()
                        mock_download.assert_not_called()
                        mock_checkout.assert_called_once()

    @patch("dvc.prompt.confirm", return_value=True)
    def test(self, mock_prompt):
        if not self.should_test():
            raise SkipTest("Test {} is disabled".format(
                self.__class__.__name__))

        cache = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                 str(uuid.uuid4()))

        ret = main(["config", "cache." + self.cache_scheme, "myrepo"])
        self.assertEqual(ret, 0)
        ret = main(["remote", "add", "myrepo", cache])
        self.assertEqual(ret, 0)
        ret = main(["remote", "modify", "myrepo", "type", self.cache_type])
        self.assertEqual(ret, 0)

        remote_name = "myremote"
        remote_key = str(uuid.uuid4())
        remote = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                  remote_key)

        ret = main(["remote", "add", remote_name, remote])
        self.assertEqual(ret, 0)
        ret = main(["remote", "modify", remote_name, "type", self.cache_type])
        self.assertEqual(ret, 0)

        self.dvc = DvcRepo(".")

        foo_key = remote_key + self.sep + self.FOO
        bar_key = remote_key + self.sep + self.BAR

        foo_path = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                    foo_key)
        bar_path = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                    bar_key)

        # Using both plain and remote notation
        out_foo_path = "remote://" + remote_name + "/" + self.FOO
        out_bar_path = bar_path

        self.write(self.bucket, foo_key, self.FOO_CONTENTS)

        import_stage = self.dvc.imp_url(out_foo_path, "import")

        self.assertTrue(os.path.exists("import"))
        self.assertTrue(filecmp.cmp("import", self.FOO, shallow=False))
        self.assertEqual(self.dvc.status([import_stage.path]), {})
        self.check_already_cached(import_stage)

        import_remote_stage = self.dvc.imp_url(out_foo_path,
                                               out_foo_path + "_imported")
        self.assertEqual(self.dvc.status([import_remote_stage.path]), {})

        cmd_stage = self.dvc.run(
            outs=[out_bar_path],
            deps=[out_foo_path],
            cmd=self.cmd(foo_path, bar_path),
        )

        self.assertEqual(self.dvc.status([cmd_stage.path]), {})
        self.assertEqual(self.dvc.status(), {})
        self.check_already_cached(cmd_stage)

        self.write(self.bucket, foo_key, self.BAR_CONTENTS)

        self.assertNotEqual(self.dvc.status(), {})

        self.dvc.update(import_stage.path)
        self.assertTrue(os.path.exists("import"))
        self.assertTrue(filecmp.cmp("import", self.BAR, shallow=False))
        self.assertEqual(self.dvc.status([import_stage.path]), {})

        self.dvc.update(import_remote_stage.path)
        self.assertEqual(self.dvc.status([import_remote_stage.path]), {})

        stages = self.dvc.reproduce(cmd_stage.path)
        self.assertEqual(len(stages), 1)
        self.assertEqual(self.dvc.status([cmd_stage.path]), {})

        self.assertEqual(self.dvc.status(), {})
        self.dvc.gc()
        self.assertEqual(self.dvc.status(), {})

        self.dvc.remove(cmd_stage.path, outs_only=True)
        self.assertNotEqual(self.dvc.status([cmd_stage.path]), {})

        self.dvc.checkout([cmd_stage.path], force=True)
        self.assertEqual(self.dvc.status([cmd_stage.path]), {})
Example #27
def test_absolute_file_outside_repo(tmp_dir, erepo_dir):
    with pytest.raises(PathMissingError):
        Repo.get(fspath(erepo_dir), "/root/")
Example #29
def test_unknown_path(tmp_dir, erepo_dir):
    with pytest.raises(PathMissingError):
        Repo.get(fspath(erepo_dir), "a_non_existing_file")
Example #30
def test_get_a_dvc_file(tmp_dir, erepo_dir):
    with pytest.raises(GetDVCFileError):
        Repo.get(os.fspath(erepo_dir), "some_file.dvc")