Example #1
0
def test_download_callbacks_on_dvc_git_fs(tmp_dir, dvc, scm, fs_type):
    """Verify progress callbacks for file and directory downloads.

    A small tree is generated through either the SCM or the DVC generator
    (selected by ``fs_type``). One file and one directory are then downloaded
    from the matching filesystem, and the totals recorded on the callback
    objects are checked.
    """
    from dvc.fs.git import GitFileSystem

    tree = {"dir": {"foo": "foo", "bar": "bar"}, "file": "file"}
    if fs_type == "git":
        tmp_dir.scm_gen(tree, commit="gen")
    else:
        tmp_dir.dvc_gen(tree, commit="gen")

    if fs_type == "dvc":
        fs = dvc.dvcfs
    else:
        fs = GitFileSystem(scm=scm, rev="HEAD")

    # Single file: the callback totals must equal the file size in bytes.
    file_cb = fsspec.Callback()
    file_dest = tmp_dir / "file2"
    fs.download_file("file", file_dest.fs_path, callback=file_cb)

    expected_bytes = os.path.getsize(tmp_dir / "file")
    assert file_dest.read_text() == "file"
    assert file_cb.size == expected_bytes
    assert file_cb.value == expected_bytes

    # Directory: the callback totals must equal the number of files in it.
    dir_cb = fsspec.Callback()
    dir_dest = tmp_dir / "dir2"
    fs.download("dir", dir_dest.fs_path, callback=dir_cb)

    assert dir_dest.read_text() == {"foo": "foo", "bar": "bar"}
    assert dir_cb.size == 2
    assert dir_cb.value == 2
Example #2
0
    def get_fs(self, rev: str, **kwargs):
        """Build a ``GitFileSystem`` over the tree at revision ``rev``.

        ``rev`` is resolved first; the resolved value is used both to fetch
        the tree object from the backend and to label the resulting trie.
        Extra keyword arguments are forwarded to ``GitFileSystem``.
        """
        from dvc.fs.git import GitFileSystem

        from .objects import GitTrie

        sha = self.resolve_rev(rev)
        trie = GitTrie(self._backend_func("get_tree_obj", rev=sha), sha)
        return GitFileSystem(self.root_dir, trie, **kwargs)
Example #3
0
    def get_fs(self, rev: str):
        """Build a ``GitFileSystem`` over the tree at revision ``rev``.

        The revision is resolved once; the tree object for the resolved
        revision is obtained through ``self.pygit2`` and wrapped in a
        ``GitTrie`` before being handed to the filesystem.
        """
        from dvc.fs.git import GitFileSystem

        from .objects import GitTrie

        sha = self.resolve_rev(rev)
        tree = self.pygit2.get_tree_obj(rev=sha)
        fs = GitFileSystem(self.root_dir, GitTrie(tree, sha))
        return fs
Example #4
0
def test_ignore_on_branch(tmp_dir, scm, dvc):
    """Check that an ignore file committed on a branch applies on that branch.

    The ignore file is committed only on ``branch``. Walking the repo's
    current filesystem must still list ``foo``; after pointing ``dvc.fs`` at
    the branch through a ``GitFileSystem``, ``foo`` must be reported as
    ignored.
    """
    from dvc.fs.git import GitFileSystem

    tmp_dir.scm_gen({"foo": "foo", "bar": "bar"}, commit="add files")

    with tmp_dir.branch("branch", new=True):
        tmp_dir.scm_gen(DvcIgnore.DVCIGNORE_FILE, "foo", commit="add ignore")

    dvc._reset()

    found = set(walk_files(dvc, dvc.fs, tmp_dir))
    expected = {
        (tmp_dir / name).fs_path
        for name in ("foo", "bar", DvcIgnore.DVCIGNORE_FILE)
    }
    assert found == expected

    # Switch the repo to read from the branch where the ignore file lives.
    dvc.fs = GitFileSystem(scm=scm, rev="branch")
    assert dvc.dvcignore.is_ignored_file(tmp_dir / "foo")
Example #5
0
    def _get_repo_dirs(
        self,
        root_dir: str = None,
        scm: "Base" = None,
        rev: str = None,
        uninitialized: bool = False,
    ):
        """Work out the root, DVC and temp directories for this repo.

        ``scm`` and ``rev`` must be given together or not at all. When
        ``scm`` is a ``Git`` instance and ``rev`` is set, the root is searched
        for through a ``GitFileSystem`` at that revision; otherwise
        ``find_root`` is called with no filesystem.

        Returns:
            A ``(root_dir, dvc_dir, tmp_dir)`` tuple. ``dvc_dir`` and
            ``tmp_dir`` are ``None`` when no DVC repo was found and
            ``uninitialized`` is true; ``root_dir`` is then the SCM root.

        Raises:
            NotDvcRepoError: no DVC repo was found and ``uninitialized`` is
                false.
        """
        assert bool(scm) == bool(rev)

        from dvc.fs.git import GitFileSystem
        from dvc.scm import SCM, Base, Git, SCMError
        from dvc.utils.fs import makedirs

        dvc_dir = tmp_dir = None
        try:
            fs = None
            if isinstance(scm, Git) and rev:
                fs = GitFileSystem(scm=scm, rev=rev)
            root_dir = self.find_root(root_dir, fs)
            dvc_dir = os.path.join(root_dir, self.DVC_DIR)
            tmp_dir = os.path.join(dvc_dir, "tmp")
            makedirs(tmp_dir, exist_ok=True)
        except NotDvcRepoError:
            if not uninitialized:
                raise

            # Not a DVC repo, but the caller tolerates that: fall back to
            # the SCM root, or to a no-SCM object rooted at the current dir.
            try:
                scm = SCM(root_dir or os.curdir)
            except SCMError:
                scm = SCM(os.curdir, no_scm=True)

            assert isinstance(scm, Base)
            root_dir = scm.root_dir

        return root_dir, dvc_dir, tmp_dir
Example #6
0
    def __init__(
        self,
        root_dir=None,
        fs=None,
        rev=None,
        subrepos=False,
        uninitialized=False,
        config=None,
        url=None,
        repo_factory=None,
    ):
        """Set up the repo object and its helper components.

        Args:
            root_dir: starting directory; when ``rev`` is given without
                ``fs`` it (or ``os.curdir``) is also used to open the SCM.
            fs: filesystem to operate on; defaults to ``localfs``.
            rev: revision to read from. Only used when ``fs`` is not given,
                in which case a ``GitFileSystem`` at that revision is built.
            subrepos: stored on the instance as ``self.subrepos``.
            uninitialized: forwarded to ``_get_repo_dirs`` and stored as
                ``self._uninitialized``.
            config: forwarded to ``Config``.
            url: stored on the instance as ``self.url``.
            repo_factory: stored in ``self._fs_conf`` under ``"repo_factory"``.

        NOTE(review): ``self.fs`` and ``self.dvcignore`` are read below but
        not assigned here; presumably they are properties defined elsewhere
        on the class -- confirm.
        """
        from dvc.config import Config
        from dvc.data.db import ODBManager
        from dvc.data_cloud import DataCloud
        from dvc.fs.git import GitFileSystem
        from dvc.fs.local import localfs
        from dvc.lock import LockNoop, make_lock
        from dvc.repo.live import Live
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.repo.plots import Plots
        from dvc.repo.stage import StageLoad
        from dvc.scm import SCM
        from dvc.stage.cache import StageCache
        from dvc.state import State, StateNoop

        self.url = url
        self._fs_conf = {"repo_factory": repo_factory}
        self._fs = fs or localfs
        self._scm = None

        # A revision without an explicit filesystem means "read from git":
        # open the SCM and replace the default filesystem with a git-backed one.
        if rev and not fs:
            self._scm = SCM(root_dir or os.curdir)
            self._fs = GitFileSystem(scm=self._scm, rev=rev)

        self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
            root_dir=root_dir, fs=self.fs, uninitialized=uninitialized)

        self.config = Config(self.dvc_dir, fs=self.fs, config=config)
        self._uninitialized = uninitialized

        # used by RepoFileSystem to determine if it should traverse subrepos
        self.subrepos = subrepos

        self.cloud = DataCloud(self)
        self.stage = StageLoad(self)

        # With a git-backed filesystem, or when no DVC directory was found,
        # no-op lock and state objects are used instead of the real ones.
        if isinstance(self.fs, GitFileSystem) or not self.dvc_dir:
            self.lock = LockNoop()
            self.state = StateNoop()
            self.odb = ODBManager(self)
        else:
            self.lock = make_lock(
                os.path.join(self.tmp_dir, "lock"),
                tmp_dir=self.tmp_dir,
                hardlink_lock=self.config["core"].get("hardlink_lock", False),
                friendly=True,
            )

            state_db_dir = self._get_database_dir("state")
            self.state = State(self.root_dir, state_db_dir, self.dvcignore)
            self.odb = ODBManager(self)

            # Note: stage_cache is only assigned on this branch.
            self.stage_cache = StageCache(self)

            self._ignore()

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)
        self.live = Live(self)

        # Optional hook taking (str, Exception); unset by default.
        self.stage_collection_error_handler: Optional[Callable[
            [str, Exception], None]] = None
        self._lock_depth = 0
Example #7
0
    def get_fs(self, rev: str):
        """Return a ``GitFileSystem`` bound to this SCM object at ``rev``."""
        from dvc.fs.git import GitFileSystem

        fs = GitFileSystem(rev=rev, scm=self)
        return fs
Example #8
0
def external_repo(
    url,
    rev=None,
    for_write=False,
    cache_dir=None,
    cache_types=None,
    **kwargs,
):
    """Yield a ``Repo`` built on a cached clone of ``url``.

    Args:
        url: location of the external repository.
        rev: revision to use; falls back to ``refs/remotes/origin/HEAD``.
        for_write: when true the repo is opened directly on the clone path
            with no git filesystem, and that path is removed on exit.
            Otherwise the repo reads through a ``GitFileSystem`` at ``rev``.
        cache_dir: cache directory; defaults to ``_get_cache_dir(url)``.
        cache_types: value stored under the cache ``"type"`` config key.
        **kwargs: extra ``Repo`` arguments. ``subrepos`` and
            ``uninitialized`` default to ``True`` when not supplied.

    Raises:
        NoRemoteInExternalRepoError: the consumer raised ``NoRemoteError``.
        NoOutputInExternalRepoError: the consumer raised
            ``OutputNotFoundError`` for this very repo.
        PathMissingError: the consumer raised ``FileMissingError``.

    NOTE(review): this is a generator with a single ``yield``; presumably it
    is wrapped as a context manager where it is defined -- confirm.
    """
    from scmrepo.git import Git

    from dvc.config import NoRemoteError
    from dvc.fs.git import GitFileSystem

    logger.debug("Creating external repo %s@%s", url, rev)
    path = _cached_clone(url, rev, for_write=for_write)
    # Local HEAD points to the tip of whatever branch we first cloned from
    # (which may not be the default branch), use origin/HEAD here to get
    # the tip of the default branch
    if not rev:
        rev = "refs/remotes/origin/HEAD"

    cache_section = {
        "dir": cache_dir or _get_cache_dir(url),
        "type": cache_types,
    }
    cache_config = {"cache": cache_section}

    if os.path.isdir(url):
        config = _get_remote_config(url)
    else:
        config = {}
    config.update(cache_config)

    if for_write:
        root_dir, fs = path, None
    else:
        root_dir = os.path.realpath(path)
        fs = GitFileSystem(scm=Git(root_dir), rev=rev)

    # dict(...) is used deliberately: a key repeated in **kwargs raises here.
    repo_kwargs = dict(
        root_dir=root_dir,
        url=url,
        fs=fs,
        config=config,
        repo_factory=erepo_factory(url, cache_config),
        **kwargs,
    )
    repo_kwargs.setdefault("subrepos", True)
    repo_kwargs.setdefault("uninitialized", True)

    repo = Repo(**repo_kwargs)

    try:
        yield repo
    except NoRemoteError as exc:
        raise NoRemoteInExternalRepoError(url) from exc
    except OutputNotFoundError as exc:
        # Only translate errors that originate from this repo instance.
        if exc.repo is not repo:
            raise
        raise NoOutputInExternalRepoError(
            exc.output, repo.root_dir, url
        ) from exc
    except FileMissingError as exc:
        raise PathMissingError(exc.path, url) from exc
    finally:
        repo.close()
        if for_write:
            _remove(path)