def open_by_relpath(self, path, mode="r", encoding=None, **kwargs):
    """Open `path` (relative to the repo root) and yield the file object.

    Args:
        path: path relative to ``self.root_dir``.
        mode: file mode passed straight to ``open``.
        encoding: text encoding passed straight to ``open``.

    Raises:
        PathMissingError: if the file does not exist in the repo.
    """
    try:
        abs_path = os.path.join(self.root_dir, path)
        with open(abs_path, mode, encoding=encoding) as fd:
            yield fd
    except FileNotFoundError as exc:
        # Chain the cause so the OS-level error stays in the traceback
        # (matches the chaining style used by the newer variants).
        raise PathMissingError(path, self.url) from exc
def pull_to(self, path, to_info):
    """Pull the file or directory at `path` and check it out into `to_info`.

    Handles outputs tracked by DVC, files tracked by Git, and local files
    outside the repository.
    """
    path_info = PathInfo(self.root_dir) / path

    dvc_out = None
    with suppress(OutputNotFoundError):
        (dvc_out,) = self.find_outs_by_path(fspath(path_info), strict=False)

    try:
        if dvc_out and dvc_out.use_cache:
            self._pull_cached(dvc_out, path_info, to_info)
        else:
            # Git can only track paths relative to the repo root, so an
            # absolute path can never be a Git-handled file.
            if os.path.isabs(path):
                raise FileNotFoundError
            fs_copy(fspath(path_info), fspath(to_info))
    except FileNotFoundError:
        raise PathMissingError(path, self.url)
def external_repo(url, rev=None, for_write=False):
    """Context manager yielding a repo object for a cached clone of `url`.

    Falls back to plain-Git handling when the clone is not a DVC repo.
    Translates repo-internal errors into external-repo errors carrying `url`.
    """
    logger.debug("Creating external repo %s@%s", url, rev)
    path = _cached_clone(url, rev, for_write=for_write)
    if not rev:
        # Local HEAD points to the tip of whatever branch we first cloned
        # from (which may not be the default branch), use origin/HEAD here
        # to get the tip of the default branch
        rev = "refs/remotes/origin/HEAD"
    try:
        repo = ExternalRepo(path, url, rev, for_write=for_write)
    except NotDvcRepoError:
        repo = ExternalGitRepo(path, url, rev)

    try:
        yield repo
    except NoRemoteError as exc:
        # Chain causes so the underlying errors are preserved in tracebacks
        # (consistent with the other external_repo variants in this file).
        raise NoRemoteInExternalRepoError(url) from exc
    except OutputNotFoundError as exc:
        if exc.repo is repo:
            raise NoOutputInExternalRepoError(
                exc.output, repo.root_dir, url
            ) from exc
        raise
    except FileMissingError as exc:
        raise PathMissingError(exc.path, url) from exc
    finally:
        repo.close()
        if for_write:
            _remove(path)
def fetch_external(self, paths: Iterable, **kwargs):
    """Fetch specified external repo paths into cache.

    Returns 3-tuple in the form
        (downloaded, failed, list(cache_infos))
    where cache_infos can be used as checkout targets for the fetched paths.
    """
    results = []
    # NOTE: `failed` is never incremented in this implementation; it is kept
    # to preserve the 3-tuple return shape.
    failed = 0

    def on_download(res):
        results.append(res)

    root = PathInfo(self.root_dir)
    hash_infos = []
    for path_info in (root / p for p in paths):
        with reraise(FileNotFoundError, PathMissingError(path_info, self.url)):
            meta = self.repo_tree.metadata(path_info)
        self._check_repo(path_info, meta.repo)
        hash_infos.append(
            self._fetch_to_cache(path_info, meta.repo, on_download, **kwargs)
        )

    return sum(results), failed, hash_infos
def ls(url, target=None, rev=None, recursive=None, outs_only=False):
    """List repo contents under `target`, optionally DVC outputs only.

    Returns a sorted list of display paths (target itself shown by name,
    everything else relative to it).
    """
    from dvc.external_repo import external_repo
    from dvc.repo import Repo
    from dvc.utils import relpath

    with external_repo(url, rev) as repo:
        target_path_info = _get_target_path_info(repo, target)

        entries = []
        if isinstance(repo, Repo):
            entries += _ls_outs_repo(repo, target_path_info, recursive)
        if not outs_only:
            entries += _ls_files_repo(target_path_info, recursive)

        if target and not entries:
            raise PathMissingError(target, repo, output_only=outs_only)

        def prettify(path_info):
            # The target itself is displayed by name; children relative to it.
            if path_info == target_path_info:
                return path_info.name
            return relpath(path_info, target_path_info)

        return sorted(set(map(prettify, entries)))
def external_repo(url, rev=None, for_write=False, **kwargs):
    """Context manager yielding an ExternalRepo for `url` at `rev`."""
    logger.debug("Creating external repo %s@%s", url, rev)
    path = _cached_clone(url, rev, for_write=for_write)

    # Local HEAD points to the tip of whatever branch we first cloned from
    # (which may not be the default branch), use origin/HEAD here to get
    # the tip of the default branch
    if not rev:
        rev = "refs/remotes/origin/HEAD"

    root_dir = path if for_write else os.path.realpath(path)
    if for_write:
        scm, repo_rev = None, None
    else:
        scm, repo_rev = Git(root_dir), rev

    repo = ExternalRepo(
        root_dir=root_dir,
        url=url,
        scm=scm,
        rev=repo_rev,
        for_write=for_write,
        uninitialized=True,
        **kwargs,
    )

    try:
        yield repo
    except NoRemoteError as exc:
        raise NoRemoteInExternalRepoError(url) from exc
    except OutputNotFoundError as exc:
        if exc.repo is repo:
            raise NoOutputInExternalRepoError(
                exc.output, repo.root_dir, url
            ) from exc
        raise
    except FileMissingError as exc:
        raise PathMissingError(exc.path, url) from exc
    finally:
        repo.close()
        if for_write:
            _remove(path)
def fetch_external(self, paths: Iterable, **kwargs):
    """Fetch specified external repo paths into cache.

    Returns 3-tuple in the form
        (downloaded, failed, list(cache_infos))
    where cache_infos can be used as checkout targets for the fetched paths.
    """
    results = []
    # NOTE: `failed` is never incremented here; kept to preserve the
    # 3-tuple return shape.
    failed = 0

    def on_download(res):
        results.append(res)

    root = PathInfo(self.root_dir)
    save_infos = []
    for path_info in (root / p for p in paths):
        if not self.repo_tree.exists(path_info):
            raise PathMissingError(path_info, self.url)
        save_infos.append(
            self.local_cache.save(
                path_info,
                None,
                tree=self.repo_tree,
                download_callback=on_download,
            )
        )

    return sum(results), failed, save_infos
def external_repo(url, rev=None, for_write=False, cache_dir=None, cache_types=None, **kwargs):
    """Context manager yielding a Repo for a cached clone of `url` at `rev`.

    Builds a config that points the clone's cache at `cache_dir` (or a
    URL-derived default) and, for local `url`s, inherits the source repo's
    remote config. Translates repo-internal errors into external-repo
    errors carrying `url`.
    """
    from dvc.config import NoRemoteError
    from dvc.scm.git import Git

    logger.debug("Creating external repo %s@%s", url, rev)
    path = _cached_clone(url, rev, for_write=for_write)
    # Local HEAD points to the tip of whatever branch we first cloned from
    # (which may not be the default branch), use origin/HEAD here to get
    # the tip of the default branch
    rev = rev or "refs/remotes/origin/HEAD"

    cache_config = {
        "cache": {"dir": cache_dir or _get_cache_dir(url), "type": cache_types}
    }

    # For a local source repo, also inherit its remote configuration.
    config = _get_remote_config(url) if os.path.isdir(url) else {}
    config.update(cache_config)

    # for_write clones are used in place; read-only clones resolve symlinks.
    root_dir = path if for_write else os.path.realpath(path)
    repo_kwargs = dict(
        root_dir=root_dir,
        url=url,
        scm=None if for_write else Git(root_dir),
        rev=None if for_write else rev,
        config=config,
        repo_factory=erepo_factory(url, cache_config),
        **kwargs,
    )

    # Only default these when the caller did not pass them explicitly.
    if "subrepos" not in repo_kwargs:
        repo_kwargs["subrepos"] = True
    if "uninitialized" not in repo_kwargs:
        repo_kwargs["uninitialized"] = True

    repo = Repo(**repo_kwargs)

    try:
        yield repo
    except NoRemoteError as exc:
        raise NoRemoteInExternalRepoError(url) from exc
    except OutputNotFoundError as exc:
        if exc.repo is repo:
            raise NoOutputInExternalRepoError(
                exc.output, repo.root_dir, url
            ) from exc
        raise
    except FileMissingError as exc:
        raise PathMissingError(exc.path, url) from exc
    finally:
        repo.close()
        if for_write:
            _remove(path)
def ls(
    url, target=None, rev=None, recursive=None, outs_only=False,
):
    """Methods for getting files and outputs for the repo.

    Args:
        url (str): the repo url
        target (str, optional): relative path into the repo
        rev (str, optional): SHA commit, branch or tag name
        recursive (bool, optional): recursively walk the repo
        outs_only (bool, optional): show only DVC-artifacts

    Returns:
        list of `entry`

    Notes:
        `entry` is a dictionary with structure
        {
            "path": str,
            "isout": bool,
            "isdir": bool,
            "isexec": bool,
        }
    """
    from dvc.external_repo import external_repo
    from dvc.repo import Repo
    from dvc.utils import relpath

    with external_repo(url, rev) as repo:
        target_path_info = _get_target_path_info(repo, target)

        nodes = []
        if isinstance(repo, Repo):
            nodes += _ls_outs_repo(repo, target_path_info, recursive)
        if not outs_only:
            nodes += _ls_files_repo(target_path_info, recursive)

        if target and not nodes:
            raise PathMissingError(target, repo, output_only=outs_only)

        # Deduplicate by path_info; later entries win (dict semantics).
        unique_nodes = {node["path_info"]: node for node in nodes}.values()

        def as_entry(node):
            info = node["path_info"]
            if info == target_path_info:
                shown = info.name
            else:
                shown = relpath(info, target_path_info)
            return {
                "path": shown,
                "isout": node.get("isout", False),
                "isdir": node.get("isdir", False),
                "isexec": node.get("isexec", False),
            }

        return sorted(
            (as_entry(node) for node in unique_nodes),
            key=lambda entry: entry["path"],
        )
def open_by_relpath(self, path, mode="r", encoding=None, **kwargs):
    """Opens a specified resource as a file object.

    Raises:
        PathMissingError: if `path` does not exist under the repo root.
    """
    try:
        abs_path = os.path.join(self.root_dir, path)
        with open(abs_path, mode, encoding=encoding) as fd:
            yield fd
    except FileNotFoundError as exc:
        # Chain the cause so the OS error is preserved in the traceback.
        raise PathMissingError(path, self.url) from exc
def get_checksum(self, path):
    """Return the hash of `path`; raise PathMissingError if it is absent."""
    path_info = PathInfo(self.root_dir) / path
    with reraise(FileNotFoundError, PathMissingError(path, self.url)):
        metadata = self.repo_tree.metadata(path_info)

    # Use the tree of the owning repo (subrepos are skipped to check).
    owning_tree = self._get_tree_for(metadata.repo)
    return owning_tree.get_hash(path_info)
def open_by_relpath(self, path, mode="r", encoding=None, **kwargs):
    """Opens a specified resource as a file object.

    Raises:
        PathMissingError: if `path` is not present in the repo tree.
    """
    tree = RepoTree(self)
    try:
        with tree.open(path, mode=mode, encoding=encoding, **kwargs) as fobj:
            yield fobj
    except FileNotFoundError as exc:
        # Chain the cause so the original error shows in the traceback.
        raise PathMissingError(path, self.url) from exc
def pull_to(self, path, to_info):
    """Copy a Git-handled `path` from the repo into `to_info`."""
    try:
        # Git handled files can't have absolute path
        if os.path.isabs(path):
            raise FileNotFoundError
        src = os.path.join(self.root_dir, path)
        fs_copy(src, fspath(to_info))
    except FileNotFoundError:
        raise PathMissingError(path, self.url)
def _get_used_and_obj(
    self, obj_only=False, **kwargs
) -> Tuple[Dict[Optional["ObjectDB"], Set["HashInfo"]], "HashFile"]:
    """Collect used object ids for this import dependency and stage it.

    Returns a tuple of (odb -> set of used hash infos, staged object).
    When `obj_only` is true, only the staged object's ids are collected
    (the source repo's own used-objects walk is skipped).
    """
    from dvc.config import NoRemoteError
    from dvc.exceptions import NoOutputOrStageError, PathMissingError
    from dvc.objects.stage import stage
    from dvc.objects.tree import Tree

    local_odb = self.repo.odb.local
    locked = kwargs.pop("locked", True)
    with self._make_repo(locked=locked, cache_dir=local_odb.cache_dir) as repo:
        used_obj_ids = defaultdict(set)
        rev = repo.get_rev()
        # Pin the resolved revision the first time we see it (locked mode).
        if locked and self.def_repo.get(self.PARAM_REV_LOCK) is None:
            self.def_repo[self.PARAM_REV_LOCK] = rev
        path_info = PathInfo(repo.root_dir) / str(self.def_path)
        if not obj_only:
            try:
                for odb, obj_ids in repo.used_objs(
                    [os.fspath(path_info)],
                    force=True,
                    jobs=kwargs.get("jobs"),
                    recursive=True,
                ).items():
                    if odb is None:
                        # No local odb recorded: fall back to the source
                        # repo's remote odb, read-only.
                        odb = repo.cloud.get_remote_odb()
                        odb.read_only = True
                    self._check_circular_import(odb, obj_ids)
                    used_obj_ids[odb].update(obj_ids)
            except (NoRemoteError, NoOutputOrStageError):
                # Best-effort: missing remote or untracked path just means
                # there is nothing extra to pull from the source repo.
                pass
        try:
            staging, staged_obj = stage(
                local_odb,
                path_info,
                repo.repo_fs,
                local_odb.fs.PARAM_CHECKSUM,
            )
        except FileNotFoundError as exc:
            raise PathMissingError(
                self.def_path, self.def_repo[self.PARAM_URL]
            ) from exc
        # Shallow-copy so flipping read_only doesn't affect the shared odb.
        staging = copy(staging)
        staging.read_only = True
        self._staged_objs[rev] = staged_obj
        used_obj_ids[staging].add(staged_obj.hash_info)
        if isinstance(staged_obj, Tree):
            # Directories also contribute every entry's hash.
            used_obj_ids[staging].update(
                entry.hash_info for _, entry in staged_obj
            )
        return used_obj_ids, staged_obj
def ls(
    url, path=None, rev=None, recursive=None, outs_only=False,
):
    """Methods for getting files and outputs for the repo.

    Args:
        url (str): the repo url
        path (str, optional): relative path into the repo
        rev (str, optional): SHA commit, branch or tag name
        recursive (bool, optional): recursively walk the repo
        outs_only (bool, optional): show only DVC-artifacts

    Returns:
        list of `entry`

    Notes:
        `entry` is a dictionary with structure
        {
            "path": str,
            "isout": bool,
            "isdir": bool,
            "isexec": bool,
        }
    """
    from dvc.external_repo import external_repo
    from dvc.repo import Repo

    with external_repo(url, rev) as repo:
        path_info = PathInfo(repo.root_dir)
        if path:
            path_info /= path

        listing = {}
        if isinstance(repo, Repo):
            listing = _ls(repo, path_info, recursive, True)
        if not outs_only:
            # Non-DVC entries override DVC ones on key collision.
            listing.update(_ls(repo, path_info, recursive, False))

        if path and not listing:
            raise PathMissingError(path, repo, output_only=outs_only)

        entries = []
        for entry_path, info in listing.items():
            info["path"] = entry_path
            entries.append(info)
        entries.sort(key=lambda f: f["path"])
        return entries
def open_by_relpath(self, path, mode="r", encoding=None, **kwargs):
    """Opens a specified resource as a file object."""
    path_info = PathInfo(self.root_dir) / path
    try:
        opened = self.repo_tree.open(
            path_info, mode=mode, encoding=encoding, **kwargs
        )
        with opened as fobj:
            yield fobj
    except FileNotFoundError as exc:
        raise PathMissingError(path, self.url) from exc
def download(self, to):
    """Download this dependency into `to`.

    Git-tracked files are copied straight from the clone; DVC-tracked
    outputs are fetched and checked out from cache.

    Raises:
        PathMissingError: if the path does not exist in the source repo.
    """
    try:
        # Git-tracked file: copied directly, nothing else to do.
        if self._copy_if_git_file(to.fspath):
            return

        out = self.fetch()
        to.info = copy.copy(out.info)
        to.checkout()
    except FileNotFoundError as exc:
        # Was `except (FileNotFoundError):` — redundant parentheses removed;
        # chain the cause so the original traceback is preserved.
        raise PathMissingError(
            self.def_path, self.def_repo[self.PARAM_URL]
        ) from exc
def get_external(self, path, dest):
    """Convenience wrapper for fetch_external and checkout."""
    if not self.local_cache:
        # git-only erepo with no cache: just copy files directly to dest.
        path_info = PathInfo(self.root_dir) / path
        if not self.repo_tree.exists(path_info):
            raise PathMissingError(path_info, self.url)
        self.repo_tree.copytree(path_info, dest)
        return

    # fetch DVC and git files to tmpdir cache, then checkout into dest
    _, _, save_infos = self.fetch_external([path])
    self.local_cache.checkout(PathInfo(dest), save_infos[0])
def ls(
    url, path=None, rev=None, recursive=None, dvc_only=False,
):
    """Methods for getting files and outputs for the repo.

    Args:
        url (str): the repo url
        path (str, optional): relative path into the repo
        rev (str, optional): SHA commit, branch or tag name
        recursive (bool, optional): recursively walk the repo
        dvc_only (bool, optional): show only DVC-artifacts

    Returns:
        list of `entry`

    Notes:
        `entry` is a dictionary with structure
        {
            "path": str,
            "isout": bool,
            "isdir": bool,
            "isexec": bool,
        }
    """
    from dvc.external_repo import external_repo

    # use our own RepoTree instance instead of repo.repo_tree since we want
    # to fetch directory listings, but don't want to fetch file contents.
    with external_repo(url, rev, fetch=False, stream=True) as repo:
        path_info = PathInfo(repo.root_dir)
        if path:
            path_info /= path

        listing = _ls(repo.repo_tree, path_info, recursive, dvc_only)

        if path and not listing:
            raise PathMissingError(path, repo, dvc_only=dvc_only)

        entries = []
        for entry_path, info in listing.items():
            info["path"] = entry_path
            entries.append(info)
        return sorted(entries, key=lambda f: f["path"])
def get(url, path, out=None, rev=None):
    """Download `path` from the repo at `url` into `out`.

    Tries a DVC-cached checkout first; falls back to copying the file
    from a plain Git clone.

    Raises:
        GetDVCFileError: if `out` would be a DVC-file name.
        PathMissingError: if `path` is not found in the repo.
    """
    out = resolve_output(path, out)

    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
    try:
        try:
            with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
                # Try any links possible to avoid data duplication.
                #
                # Not using symlink, because we need to remove cache after we
                # are done, and to make that work we would have to copy data
                # over anyway before removing the cache, so we might just copy
                # it right away.
                #
                # Also, we can't use theoretical "move" link type here, because
                # the same cache file might be used a few times in a directory.
                repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

                output = repo.find_out_by_relpath(path)
                if output.use_cache:
                    _get_cached(repo, output, out)
                    return
                # Non-cached output, fall through and try to copy from git.
        except (NotDvcRepoError, NoOutputInExternalRepoError):
            # Not a DVC repository or, possibly, path is not tracked by DVC.
            # Fall through and try to copy from git.
            pass

        # Git can only track repo-relative paths.
        if os.path.isabs(path):
            raise FileNotFoundError

        repo_dir = cached_clone(url, rev=rev)
        fs_copy(os.path.join(repo_dir, path), out)
    except (OutputNotFoundError, FileNotFoundError):
        raise PathMissingError(path, url)
    finally:
        remove(tmp_dir)
def get(url, path, out=None, rev=None):
    """Download `path` from the DVC repo at `url` into `out`.

    Raises:
        GetDVCFileError: if `out` would be a DVC-file name.
        UrlNotDvcRepoError: if `url` is not a DVC repository.
        PathMissingError: if `path` is not found in the repo.
    """
    out = resolve_output(path, out)

    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    parent_dir = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(parent_dir, "." + str(shortuuid.uuid()))
    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # Try any links possible to avoid data duplication.
            #
            # Not using symlink, because we need to remove cache after we are
            # done, and to make that work we would have to copy data over
            # anyway before removing the cache, so we might just copy it
            # right away.
            #
            # Also, we can't use theoretical "move" link type here, because
            # the same cache file might be used a few times in a directory.
            repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            try:
                output = repo.find_out_by_relpath(path)
            except OutputNotFoundError:
                output = None

            if output and output.use_cache:
                _get_cached(repo, output, out)
            else:
                # Either an uncached out with absolute path or a user error
                if os.path.isabs(path):
                    raise FileNotFoundError
                fs_copy(os.path.join(repo.root_dir, path), out)
    except (OutputNotFoundError, FileNotFoundError):
        raise PathMissingError(path, url)
    except NotDvcRepoError:
        raise UrlNotDvcRepoError(url)
    finally:
        remove(tmp_dir)
def get_external(self, path, dest):
    """Convenience wrapper for fetch_external and checkout."""
    path_info = PathInfo(self.root_dir) / path
    with reraise(FileNotFoundError, PathMissingError(path, self.url)):
        metadata = self.repo_tree.metadata(path_info)

    self._check_repo(path_info, metadata.repo)

    if not metadata.output_exists:
        # git-only folder, just copy files directly to dest
        tree = self._get_tree_for(metadata.repo)  # ignore subrepos
        tree.copytree(path_info, dest)
        return

    owning_repo = metadata.repo
    cache = owning_repo.cache.local
    # fetch DVC and git files to tmpdir cache, then checkout
    save_info = self._fetch_to_cache(path_info, owning_repo, None)
    cache.checkout(PathInfo(dest), save_info)
def pull_to(self, path, to_info):
    """Pull `path` (DVC output or Git-tracked file) into `to_info`."""
    try:
        dvc_out = None
        with suppress(OutputNotFoundError):
            dvc_out = self.find_out_by_relpath(path)

        if dvc_out and dvc_out.use_cache:
            self._pull_cached(dvc_out, to_info)
        else:
            # Git handled files can't have absolute path
            if os.path.isabs(path):
                raise FileNotFoundError
            fs_copy(os.path.join(self.root_dir, path), fspath(to_info))
    except FileNotFoundError:
        raise PathMissingError(path, self.url)
def ls(
    url, path=None, rev=None, recursive=None, dvc_only=False,
):
    """Methods for getting files and outputs for the repo.

    Args:
        url (str): the repo url
        path (str, optional): relative path into the repo
        rev (str, optional): SHA commit, branch or tag name
        recursive (bool, optional): recursively walk the repo
        dvc_only (bool, optional): show only DVC-artifacts

    Returns:
        list of `entry`

    Notes:
        `entry` is a dictionary with structure
        {
            "path": str,
            "isout": bool,
            "isdir": bool,
            "isexec": bool,
        }
    """
    from . import Repo

    with Repo.open(url, rev, subrepos=True, uninitialized=True) as repo:
        path_info = PathInfo(repo.root_dir)
        if path:
            path_info /= path

        listing = _ls(repo.repo_tree, path_info, recursive, dvc_only)

        if path and not listing:
            raise PathMissingError(path, repo, dvc_only=dvc_only)

        entries = []
        for entry_path, info in listing.items():
            info["path"] = entry_path
            entries.append(info)
        return sorted(entries, key=lambda f: f["path"])
def get_used_objs(self, **kwargs) -> Dict[Optional["ObjectDB"], Set["HashFile"]]:
    """Collect the objects used by this import dependency.

    Returns a mapping of odb -> set of used objects, including the staged
    object for the dependency path registered under a GitObjectDB.
    """
    from dvc.config import NoRemoteError
    from dvc.exceptions import NoOutputOrStageError, PathMissingError
    from dvc.objects.db.git import GitObjectDB
    from dvc.objects.stage import stage

    local_odb = self.repo.odb.local
    locked = kwargs.pop("locked", True)
    with self._make_repo(locked=locked, cache_dir=local_odb.cache_dir) as repo:
        used_objs = defaultdict(set)
        rev = repo.get_rev()
        # Pin the resolved revision the first time we see it (locked mode).
        if locked and self.def_repo.get(self.PARAM_REV_LOCK) is None:
            self.def_repo[self.PARAM_REV_LOCK] = rev
        path_info = PathInfo(repo.root_dir) / str(self.def_path)
        try:
            for odb, objs in repo.used_objs(
                [os.fspath(path_info)],
                force=True,
                jobs=kwargs.get("jobs"),
                recursive=True,
            ).items():
                if odb is None:
                    # No local odb recorded: fall back to the source
                    # repo's default remote odb.
                    odb = repo.cloud.get_remote().odb
                self._check_circular_import(odb)
                used_objs[odb].update(objs)
        except (NoRemoteError, NoOutputOrStageError):
            # Best-effort: missing remote or untracked path just means
            # there is nothing extra to pull from the source repo.
            pass

        try:
            staged_obj = stage(
                local_odb,
                path_info,
                repo.repo_fs,
                local_odb.fs.PARAM_CHECKSUM,
            )
        except FileNotFoundError as exc:
            raise PathMissingError(
                self.def_path, self.def_repo[self.PARAM_URL]
            ) from exc
        self._staged_objs[rev] = staged_obj

        # The staged object itself is served out of the source repo's Git
        # tree, so register it under a GitObjectDB.
        git_odb = GitObjectDB(repo.repo_fs, repo.root_dir)
        used_objs[git_odb].add(staged_obj)
        return used_objs
def external_repo(url, rev=None):
    """Context manager yielding a repo object for a cached clone of `url`."""
    path = _cached_clone(url, rev)
    try:
        repo = ExternalRepo(path, url)
    except NotDvcRepoError:
        # Not a DVC repo; fall back to plain-Git handling.
        repo = ExternalGitRepo(path, url)

    try:
        yield repo
    except NoRemoteError:
        raise NoRemoteInExternalRepoError(url)
    except OutputNotFoundError as exc:
        if exc.repo is not repo:
            raise
        raise NoOutputInExternalRepoError(exc.output, repo.root_dir, url)
    except FileMissingError as exc:
        raise PathMissingError(exc.path, url)
    finally:
        repo.close()
def _ls(
    repo: "Repo", path: str, recursive: bool = None, dvc_only: bool = False
):
    """Collect a {relative-name: entry-info} mapping for `path` in `repo`."""
    fs: "DvcFileSystem" = repo.dvcfs
    fs_path = fs.from_os_path(path)
    try:
        fs_path = fs.info(fs_path)["name"]
    except FileNotFoundError:
        raise PathMissingError(path, repo, dvc_only=dvc_only)

    collected = {}
    for root, dirs, files in fs.walk(
        fs_path, dvcfiles=True, dvc_only=dvc_only
    ):
        if recursive:
            names = files
        else:
            names = chain(files, dirs)
        for name in names:
            entry_fs_path = fs.path.join(root, name)
            rel = os.path.join(*fs.path.relparts(entry_fs_path, fs_path))
            collected[rel] = fs.info(entry_fs_path)
        if not recursive:
            # Only the top level is wanted; stop after the first walk step.
            break

    # A plain file produces no walk entries; list the file itself.
    if not collected and fs.isfile(fs_path):
        collected[os.path.basename(path)] = fs.info(fs_path)

    result = {}
    for rel, info in collected.items():
        dvc_info = info.get("dvc_info", {})
        if dvc_only and not dvc_info.get("outs"):
            continue
        result[rel] = {
            "isout": dvc_info.get("isout", False),
            "isdir": info["type"] == "directory",
            "isexec": info.get("isexec", False),
        }
    return result
def get_url(path, repo=None, rev=None, remote=None):
    """
    Returns the URL to the storage location of a data file or directory
    tracked in a DVC repo. For Git repos, HEAD is used unless a rev argument
    is supplied. The default remote is tried unless a remote argument is
    supplied.

    Raises OutputNotFoundError if the file is not tracked by DVC.

    NOTE: This function does not check for the actual existence of the file
    or directory in the remote storage.
    """
    with Repo.open(repo, rev=rev, subrepos=True, uninitialized=True) as _repo:
        fs_path = _repo.fs.path.join(_repo.root_dir, path)
        with reraise(FileNotFoundError, PathMissingError(path, repo)):
            info = _repo.repo_fs.info(fs_path)

        if not info["isdvc"]:
            raise OutputNotFoundError(path, repo)

        owning_repo = info["repo"]
        md5 = owning_repo.dvcfs.info(fs_path)["md5"]
        return owning_repo.cloud.get_url_for(remote, checksum=md5)
def get_url(path, repo=None, rev=None, remote=None):
    """
    Returns the URL to the storage location of a data file or directory
    tracked in a DVC repo. For Git repos, HEAD is used unless a rev argument
    is supplied. The default remote is tried unless a remote argument is
    supplied.

    Raises OutputNotFoundError if the file is not a dvc-tracked file.

    NOTE: This function does not check for the actual existence of the file
    or directory in the remote storage.
    """
    with _make_repo(repo, rev=rev) as _repo:
        path_info = PathInfo(_repo.root_dir) / path
        with reraise(FileNotFoundError, PathMissingError(path, repo)):
            metadata = _repo.repo_tree.metadata(path_info)

        if not metadata.is_dvc:
            raise OutputNotFoundError(path, repo)

        hash_info = _repo.repo_tree.get_hash(path_info)
        return metadata.repo.cloud.get_url_for(
            remote, checksum=hash_info.value
        )
def external_repo(url, rev=None, for_write=False):
    """Context manager yielding a repo object for a cached clone of `url`."""
    # BUG FIX: stdlib logging uses %-style lazy formatting for its args;
    # the previous "{}@{}" placeholders were never interpolated, so the
    # message always showed literal braces instead of url/rev.
    logger.debug("Creating external repo %s@%s", url, rev)
    path = _cached_clone(url, rev, for_write=for_write)
    try:
        repo = ExternalRepo(path, url)
    except NotDvcRepoError:
        repo = ExternalGitRepo(path, url)

    try:
        yield repo
    except NoRemoteError:
        raise NoRemoteInExternalRepoError(url)
    except OutputNotFoundError as exc:
        if exc.repo is repo:
            raise NoOutputInExternalRepoError(exc.output, repo.root_dir, url)
        raise
    except FileMissingError as exc:
        raise PathMissingError(exc.path, url)
    finally:
        repo.close()
        if for_write:
            _remove(path)