def __init__(
    self,
    root_dir,
    url,
    scm=None,
    rev=None,
    for_write=False,
    cache_dir=None,
    cache_types=None,
    **kwargs,
):
    self.root_dir = os.path.realpath(root_dir)
    self.scm = scm
    self.url = url
    self.for_write = for_write
    self.cache_dir = cache_dir or self._get_cache_dir()
    self.cache_types = cache_types
    self.rev = rev
    self.tree_confs = kwargs

    self.config = {"cache": {"dir": self.cache_dir}}
    self.cache = Cache(self)
    if cache_types:
        self.cache.local.cache_types = cache_types

    self.state = StateNoop()
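# For context: the constructors in this file fall back to a "StateNoop"
# wherever the SQLite-backed State cannot (or should not) be used.  Below
# is a minimal sketch of such a null object, assuming only the interface
# the surrounding snippets exercise (`get`, `save`, `save_link`, and use
# as a context manager via `with repo.state:`); it is not a verbatim copy
# of dvc/state.py.
class _StateNoopSketch:
    """Null-object stand-in for the SQLite-backed State."""

    def get(self, path_info):
        # No checksum is ever cached, so callers always recompute.
        return None

    def save(self, path_info, checksum):
        # Nothing is persisted.
        pass

    def save_link(self, path_info):
        # No link tracking.
        pass

    def __enter__(self):
        # Lets `with repo.state:` work unchanged.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        pass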
def __init__(self, root_dir=None, scm=None, rev=None):
    from dvc.state import State, StateNoop
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.repo.plots import Plots
    from dvc.repo.params import Params
    from dvc.scm.tree import WorkingTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    if scm:
        # use GitTree instead of WorkingTree as default repo tree instance
        tree = scm.get_tree(rev)
        self.root_dir = self.find_root(root_dir, tree)
        self.scm = scm
        self.tree = tree
        self.state = StateNoop()
    else:
        root_dir = self.find_root(root_dir)
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.tree = WorkingTree(self.root_dir)

    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
    self.config = Config(self.dvc_dir, tree=self.tree)

    if not scm:
        no_scm = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=no_scm)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    hardlink_lock = self.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.tmp_dir, "lock"),
        tmp_dir=self.tmp_dir,
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    if not scm:
        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self.cache.local)

    self.stage_cache = StageCache(self)

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)

    self._ignore()
def get(url, path, out=None, rev=None):
    out = resolve_output(path, out)

    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we can take advantage of reflink
    # and/or hardlink. Not using tempfile.TemporaryDirectory because it
    # will create a symlink to tmpfs, which defeats the purpose and won't
    # work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # Note: we need to replace the state, because when getting a DVC
            # dependency on CIFS or NFS filesystems, the sqlite-based state
            # will be unable to obtain a lock.
            repo.state = StateNoop()

            # Try any links possible to avoid data duplication.
            #
            # Not using symlink, because we need to remove the cache after we
            # are done, and to make that work we would have to copy data over
            # anyway before removing the cache, so we might as well copy it
            # right away.
            #
            # Also, we can't use a theoretical "move" link type here, because
            # the same cache file might be used a few times in a directory.
            repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            output = None
            output_error = None

            try:
                output = repo.find_out_by_relpath(path)
            except OutputNotFoundError as ex:
                output_error = ex

            is_git_file = output_error and not os.path.isabs(path)
            is_not_cached = output and not output.use_cache

            if is_git_file or is_not_cached:
                _copy_git_file(repo, path, out, url)
                return

            if output_error:
                raise OutputNotFoundError(path)

            with repo.state:
                repo.cloud.pull(output.get_used_cache())

            output.path_info = PathInfo(os.path.abspath(out))
            with output.repo.state:
                output.checkout()
    except NotDvcRepoError:
        raise UrlNotDvcRepoError(url)
    finally:
        remove(tmp_dir)
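# Usage sketch for the get() helper above.  The URL, paths, and revision
# are illustrative only; any git-reachable DVC repository would do.
get(
    url="https://github.com/example/dvc-project",  # hypothetical repo URL
    path="data/data.xml",                          # path inside that repo
    out="data.xml",                                # local destination file
    rev="master",                                  # optional git revision
)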
def __init__(self, fs, **config):
    from dvc.state import StateNoop

    self.fs = fs
    self.state = config.get("state", StateNoop())
    self.verify = config.get("verify", self.DEFAULT_VERIFY)
    self.cache_types = config.get("type") or copy(self.DEFAULT_CACHE_TYPES)
    self.cache_type_confirmed = False
    self.slow_link_warning = config.get("slow_link_warning", True)
def __init__(self, fs: "FileSystem", path: str, **config): from dvc.state import StateNoop self.fs = fs self.fs_path = path self.state = config.get("state", StateNoop()) self.verify = config.get("verify", self.DEFAULT_VERIFY) self.cache_types = config.get("type") or copy(self.DEFAULT_CACHE_TYPES) self.slow_link_warning = config.get("slow_link_warning", True) self.tmp_dir = config.get("tmp_dir") self.read_only = config.get("read_only", False)
def get(url, path, out=None, rev=None):
    out = out or os.path.basename(urlparse(path).path)

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we can take advantage of reflink
    # and/or hardlink. Not using tempfile.TemporaryDirectory because it
    # will create a symlink to tmpfs, which defeats the purpose and won't
    # work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # Note: we need to replace the state, because when getting a DVC
            # dependency on CIFS or NFS filesystems, the sqlite-based state
            # will be unable to obtain a lock.
            repo.state = StateNoop()

            # Try any links possible to avoid data duplication.
            #
            # Not using symlink, because we need to remove the cache after we
            # are done, and to make that work we would have to copy data over
            # anyway before removing the cache, so we might as well copy it
            # right away.
            #
            # Also, we can't use a theoretical "move" link type here, because
            # the same cache file might be used a few times in a directory.
            repo.config.set(
                Config.SECTION_CACHE,
                Config.SECTION_CACHE_TYPE,
                "reflink,hardlink,copy",
            )

            o = repo.find_out_by_relpath(path)
            with repo.state:
                repo.cloud.pull(o.get_used_cache())

            o.path_info = PathInfo(os.path.abspath(out))
            with o.repo.state:
                o.checkout()
    finally:
        remove(tmp_dir)
def test(self):
    stages = [
        self._run(
            fname="start.dvc",
            outs=["start.txt"],
            cmd="echo start > start.txt",
            name="start",
        ),
        self._run(
            fname="middle.dvc",
            deps=["start.txt"],
            outs=["middle.txt"],
            cmd="echo middle > middle.txt",
            name="middle",
        ),
        self._run(
            fname="final.dvc",
            deps=["middle.txt"],
            outs=["final.txt"],
            cmd="echo final > final.txt",
            name="final",
        ),
        self._run(
            fname="disconnected.dvc",
            outs=["disconnected.txt"],
            cmd="echo other > disconnected.txt",
            name="disconnected",
        ),
    ]

    from dvc.state import StateNoop

    self.dvc.state = StateNoop()

    with patch.object(
        Stage, "reproduce", side_effect=stages
    ) as mock_reproduce:
        ret = main(["repro", "--all-pipelines"])
        self.assertEqual(ret, 0)
        self.assertEqual(mock_reproduce.call_count, 4)
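# The test above relies on unittest.mock accepting an iterable for
# `side_effect`: each call to the patched Stage.reproduce returns the
# next pre-built stage.  A self-contained sketch of that behavior:
from unittest.mock import patch

class Greeter:
    def greet(self):
        return "real"

with patch.object(Greeter, "greet", side_effect=["a", "b", "c"]) as mock_greet:
    g = Greeter()
    # Successive calls return successive items from the iterable.
    assert [g.greet(), g.greet(), g.greet()] == ["a", "b", "c"]
    assert mock_greet.call_count == 3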
class RemoteBASE(object): scheme = "base" path_cls = URLInfo REQUIRES = {} JOBS = 4 * cpu_count() PARAM_RELPATH = "relpath" CHECKSUM_DIR_SUFFIX = ".dir" CHECKSUM_JOBS = max(1, min(4, cpu_count() // 2)) DEFAULT_CACHE_TYPES = ["copy"] state = StateNoop() def __init__(self, repo, config): self.repo = repo self._check_requires(config) core = config.get(Config.SECTION_CORE, {}) self.checksum_jobs = core.get(Config.SECTION_CORE_CHECKSUM_JOBS, self.CHECKSUM_JOBS) self.protected = False self.no_traverse = config.get(Config.SECTION_REMOTE_NO_TRAVERSE, True) self._dir_info = {} types = config.get(Config.SECTION_CACHE_TYPE, None) if types: if isinstance(types, str): types = [t.strip() for t in types.split(",")] self.cache_types = types else: self.cache_types = copy(self.DEFAULT_CACHE_TYPES) self.cache_type_confirmed = False def _check_requires(self, config): import importlib missing = [] for package, module in self.REQUIRES.items(): try: importlib.import_module(module) except ImportError: missing.append(package) if not missing: return url = config.get(Config.SECTION_REMOTE_URL, "{}://".format(self.scheme)) msg = ("URL '{}' is supported but requires these missing " "dependencies: {}. If you have installed dvc using pip, " "choose one of these options to proceed: \n" "\n" " 1) Install specific missing dependencies:\n" " pip install {}\n" " 2) Install dvc package that includes those missing " "dependencies: \n" " pip install 'dvc[{}]'\n" " 3) Install dvc package with all possible " "dependencies included: \n" " pip install 'dvc[all]'\n" "\n" "If you have installed dvc from a binary package and you " "are still seeing this message, please report it to us " "using https://github.com/iterative/dvc/issues. Thank you!" ).format(url, missing, " ".join(missing), self.scheme) raise RemoteMissingDepsError(msg) def __repr__(self): return "{class_name}: '{path_info}'".format( class_name=type(self).__name__, path_info=self.path_info or "No path", ) @classmethod def supported(cls, config): if isinstance(config, (str, bytes)): url = config else: url = config[Config.SECTION_REMOTE_URL] # NOTE: silently skipping remote, calling code should handle that parsed = urlparse(url) return parsed.scheme == cls.scheme @property def cache(self): return getattr(self.repo.cache, self.scheme) def get_file_checksum(self, path_info): raise NotImplementedError def _calculate_checksums(self, file_infos): file_infos = list(file_infos) with ThreadPoolExecutor(max_workers=self.checksum_jobs) as executor: tasks = executor.map(self.get_file_checksum, file_infos) with Tqdm( tasks, total=len(file_infos), unit="md5", desc="Computing hashes (only done once)", ) as tasks: checksums = dict(zip(file_infos, tasks)) return checksums def _collect_dir(self, path_info): file_infos = set() for fname in self.walk_files(path_info): if DvcIgnore.DVCIGNORE_FILE == fname.name: raise DvcIgnoreInCollectedDirError(fname.parent) file_infos.add(fname) checksums = {fi: self.state.get(fi) for fi in file_infos} not_in_state = { fi for fi, checksum in checksums.items() if checksum is None } new_checksums = self._calculate_checksums(not_in_state) checksums.update(new_checksums) result = [ { self.PARAM_CHECKSUM: checksums[fi], # NOTE: this is lossy transformation: # "hey\there" -> "hey/there" # "hey/there" -> "hey/there" # The latter is fine filename on Windows, which # will transform to dir/file on back transform. 
# # Yes, this is a BUG, as long as we permit "/" in # filenames on Windows and "\" on Unix self.PARAM_RELPATH: fi.relative_to(path_info).as_posix(), } for fi in file_infos ] # Sorting the list by path to ensure reproducibility return sorted(result, key=itemgetter(self.PARAM_RELPATH)) def get_dir_checksum(self, path_info): dir_info = self._collect_dir(path_info) checksum, tmp_info = self._get_dir_info_checksum(dir_info) new_info = self.cache.checksum_to_path_info(checksum) if self.cache.changed_cache_file(checksum): self.cache.makedirs(new_info.parent) self.cache.move(tmp_info, new_info) self.state.save(path_info, checksum) self.state.save(new_info, checksum) return checksum def _get_dir_info_checksum(self, dir_info): tmp = tempfile.NamedTemporaryFile(delete=False).name with open(tmp, "w+") as fobj: json.dump(dir_info, fobj, sort_keys=True) from_info = PathInfo(tmp) to_info = self.cache.path_info / tmp_fname("") self.cache.upload(from_info, to_info, no_progress_bar=True) checksum = self.get_file_checksum(to_info) + self.CHECKSUM_DIR_SUFFIX return checksum, to_info def get_dir_cache(self, checksum): assert checksum dir_info = self._dir_info.get(checksum) if dir_info: return dir_info try: dir_info = self.load_dir_cache(checksum) except DirCacheError: dir_info = [] self._dir_info[checksum] = dir_info return dir_info def load_dir_cache(self, checksum): path_info = self.checksum_to_path_info(checksum) try: with self.cache.open(path_info, "r") as fobj: d = json.load(fobj) except (ValueError, FileNotFoundError) as exc: raise DirCacheError(checksum, cause=exc) if not isinstance(d, list): msg = "dir cache file format error '{}' [skipping the file]" logger.error(msg.format(relpath(path_info))) return [] for info in d: # NOTE: here is a BUG, see comment to .as_posix() below relative_path = PathInfo.from_posix(info[self.PARAM_RELPATH]) info[self.PARAM_RELPATH] = relative_path.fspath return d @classmethod def is_dir_checksum(cls, checksum): return checksum.endswith(cls.CHECKSUM_DIR_SUFFIX) def get_checksum(self, path_info): assert path_info.scheme == self.scheme if not self.exists(path_info): return None checksum = self.state.get(path_info) # If we have dir checksum in state db, but dir cache file is lost, # then we need to recollect the dir via .get_dir_checksum() call below, # see https://github.com/iterative/dvc/issues/2219 for context if (checksum and self.is_dir_checksum(checksum) and not self.exists(self.cache.checksum_to_path_info(checksum))): checksum = None if checksum: return checksum if self.isdir(path_info): checksum = self.get_dir_checksum(path_info) else: checksum = self.get_file_checksum(path_info) if checksum: self.state.save(path_info, checksum) return checksum def save_info(self, path_info): return {self.PARAM_CHECKSUM: self.get_checksum(path_info)} def changed(self, path_info, checksum_info): """Checks if data has changed. A file is considered changed if: - It doesn't exist on the working directory (was unlinked) - Checksum is not computed (saving a new file) - The checkusm stored in the State is different from the given one - There's no file in the cache Args: path_info: dict with path information. checksum: expected checksum for this data. Returns: bool: True if data has changed, False otherwise. 
""" logger.debug("checking if '{}'('{}') has changed.".format( path_info, checksum_info)) if not self.exists(path_info): logger.debug("'{}' doesn't exist.".format(path_info)) return True checksum = checksum_info.get(self.PARAM_CHECKSUM) if checksum is None: logger.debug("checksum for '{}' is missing.".format(path_info)) return True if self.changed_cache(checksum): logger.debug("cache for '{}'('{}') has changed.".format( path_info, checksum)) return True actual = self.get_checksum(path_info) if checksum != actual: logger.debug( "checksum '{}'(actual '{}') for '{}' has changed.".format( checksum, actual, path_info)) return True logger.debug("'{}' hasn't changed.".format(path_info)) return False def link(self, from_info, to_info): self._link(from_info, to_info, self.cache_types) def _link(self, from_info, to_info, link_types): assert self.isfile(from_info) self.makedirs(to_info.parent) self._try_links(from_info, to_info, link_types) @slow_link_guard def _try_links(self, from_info, to_info, link_types): while link_types: link_method = getattr(self, link_types[0]) try: self._do_link(from_info, to_info, link_method) self.cache_type_confirmed = True return except DvcException as exc: msg = "Cache type '{}' is not supported: {}" logger.debug(msg.format(link_types[0], str(exc))) del link_types[0] raise DvcException("no possible cache types left to try out.") def _do_link(self, from_info, to_info, link_method): if self.exists(to_info): raise DvcException("Link '{}' already exists!".format(to_info)) link_method(from_info, to_info) if self.protected: self.protect(to_info) msg = "Created {}'{}': {} -> {}".format( "protected " if self.protected else "", self.cache_types[0], from_info, to_info, ) logger.debug(msg) def _save_file(self, path_info, checksum, save_link=True): assert checksum cache_info = self.checksum_to_path_info(checksum) if self.changed_cache(checksum): self.move(path_info, cache_info) self.link(cache_info, path_info) elif self.iscopy(path_info) and self._cache_is_copy(path_info): # Default relink procedure involves unneeded copy if self.protected: self.protect(path_info) else: self.unprotect(path_info) else: self.remove(path_info) self.link(cache_info, path_info) if save_link: self.state.save_link(path_info) # we need to update path and cache, since in case of reflink, # or copy cache type moving original file results in updates on # next executed command, which causes md5 recalculation self.state.save(path_info, checksum) self.state.save(cache_info, checksum) def _cache_is_copy(self, path_info): """Checks whether cache uses copies.""" if self.cache_type_confirmed: return self.cache_types[0] == "copy" if set(self.cache_types) <= {"copy"}: return True workspace_file = path_info.with_name("." 
+ uuid()) test_cache_file = self.path_info / ".cache_type_test_file" if not self.exists(test_cache_file): with self.open(test_cache_file, "wb") as fobj: fobj.write(bytes(1)) try: self.link(test_cache_file, workspace_file) finally: self.remove(workspace_file) self.remove(test_cache_file) self.cache_type_confirmed = True return self.cache_types[0] == "copy" def _save_dir(self, path_info, checksum): cache_info = self.checksum_to_path_info(checksum) dir_info = self.get_dir_cache(checksum) for entry in dir_info: entry_info = path_info / entry[self.PARAM_RELPATH] entry_checksum = entry[self.PARAM_CHECKSUM] self._save_file(entry_info, entry_checksum, save_link=False) self.state.save_link(path_info) self.state.save(cache_info, checksum) self.state.save(path_info, checksum) def is_empty(self, path_info): return False def isfile(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. """ return True def isdir(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. """ return False def iscopy(self, path_info): """Check if this file is an independent copy.""" return False # We can't be sure by default def walk_files(self, path_info): """Return a generator with `PathInfo`s to all the files""" raise NotImplementedError @staticmethod def protect(path_info): pass def save(self, path_info, checksum_info): if path_info.scheme != self.scheme: raise RemoteActionNotImplemented( "save {} -> {}".format(path_info.scheme, self.scheme), self.scheme, ) checksum = checksum_info[self.PARAM_CHECKSUM] self._save(path_info, checksum) def _save(self, path_info, checksum): to_info = self.checksum_to_path_info(checksum) logger.debug("Saving '{}' to '{}'.".format(path_info, to_info)) if self.isdir(path_info): self._save_dir(path_info, checksum) return self._save_file(path_info, checksum) def _handle_transfer_exception(self, from_info, to_info, exception, operation): if isinstance(exception, OSError) and exception.errno == errno.EMFILE: raise exception msg = "failed to {} '{}' to '{}'".format(operation, from_info, to_info) logger.exception(msg) return 1 def upload(self, from_info, to_info, name=None, no_progress_bar=False): if not hasattr(self, "_upload"): raise RemoteActionNotImplemented("upload", self.scheme) if to_info.scheme != self.scheme: raise NotImplementedError if from_info.scheme != "local": raise NotImplementedError logger.debug("Uploading '{}' to '{}'".format(from_info, to_info)) name = name or from_info.name try: self._upload( from_info.fspath, to_info, name=name, no_progress_bar=no_progress_bar, ) except Exception as e: return self._handle_transfer_exception(from_info, to_info, e, "upload") return 0 def download( self, from_info, to_info, name=None, no_progress_bar=False, file_mode=None, dir_mode=None, ): if not hasattr(self, "_download"): raise RemoteActionNotImplemented("download", self.scheme) if from_info.scheme != self.scheme: raise NotImplementedError if to_info.scheme == self.scheme != "local": self.copy(from_info, to_info) return 0 if to_info.scheme != "local": raise NotImplementedError if self.isdir(from_info): return self._download_dir(from_info, to_info, name, no_progress_bar, file_mode, dir_mode) return self._download_file(from_info, to_info, name, no_progress_bar, file_mode, dir_mode) def _download_dir(self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode): from_infos = list(self.walk_files(from_info)) to_infos = (to_info / info.relative_to(from_info) for info in 
from_infos) with ThreadPoolExecutor(max_workers=self.JOBS) as executor: download_files = partial( self._download_file, name=name, no_progress_bar=True, file_mode=file_mode, dir_mode=dir_mode, ) futures = executor.map(download_files, from_infos, to_infos) with Tqdm( futures, total=len(from_infos), desc="Downloading directory", unit="Files", disable=no_progress_bar, ) as futures: return sum(futures) def _download_file(self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode): makedirs(to_info.parent, exist_ok=True, mode=dir_mode) logger.debug("Downloading '{}' to '{}'".format(from_info, to_info)) name = name or to_info.name tmp_file = tmp_fname(to_info) try: self._download(from_info, tmp_file, name=name, no_progress_bar=no_progress_bar) except Exception as e: return self._handle_transfer_exception(from_info, to_info, e, "download") move(tmp_file, to_info, mode=file_mode) return 0 def open(self, path_info, mode="r", encoding=None): if hasattr(self, "_generate_download_url"): get_url = partial(self._generate_download_url, path_info) return open_url(get_url, mode=mode, encoding=encoding) raise RemoteActionNotImplemented("open", self.scheme) def remove(self, path_info): raise RemoteActionNotImplemented("remove", self.scheme) def move(self, from_info, to_info): self.copy(from_info, to_info) self.remove(from_info) def copy(self, from_info, to_info): raise RemoteActionNotImplemented("copy", self.scheme) def symlink(self, from_info, to_info): raise RemoteActionNotImplemented("symlink", self.scheme) def hardlink(self, from_info, to_info): raise RemoteActionNotImplemented("hardlink", self.scheme) def reflink(self, from_info, to_info): raise RemoteActionNotImplemented("reflink", self.scheme) def exists(self, path_info): raise NotImplementedError def path_to_checksum(self, path): parts = self.path_cls(path).parts[-2:] if not (len(parts) == 2 and parts[0] and len(parts[0]) == 2): raise ValueError("Bad cache file path") return "".join(parts) def checksum_to_path_info(self, checksum): return self.path_info / checksum[0:2] / checksum[2:] def list_cache_paths(self): raise NotImplementedError def all(self): # NOTE: The list might be way too big(e.g. 100M entries, md5 for each # is 32 bytes, so ~3200Mb list) and we don't really need all of it at # the same time, so it makes sense to use a generator to gradually # iterate over it, without keeping all of it in memory. for path in self.list_cache_paths(): try: yield self.path_to_checksum(path) except ValueError: # We ignore all the non-cache looking files pass def gc(self, named_cache): used = self.extract_used_local_checksums(named_cache) if self.scheme != "": used.update(named_cache[self.scheme]) removed = False for checksum in self.all(): if checksum in used: continue path_info = self.checksum_to_path_info(checksum) self.remove(path_info) removed = True return removed def changed_cache_file(self, checksum): """Compare the given checksum with the (corresponding) actual one. 
- Use `State` as a cache for computed checksums + The entries are invalidated by taking into account the following: * mtime * inode * size * checksum - Remove the file from cache if it doesn't match the actual checksum """ cache_info = self.checksum_to_path_info(checksum) actual = self.get_checksum(cache_info) logger.debug("cache '{}' expected '{}' actual '{}'".format( str(cache_info), checksum, actual)) if not checksum or not actual: return True if actual.split(".")[0] == checksum.split(".")[0]: return False if self.exists(cache_info): logger.warning("corrupted cache file '{}'.".format(cache_info)) self.remove(cache_info) return True def _changed_dir_cache(self, checksum, path_info=None, filter_info=None): if self.changed_cache_file(checksum): return True if not (path_info and filter_info) and not self._changed_unpacked_dir(checksum): return False for entry in self.get_dir_cache(checksum): entry_checksum = entry[self.PARAM_CHECKSUM] if path_info and filter_info: entry_info = path_info / entry[self.PARAM_RELPATH] if not entry_info.isin_or_eq(filter_info): continue if self.changed_cache_file(entry_checksum): return True if not (path_info and filter_info): self._update_unpacked_dir(checksum) return False def changed_cache(self, checksum, path_info=None, filter_info=None): if self.is_dir_checksum(checksum): return self._changed_dir_cache(checksum, path_info=path_info, filter_info=filter_info) return self.changed_cache_file(checksum) def cache_exists(self, checksums, jobs=None, name=None): """Check if the given checksums are stored in the remote. There are two ways of performing this check: - Traverse: Get a list of all the files in the remote (traversing the cache directory) and compare it with the given checksums. - No traverse: For each given checksum, run the `exists` method and filter the checksums that aren't on the remote. This is done in parallel threads. It also shows a progress bar when performing the check. The reason for such an odd logic is that most of the remotes take much shorter time to just retrieve everything they have under a certain prefix (e.g. s3, gs, ssh, hdfs). Other remotes that can check if particular file exists much quicker, use their own implementation of cache_exists (see ssh, local). Returns: A list with checksums that were found in the remote """ if not self.no_traverse: return list(set(checksums) & set(self.all())) with Tqdm( desc="Querying " + ("cache in " + name if name else "remote cache"), total=len(checksums), unit="file", ) as pbar: def exists_with_progress(path_info): ret = self.exists(path_info) pbar.update_desc(str(path_info)) return ret with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor: path_infos = map(self.checksum_to_path_info, checksums) in_remote = executor.map(exists_with_progress, path_infos) ret = list(itertools.compress(checksums, in_remote)) return ret def already_cached(self, path_info): current = self.get_checksum(path_info) if not current: return False return not self.changed_cache(current) def safe_remove(self, path_info, force=False): if not self.exists(path_info): return if not force and not self.already_cached(path_info): msg = ("file '{}' is going to be removed." 
" Are you sure you want to proceed?".format(str(path_info))) if not prompt.confirm(msg): raise ConfirmRemoveError(str(path_info)) self.remove(path_info) def _checkout_file(self, path_info, checksum, force, progress_callback=None): """The file is changed we need to checkout a new copy""" cache_info = self.checksum_to_path_info(checksum) if self.exists(path_info): msg = "data '{}' exists. Removing before checkout." logger.warning(msg.format(str(path_info))) self.safe_remove(path_info, force=force) self.link(cache_info, path_info) self.state.save_link(path_info) self.state.save(path_info, checksum) if progress_callback: progress_callback(str(path_info)) def makedirs(self, path_info): """Optional: Implement only if the remote needs to create directories before copying/linking/moving data """ pass def _checkout_dir( self, path_info, checksum, force, progress_callback=None, relink=False, filter_info=None, ): # Create dir separately so that dir is created # even if there are no files in it if not self.exists(path_info): self.makedirs(path_info) dir_info = self.get_dir_cache(checksum) logger.debug("Linking directory '{}'.".format(path_info)) for entry in dir_info: relative_path = entry[self.PARAM_RELPATH] entry_checksum = entry[self.PARAM_CHECKSUM] entry_cache_info = self.checksum_to_path_info(entry_checksum) entry_info = path_info / relative_path if filter_info and not entry_info.isin_or_eq(filter_info): continue entry_checksum_info = {self.PARAM_CHECKSUM: entry_checksum} if relink or self.changed(entry_info, entry_checksum_info): self.safe_remove(entry_info, force=force) self.link(entry_cache_info, entry_info) self.state.save(entry_info, entry_checksum) if progress_callback: progress_callback(str(entry_info)) self._remove_redundant_files(path_info, dir_info, force) self.state.save_link(path_info) self.state.save(path_info, checksum) def _remove_redundant_files(self, path_info, dir_info, force): existing_files = set(self.walk_files(path_info)) needed_files = { path_info / entry[self.PARAM_RELPATH] for entry in dir_info } for path in existing_files - needed_files: self.safe_remove(path, force) def checkout( self, path_info, checksum_info, force=False, progress_callback=None, relink=False, filter_info=None, ): if path_info.scheme not in ["local", self.scheme]: raise NotImplementedError checksum = checksum_info.get(self.PARAM_CHECKSUM) failed = None skip = False if not checksum: logger.warning("No checksum info found for '{}'. " "It won't be created.".format(str(path_info))) self.safe_remove(path_info, force=force) failed = path_info elif not relink and not self.changed(path_info, checksum_info): msg = "Data '{}' didn't change." logger.debug(msg.format(str(path_info))) skip = True elif self.changed_cache(checksum, path_info=path_info, filter_info=filter_info): msg = "Cache '{}' not found. File '{}' won't be created." logger.warning(msg.format(checksum, str(path_info))) self.safe_remove(path_info, force=force) failed = path_info if failed or skip: if progress_callback: progress_callback( str(path_info), self.get_files_number(self.path_info, checksum, filter_info), ) return failed msg = "Checking out '{}' with cache '{}'." 
logger.debug(msg.format(str(path_info), checksum)) self._checkout(path_info, checksum, force, progress_callback, relink, filter_info) def _checkout( self, path_info, checksum, force=False, progress_callback=None, relink=False, filter_info=None, ): if not self.is_dir_checksum(checksum): return self._checkout_file(path_info, checksum, force, progress_callback=progress_callback) return self._checkout_dir(path_info, checksum, force, progress_callback, relink, filter_info) def get_files_number(self, path_info, checksum, filter_info): from funcy.py3 import ilen if not checksum: return 0 if not self.is_dir_checksum(checksum): return 1 if not filter_info: return len(self.get_dir_cache(checksum)) return ilen( filter_info.isin_or_eq(path_info / entry[self.PARAM_CHECKSUM]) for entry in self.get_dir_cache(checksum)) @staticmethod def unprotect(path_info): pass def _get_unpacked_dir_names(self, checksums): return set() def extract_used_local_checksums(self, named_cache): used = set(named_cache["local"]) unpacked = self._get_unpacked_dir_names(used) return used | unpacked def _changed_unpacked_dir(self, checksum): return True def _update_unpacked_dir(self, checksum): pass
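# The cache addressing used throughout RemoteBASE above
# (path_to_checksum / checksum_to_path_info) is a two-level fan-out:
# the first two hex characters of the checksum become a directory and
# the rest become the file name.  A standalone sketch of that round trip:
import os

def checksum_to_relpath(checksum):
    return os.path.join(checksum[0:2], checksum[2:])

def relpath_to_checksum(relpath):
    head, tail = os.path.split(relpath)
    if len(head) != 2:
        raise ValueError("Bad cache file path")
    return head + tail

md5 = "d3b07384d113edec49eaa6238ad5ff00"
assert checksum_to_relpath(md5) == os.path.join("d3", "b07384d113edec49eaa6238ad5ff00")
assert relpath_to_checksum(checksum_to_relpath(md5)) == md5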
def __init__(
    self,
    root_dir=None,
    scm=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
):
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.lock import LockNoop, make_lock
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.repo.stage import StageLoad
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop
    from dvc.tree.local import LocalTree

    self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
        root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized
    )

    tree_kwargs = {"use_dvcignore": True, "dvcignore_root": self.root_dir}
    if scm:
        self.tree = scm.get_tree(rev, **tree_kwargs)
    else:
        self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs)

    self.config = Config(self.dvc_dir, tree=self.tree)
    self._scm = scm

    # used by RepoTree to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cache = Cache(self)
    self.cloud = DataCloud(self)
    self.stage = StageLoad(self)

    if scm or not self.dvc_dir:
        self.lock = LockNoop()
        self.state = StateNoop()
    else:
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self)
        self.stage_cache = StageCache(self)

        self._ignore()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)

    self.stage_collection_error_handler = None
    self._lock_depth = 0
def __init__(
    self,
    root_dir=None,
    scm=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
    config=None,
    url=None,
    repo_factory=None,
):
    from dvc.config import Config
    from dvc.data_cloud import DataCloud
    from dvc.fs.local import LocalFileSystem
    from dvc.lock import LockNoop, make_lock
    from dvc.machine import MachineManager
    from dvc.objects.db import ODBManager
    from dvc.repo.live import Live
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.repo.stage import StageLoad
    from dvc.scm import SCM
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop

    self.url = url
    self._fs_conf = {"repo_factory": repo_factory}

    if rev and not scm:
        scm = SCM(root_dir or os.curdir)

    self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
        root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized
    )

    if scm:
        self._fs = scm.get_fs(rev)
    else:
        self._fs = LocalFileSystem(url=self.root_dir)

    self.config = Config(self.dvc_dir, fs=self.fs, config=config)
    self._uninitialized = uninitialized
    self._scm = scm

    # used by RepoFileSystem to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cloud = DataCloud(self)
    self.stage = StageLoad(self)

    if scm or not self.dvc_dir:
        self.lock = LockNoop()
        self.state = StateNoop()
        self.odb = ODBManager(self)
    else:
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self.root_dir, self.tmp_dir, self.dvcignore)
        self.odb = ODBManager(self)

        self.stage_cache = StageCache(self)

        self._ignore()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)
    self.live = Live(self)

    if self.tmp_dir and (
        self.config["feature"].get("machine", False) or env2bool("DVC_TEST")
    ):
        self.machine = MachineManager(self)
    else:
        self.machine = None

    self.stage_collection_error_handler: Optional[
        Callable[[str, Exception], None]
    ] = None
    self._lock_depth = 0
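# Usage sketch of the branching above (illustrative only: assumes an
# initialized DVC repository in the current directory and the Repo class
# that owns this __init__).
repo = Repo(root_dir=".")
# .dvc/ found, no rev -> LocalFileSystem, real file lock, SQLite-backed State.

pinned = Repo(root_dir=".", rev="HEAD~1")
# rev without scm -> SCM(root_dir) is created, fs = scm.get_fs(rev), and the
# read-only branch assigns LockNoop(), StateNoop(), and ODBManager().
assert type(pinned.state).__name__ == "StateNoop"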
class RemoteBASE(object): scheme = "base" path_cls = URLInfo REQUIRES = {} JOBS = 4 * cpu_count() PARAM_RELPATH = "relpath" CHECKSUM_DIR_SUFFIX = ".dir" CHECKSUM_JOBS = max(1, min(4, cpu_count() // 2)) DEFAULT_CACHE_TYPES = ["copy"] DEFAULT_VERIFY = False LIST_OBJECT_PAGE_SIZE = 1000 TRAVERSE_WEIGHT_MULTIPLIER = 20 TRAVERSE_PREFIX_LEN = 3 TRAVERSE_THRESHOLD_SIZE = 500000 CAN_TRAVERSE = True CACHE_MODE = None SHARED_MODE_MAP = {None: (None, None), "group": (None, None)} state = StateNoop() def __init__(self, repo, config): self.repo = repo self._check_requires(config) shared = config.get("shared") self._file_mode, self._dir_mode = self.SHARED_MODE_MAP[shared] self.checksum_jobs = ( config.get("checksum_jobs") or (self.repo and self.repo.config["core"].get("checksum_jobs")) or self.CHECKSUM_JOBS) self.verify = config.get("verify", self.DEFAULT_VERIFY) self._dir_info = {} self.cache_types = config.get("type") or copy(self.DEFAULT_CACHE_TYPES) self.cache_type_confirmed = False @classmethod def get_missing_deps(cls): import importlib missing = [] for package, module in cls.REQUIRES.items(): try: importlib.import_module(module) except ImportError: missing.append(package) return missing def _check_requires(self, config): missing = self.get_missing_deps() if not missing: return url = config.get("url", "{}://".format(self.scheme)) msg = ("URL '{}' is supported but requires these missing " "dependencies: {}. If you have installed dvc using pip, " "choose one of these options to proceed: \n" "\n" " 1) Install specific missing dependencies:\n" " pip install {}\n" " 2) Install dvc package that includes those missing " "dependencies: \n" " pip install 'dvc[{}]'\n" " 3) Install dvc package with all possible " "dependencies included: \n" " pip install 'dvc[all]'\n" "\n" "If you have installed dvc from a binary package and you " "are still seeing this message, please report it to us " "using https://github.com/iterative/dvc/issues. Thank you!" 
).format(url, missing, " ".join(missing), self.scheme) raise RemoteMissingDepsError(msg) def __repr__(self): return "{class_name}: '{path_info}'".format( class_name=type(self).__name__, path_info=self.path_info or "No path", ) @classmethod def supported(cls, config): if isinstance(config, (str, bytes)): url = config else: url = config["url"] # NOTE: silently skipping remote, calling code should handle that parsed = urlparse(url) return parsed.scheme == cls.scheme @property def cache(self): return getattr(self.repo.cache, self.scheme) def get_file_checksum(self, path_info): raise NotImplementedError def _calculate_checksums(self, file_infos): file_infos = list(file_infos) with Tqdm( total=len(file_infos), unit="md5", desc="Computing file/dir hashes (only done once)", ) as pbar: worker = pbar.wrap_fn(self.get_file_checksum) with ThreadPoolExecutor( max_workers=self.checksum_jobs) as executor: tasks = executor.map(worker, file_infos) checksums = dict(zip(file_infos, tasks)) return checksums def _collect_dir(self, path_info): file_infos = set() for fname in self.walk_files(path_info): if DvcIgnore.DVCIGNORE_FILE == fname.name: raise DvcIgnoreInCollectedDirError(fname.parent) file_infos.add(fname) checksums = {fi: self.state.get(fi) for fi in file_infos} not_in_state = { fi for fi, checksum in checksums.items() if checksum is None } new_checksums = self._calculate_checksums(not_in_state) checksums.update(new_checksums) result = [ { self.PARAM_CHECKSUM: checksums[fi], # NOTE: this is lossy transformation: # "hey\there" -> "hey/there" # "hey/there" -> "hey/there" # The latter is fine filename on Windows, which # will transform to dir/file on back transform. # # Yes, this is a BUG, as long as we permit "/" in # filenames on Windows and "\" on Unix self.PARAM_RELPATH: fi.relative_to(path_info).as_posix(), } for fi in file_infos ] # Sorting the list by path to ensure reproducibility return sorted(result, key=itemgetter(self.PARAM_RELPATH)) def get_dir_checksum(self, path_info): if not self.cache: raise RemoteCacheRequiredError(path_info) dir_info = self._collect_dir(path_info) checksum, tmp_info = self._get_dir_info_checksum(dir_info) new_info = self.cache.checksum_to_path_info(checksum) if self.cache.changed_cache_file(checksum): self.cache.makedirs(new_info.parent) self.cache.move(tmp_info, new_info, mode=self.CACHE_MODE) self.state.save(path_info, checksum) self.state.save(new_info, checksum) return checksum def _get_dir_info_checksum(self, dir_info): tmp = tempfile.NamedTemporaryFile(delete=False).name with open(tmp, "w+") as fobj: json.dump(dir_info, fobj, sort_keys=True) from_info = PathInfo(tmp) to_info = self.cache.path_info / tmp_fname("") self.cache.upload(from_info, to_info, no_progress_bar=True) checksum = self.get_file_checksum(to_info) + self.CHECKSUM_DIR_SUFFIX return checksum, to_info def get_dir_cache(self, checksum): assert checksum dir_info = self._dir_info.get(checksum) if dir_info: return dir_info try: dir_info = self.load_dir_cache(checksum) except DirCacheError: dir_info = [] self._dir_info[checksum] = dir_info return dir_info def load_dir_cache(self, checksum): path_info = self.checksum_to_path_info(checksum) try: with self.cache.open(path_info, "r") as fobj: d = json.load(fobj) except (ValueError, FileNotFoundError) as exc: raise DirCacheError(checksum) from exc if not isinstance(d, list): logger.error( "dir cache file format error '%s' [skipping the file]", path_info, ) return [] for info in d: # NOTE: here is a BUG, see comment to .as_posix() below relative_path = 
PathInfo.from_posix(info[self.PARAM_RELPATH]) info[self.PARAM_RELPATH] = relative_path.fspath return d @classmethod def is_dir_checksum(cls, checksum): return checksum.endswith(cls.CHECKSUM_DIR_SUFFIX) def get_checksum(self, path_info): assert path_info.scheme == self.scheme if not self.exists(path_info): return None checksum = self.state.get(path_info) # If we have dir checksum in state db, but dir cache file is lost, # then we need to recollect the dir via .get_dir_checksum() call below, # see https://github.com/iterative/dvc/issues/2219 for context if (checksum and self.is_dir_checksum(checksum) and not self.exists(self.cache.checksum_to_path_info(checksum))): checksum = None if checksum: return checksum if self.isdir(path_info): checksum = self.get_dir_checksum(path_info) else: checksum = self.get_file_checksum(path_info) if checksum: self.state.save(path_info, checksum) return checksum def save_info(self, path_info): return {self.PARAM_CHECKSUM: self.get_checksum(path_info)} def changed(self, path_info, checksum_info): """Checks if data has changed. A file is considered changed if: - It doesn't exist on the working directory (was unlinked) - Hash value is not computed (saving a new file) - The hash value stored is different from the given one - There's no file in the cache Args: path_info: dict with path information. checksum: expected hash value for this data. Returns: bool: True if data has changed, False otherwise. """ logger.debug("checking if '%s'('%s') has changed.", path_info, checksum_info) if not self.exists(path_info): logger.debug("'%s' doesn't exist.", path_info) return True checksum = checksum_info.get(self.PARAM_CHECKSUM) if checksum is None: logger.debug("hash value for '%s' is missing.", path_info) return True if self.changed_cache(checksum): logger.debug("cache for '%s'('%s') has changed.", path_info, checksum) return True actual = self.get_checksum(path_info) if checksum != actual: logger.debug( "hash value '%s' for '%s' has changed (actual '%s').", checksum, actual, path_info, ) return True logger.debug("'%s' hasn't changed.", path_info) return False def link(self, from_info, to_info): self._link(from_info, to_info, self.cache_types) def _link(self, from_info, to_info, link_types): assert self.isfile(from_info) self.makedirs(to_info.parent) self._try_links(from_info, to_info, link_types) def _verify_link(self, path_info, link_type): if self.cache_type_confirmed: return is_link = getattr(self, "is_{}".format(link_type), None) if is_link and not is_link(path_info): self.remove(path_info) raise DvcException("failed to verify {}".format(link_type)) self.cache_type_confirmed = True @slow_link_guard def _try_links(self, from_info, to_info, link_types): while link_types: link_method = getattr(self, link_types[0]) try: self._do_link(from_info, to_info, link_method) self._verify_link(to_info, link_types[0]) return except DvcException as exc: logger.debug("Cache type '%s' is not supported: %s", link_types[0], exc) del link_types[0] raise DvcException("no possible cache types left to try out.") def _do_link(self, from_info, to_info, link_method): if self.exists(to_info): raise DvcException("Link '{}' already exists!".format(to_info)) link_method(from_info, to_info) logger.debug( "Created '%s': %s -> %s", self.cache_types[0], from_info, to_info, ) def _save_file(self, path_info, checksum, save_link=True): assert checksum cache_info = self.checksum_to_path_info(checksum) if self.changed_cache(checksum): self.move(path_info, cache_info, mode=self.CACHE_MODE) self.link(cache_info, 
path_info) elif self.iscopy(path_info) and self._cache_is_copy(path_info): # Default relink procedure involves unneeded copy self.unprotect(path_info) else: self.remove(path_info) self.link(cache_info, path_info) if save_link: self.state.save_link(path_info) # we need to update path and cache, since in case of reflink, # or copy cache type moving original file results in updates on # next executed command, which causes md5 recalculation self.state.save(path_info, checksum) self.state.save(cache_info, checksum) def _cache_is_copy(self, path_info): """Checks whether cache uses copies.""" if self.cache_type_confirmed: return self.cache_types[0] == "copy" if set(self.cache_types) <= {"copy"}: return True workspace_file = path_info.with_name("." + uuid()) test_cache_file = self.path_info / ".cache_type_test_file" if not self.exists(test_cache_file): with self.open(test_cache_file, "wb") as fobj: fobj.write(bytes(1)) try: self.link(test_cache_file, workspace_file) finally: self.remove(workspace_file) self.remove(test_cache_file) self.cache_type_confirmed = True return self.cache_types[0] == "copy" def _save_dir(self, path_info, checksum, save_link=True): cache_info = self.checksum_to_path_info(checksum) dir_info = self.get_dir_cache(checksum) for entry in Tqdm(dir_info, desc="Saving " + path_info.name, unit="file"): entry_info = path_info / entry[self.PARAM_RELPATH] entry_checksum = entry[self.PARAM_CHECKSUM] self._save_file(entry_info, entry_checksum, save_link=False) if save_link: self.state.save_link(path_info) self.state.save(cache_info, checksum) self.state.save(path_info, checksum) def is_empty(self, path_info): return False def isfile(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. """ return True def isdir(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. 
""" return False def iscopy(self, path_info): """Check if this file is an independent copy.""" return False # We can't be sure by default def walk_files(self, path_info): """Return a generator with `PathInfo`s to all the files""" raise NotImplementedError @staticmethod def protect(path_info): pass def save(self, path_info, checksum_info, save_link=True): if path_info.scheme != self.scheme: raise RemoteActionNotImplemented( "save {} -> {}".format(path_info.scheme, self.scheme), self.scheme, ) checksum = checksum_info[self.PARAM_CHECKSUM] self._save(path_info, checksum, save_link) def _save(self, path_info, checksum, save_link=True): to_info = self.checksum_to_path_info(checksum) logger.debug("Saving '%s' to '%s'.", path_info, to_info) if self.isdir(path_info): self._save_dir(path_info, checksum, save_link) return self._save_file(path_info, checksum, save_link) def _handle_transfer_exception(self, from_info, to_info, exception, operation): if isinstance(exception, OSError) and exception.errno == errno.EMFILE: raise exception logger.exception("failed to %s '%s' to '%s'", operation, from_info, to_info) return 1 def upload(self, from_info, to_info, name=None, no_progress_bar=False): if not hasattr(self, "_upload"): raise RemoteActionNotImplemented("upload", self.scheme) if to_info.scheme != self.scheme: raise NotImplementedError if from_info.scheme != "local": raise NotImplementedError logger.debug("Uploading '%s' to '%s'", from_info, to_info) name = name or from_info.name try: self._upload( from_info.fspath, to_info, name=name, no_progress_bar=no_progress_bar, ) except Exception as e: return self._handle_transfer_exception(from_info, to_info, e, "upload") return 0 def download( self, from_info, to_info, name=None, no_progress_bar=False, file_mode=None, dir_mode=None, ): if not hasattr(self, "_download"): raise RemoteActionNotImplemented("download", self.scheme) if from_info.scheme != self.scheme: raise NotImplementedError if to_info.scheme == self.scheme != "local": self.copy(from_info, to_info) return 0 if to_info.scheme != "local": raise NotImplementedError if self.isdir(from_info): return self._download_dir(from_info, to_info, name, no_progress_bar, file_mode, dir_mode) return self._download_file(from_info, to_info, name, no_progress_bar, file_mode, dir_mode) def _download_dir(self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode): from_infos = list(self.walk_files(from_info)) to_infos = (to_info / info.relative_to(from_info) for info in from_infos) with Tqdm( total=len(from_infos), desc="Downloading directory", unit="Files", disable=no_progress_bar, ) as pbar: download_files = pbar.wrap_fn( partial( self._download_file, name=name, no_progress_bar=True, file_mode=file_mode, dir_mode=dir_mode, )) with ThreadPoolExecutor(max_workers=self.JOBS) as executor: futures = executor.map(download_files, from_infos, to_infos) return sum(futures) def _download_file(self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode): makedirs(to_info.parent, exist_ok=True, mode=dir_mode) logger.debug("Downloading '%s' to '%s'", from_info, to_info) name = name or to_info.name tmp_file = tmp_fname(to_info) try: self._download(from_info, tmp_file, name=name, no_progress_bar=no_progress_bar) except Exception as e: return self._handle_transfer_exception(from_info, to_info, e, "download") move(tmp_file, to_info, mode=file_mode) return 0 def open(self, path_info, mode="r", encoding=None): if hasattr(self, "_generate_download_url"): get_url = partial(self._generate_download_url, path_info) 
return open_url(get_url, mode=mode, encoding=encoding) raise RemoteActionNotImplemented("open", self.scheme) def remove(self, path_info): raise RemoteActionNotImplemented("remove", self.scheme) def move(self, from_info, to_info, mode=None): assert mode is None self.copy(from_info, to_info) self.remove(from_info) def copy(self, from_info, to_info): raise RemoteActionNotImplemented("copy", self.scheme) def symlink(self, from_info, to_info): raise RemoteActionNotImplemented("symlink", self.scheme) def hardlink(self, from_info, to_info): raise RemoteActionNotImplemented("hardlink", self.scheme) def reflink(self, from_info, to_info): raise RemoteActionNotImplemented("reflink", self.scheme) def exists(self, path_info): raise NotImplementedError def path_to_checksum(self, path): parts = self.path_cls(path).parts[-2:] if not (len(parts) == 2 and parts[0] and len(parts[0]) == 2): raise ValueError("Bad cache file path '{}'".format(path)) return "".join(parts) def checksum_to_path_info(self, checksum): return self.path_info / checksum[0:2] / checksum[2:] def list_cache_paths(self, prefix=None, progress_callback=None): raise NotImplementedError def cache_checksums(self, prefix=None, progress_callback=None): """Iterate over remote cache checksums. If `prefix` is specified, only checksums which begin with `prefix` will be returned. """ for path in self.list_cache_paths(prefix, progress_callback): try: yield self.path_to_checksum(path) except ValueError: logger.debug("'%s' doesn't look like a cache file, skipping", path) def all(self, jobs=None, name=None): """Iterate over all checksums in the remote cache. Checksums will be fetched in parallel threads according to prefix (except for small remotes) and a progress bar will be displayed. """ logger.debug("Fetching all checksums from '{}'".format( name if name else "remote cache")) if not self.CAN_TRAVERSE: return self.cache_checksums() remote_size, remote_checksums = self._estimate_cache_size(name=name) return self._cache_checksums_traverse(remote_size, remote_checksums, jobs, name) def gc(self, named_cache, jobs=None): used = self.extract_used_local_checksums(named_cache) if self.scheme != "": used.update(named_cache.scheme_keys(self.scheme)) removed = False # checksums must be sorted to ensure we always remove .dir files first for checksum in sorted( self.all(jobs, str(self.path_info)), key=self.is_dir_checksum, reverse=True, ): if checksum in used: continue path_info = self.checksum_to_path_info(checksum) if self.is_dir_checksum(checksum): self._remove_unpacked_dir(checksum) self.remove(path_info) removed = True return removed def is_protected(self, path_info): return False def changed_cache_file(self, checksum): """Compare the given checksum with the (corresponding) actual one. 
- Use `State` as a cache for computed checksums + The entries are invalidated by taking into account the following: * mtime * inode * size * checksum - Remove the file from cache if it doesn't match the actual checksum """ cache_info = self.checksum_to_path_info(checksum) if self.is_protected(cache_info): logger.debug("Assuming '%s' is unchanged since it is read-only", cache_info) return False actual = self.get_checksum(cache_info) logger.debug( "cache '%s' expected '%s' actual '%s'", cache_info, checksum, actual, ) if not checksum or not actual: return True if actual.split(".")[0] == checksum.split(".")[0]: # making cache file read-only so we don't need to check it # next time self.protect(cache_info) return False if self.exists(cache_info): logger.warning("corrupted cache file '%s'.", cache_info) self.remove(cache_info) return True def _changed_dir_cache(self, checksum, path_info=None, filter_info=None): if self.changed_cache_file(checksum): return True if not (path_info and filter_info) and not self._changed_unpacked_dir(checksum): return False for entry in self.get_dir_cache(checksum): entry_checksum = entry[self.PARAM_CHECKSUM] if path_info and filter_info: entry_info = path_info / entry[self.PARAM_RELPATH] if not entry_info.isin_or_eq(filter_info): continue if self.changed_cache_file(entry_checksum): return True if not (path_info and filter_info): self._update_unpacked_dir(checksum) return False def changed_cache(self, checksum, path_info=None, filter_info=None): if self.is_dir_checksum(checksum): return self._changed_dir_cache(checksum, path_info=path_info, filter_info=filter_info) return self.changed_cache_file(checksum) def cache_exists(self, checksums, jobs=None, name=None): """Check if the given checksums are stored in the remote. There are two ways of performing this check: - Traverse method: Get a list of all the files in the remote (traversing the cache directory) and compare it with the given checksums. Cache entries will be retrieved in parallel threads according to prefix (i.e. entries starting with, "00...", "01...", and so on) and a progress bar will be displayed. - Exists method: For each given checksum, run the `exists` method and filter the checksums that aren't on the remote. This is done in parallel threads. It also shows a progress bar when performing the check. The reason for such an odd logic is that most of the remotes take much shorter time to just retrieve everything they have under a certain prefix (e.g. s3, gs, ssh, hdfs). Other remotes that can check if particular file exists much quicker, use their own implementation of cache_exists (see ssh, local). Which method to use will be automatically determined after estimating the size of the remote cache, and comparing the estimated size with len(checksums). To estimate the size of the remote cache, we fetch a small subset of cache entries (i.e. entries starting with "00..."). Based on the number of entries in that subset, the size of the full cache can be estimated, since the cache is evenly distributed according to checksum. 
Returns: A list with checksums that were found in the remote """ # Remotes which do not use traverse prefix should override # cache_exists() (see ssh, local) assert self.TRAVERSE_PREFIX_LEN >= 2 if len(checksums) == 1 or not self.CAN_TRAVERSE: return self._cache_object_exists(checksums, jobs, name) checksums = frozenset(checksums) # Max remote size allowed for us to use traverse method remote_size, remote_checksums = self._estimate_cache_size( checksums, name) traverse_pages = remote_size / self.LIST_OBJECT_PAGE_SIZE # For sufficiently large remotes, traverse must be weighted to account # for performance overhead from large lists/sets. # From testing with S3, for remotes with 1M+ files, object_exists is # faster until len(checksums) is at least 10k~100k if remote_size > self.TRAVERSE_THRESHOLD_SIZE: traverse_weight = traverse_pages * self.TRAVERSE_WEIGHT_MULTIPLIER else: traverse_weight = traverse_pages if len(checksums) < traverse_weight: logger.debug( "Large remote ('{}' checksums < '{}' traverse weight), " "using object_exists for remaining checksums".format( len(checksums), traverse_weight)) return list(checksums & remote_checksums) + self._cache_object_exists( checksums - remote_checksums, jobs, name) logger.debug("Querying {} checksums via traverse".format( len(checksums))) remote_checksums = self._cache_checksums_traverse( remote_size, remote_checksums, jobs, name) return list(checksums & set(remote_checksums)) def _checksums_with_limit(self, limit, prefix=None, progress_callback=None): count = 0 for checksum in self.cache_checksums(prefix, progress_callback): yield checksum count += 1 if count > limit: logger.debug( "`cache_checksums()` returned max '{}' checksums, " "skipping remaining results".format(limit)) return def _max_estimation_size(self, checksums): # Max remote size allowed for us to use traverse method return max( self.TRAVERSE_THRESHOLD_SIZE, len(checksums) / self.TRAVERSE_WEIGHT_MULTIPLIER * self.LIST_OBJECT_PAGE_SIZE, ) def _estimate_cache_size(self, checksums=None, name=None): """Estimate remote cache size based on number of entries beginning with "00..." prefix. """ prefix = "0" * self.TRAVERSE_PREFIX_LEN total_prefixes = pow(16, self.TRAVERSE_PREFIX_LEN) if checksums: max_checksums = self._max_estimation_size(checksums) else: max_checksums = None with Tqdm( desc="Estimating size of " + ("cache in '{}'".format(name) if name else "remote cache"), unit="file", ) as pbar: def update(n=1): pbar.update(n * total_prefixes) if max_checksums: checksums = self._checksums_with_limit( max_checksums / total_prefixes, prefix, update) else: checksums = self.cache_checksums(prefix, update) remote_checksums = set(checksums) if remote_checksums: remote_size = total_prefixes * len(remote_checksums) else: remote_size = total_prefixes logger.debug("Estimated remote size: {} files".format(remote_size)) return remote_size, remote_checksums def _cache_checksums_traverse(self, remote_size, remote_checksums, jobs=None, name=None): """Iterate over all checksums in the remote cache. Checksums are fetched in parallel according to prefix, except in cases where the remote size is very small. All checksums from the remote (including any from the size estimation step passed via the `remote_checksums` argument) will be returned. NOTE: For large remotes the list of checksums will be very big(e.g. 
100M entries, md5 for each is 32 bytes, so ~3200Mb list) and we don't really need all of it at the same time, so it makes sense to use a generator to gradually iterate over it, without keeping all of it in memory. """ num_pages = remote_size / self.LIST_OBJECT_PAGE_SIZE if num_pages < 256 / self.JOBS: # Fetching prefixes in parallel requires at least 255 more # requests, for small enough remotes it will be faster to fetch # entire cache without splitting it into prefixes. # # NOTE: this ends up re-fetching checksums that were already # fetched during remote size estimation traverse_prefixes = [None] initial = 0 else: yield from remote_checksums initial = len(remote_checksums) traverse_prefixes = ["{:02x}".format(i) for i in range(1, 256)] if self.TRAVERSE_PREFIX_LEN > 2: traverse_prefixes += [ "{0:0{1}x}".format(i, self.TRAVERSE_PREFIX_LEN) for i in range(1, pow(16, self.TRAVERSE_PREFIX_LEN - 2)) ] with Tqdm( desc="Querying " + ("cache in '{}'".format(name) if name else "remote cache"), total=remote_size, initial=initial, unit="file", ) as pbar: def list_with_update(prefix): return list( self.cache_checksums(prefix=prefix, progress_callback=pbar.update)) with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor: in_remote = executor.map( list_with_update, traverse_prefixes, ) yield from itertools.chain.from_iterable(in_remote) def _cache_object_exists(self, checksums, jobs=None, name=None): logger.debug("Querying {} checksums via object_exists".format( len(checksums))) with Tqdm( desc="Querying " + ("cache in " + name if name else "remote cache"), total=len(checksums), unit="file", ) as pbar: def exists_with_progress(path_info): ret = self.exists(path_info) pbar.update_desc(str(path_info)) return ret with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor: path_infos = map(self.checksum_to_path_info, checksums) in_remote = executor.map(exists_with_progress, path_infos) ret = list(itertools.compress(checksums, in_remote)) return ret def already_cached(self, path_info): current = self.get_checksum(path_info) if not current: return False return not self.changed_cache(current) def safe_remove(self, path_info, force=False): if not self.exists(path_info): return if not force and not self.already_cached(path_info): msg = ("file '{}' is going to be removed." 
" Are you sure you want to proceed?".format(str(path_info))) if not prompt.confirm(msg): raise ConfirmRemoveError(str(path_info)) self.remove(path_info) def _checkout_file(self, path_info, checksum, force, progress_callback=None, relink=False): """The file is changed we need to checkout a new copy""" added, modified = True, False cache_info = self.checksum_to_path_info(checksum) if self.exists(path_info): logger.debug("data '%s' will be replaced.", path_info) self.safe_remove(path_info, force=force) added, modified = False, True self.link(cache_info, path_info) self.state.save_link(path_info) self.state.save(path_info, checksum) if progress_callback: progress_callback(str(path_info)) return added, modified and not relink def makedirs(self, path_info): """Optional: Implement only if the remote needs to create directories before copying/linking/moving data """ def _checkout_dir( self, path_info, checksum, force, progress_callback=None, relink=False, filter_info=None, ): added, modified = False, False # Create dir separately so that dir is created # even if there are no files in it if not self.exists(path_info): added = True self.makedirs(path_info) dir_info = self.get_dir_cache(checksum) logger.debug("Linking directory '%s'.", path_info) for entry in dir_info: relative_path = entry[self.PARAM_RELPATH] entry_checksum = entry[self.PARAM_CHECKSUM] entry_cache_info = self.checksum_to_path_info(entry_checksum) entry_info = path_info / relative_path if filter_info and not entry_info.isin_or_eq(filter_info): continue entry_checksum_info = {self.PARAM_CHECKSUM: entry_checksum} if relink or self.changed(entry_info, entry_checksum_info): modified = True self.safe_remove(entry_info, force=force) self.link(entry_cache_info, entry_info) self.state.save(entry_info, entry_checksum) if progress_callback: progress_callback(str(entry_info)) modified = (self._remove_redundant_files(path_info, dir_info, force) or modified) self.state.save_link(path_info) self.state.save(path_info, checksum) # relink is not modified, assume it as nochange return added, not added and modified and not relink def _remove_redundant_files(self, path_info, dir_info, force): existing_files = set(self.walk_files(path_info)) needed_files = { path_info / entry[self.PARAM_RELPATH] for entry in dir_info } redundant_files = existing_files - needed_files for path in redundant_files: self.safe_remove(path, force) return bool(redundant_files) def checkout( self, path_info, checksum_info, force=False, progress_callback=None, relink=False, filter_info=None, ): if path_info.scheme not in ["local", self.scheme]: raise NotImplementedError checksum = checksum_info.get(self.PARAM_CHECKSUM) failed = None skip = False if not checksum: logger.warning( "No file hash info found for '%s'. " "It won't be created.", path_info, ) self.safe_remove(path_info, force=force) failed = path_info elif not relink and not self.changed(path_info, checksum_info): logger.debug("Data '%s' didn't change.", path_info) skip = True elif self.changed_cache(checksum, path_info=path_info, filter_info=filter_info): logger.warning( "Cache '%s' not found. 
File '%s' won't be created.", checksum, path_info, ) self.safe_remove(path_info, force=force) failed = path_info if failed or skip: if progress_callback: progress_callback( str(path_info), self.get_files_number(self.path_info, checksum, filter_info), ) if failed: raise CheckoutError([failed]) return logger.debug("Checking out '%s' with cache '%s'.", path_info, checksum) return self._checkout( path_info, checksum, force, progress_callback, relink, filter_info, ) def _checkout( self, path_info, checksum, force=False, progress_callback=None, relink=False, filter_info=None, ): if not self.is_dir_checksum(checksum): return self._checkout_file(path_info, checksum, force, progress_callback, relink) return self._checkout_dir(path_info, checksum, force, progress_callback, relink, filter_info) def get_files_number(self, path_info, checksum, filter_info): from funcy.py3 import ilen if not checksum: return 0 if not self.is_dir_checksum(checksum): return 1 if not filter_info: return len(self.get_dir_cache(checksum)) return ilen( filter_info.isin_or_eq(path_info / entry[self.PARAM_CHECKSUM]) for entry in self.get_dir_cache(checksum)) @staticmethod def unprotect(path_info): pass def _get_unpacked_dir_names(self, checksums): return set() def extract_used_local_checksums(self, named_cache): used = set(named_cache.scheme_keys("local")) unpacked = self._get_unpacked_dir_names(used) return used | unpacked def _changed_unpacked_dir(self, checksum): return True def _update_unpacked_dir(self, checksum): pass def _remove_unpacked_dir(self, checksum): pass
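# Illustrative sketch (hypothetical helper, not part of the original code): the
# decision that cache_exists()/_estimate_cache_size() above make between the
# "traverse" and "object_exists" strategies. The constants mirror the class
# defaults used by this code (LIST_OBJECT_PAGE_SIZE=1000, TRAVERSE_PREFIX_LEN=3,
# TRAVERSE_WEIGHT_MULTIPLIER=5, TRAVERSE_THRESHOLD_SIZE=500000).
def _sketch_pick_strategy(num_checksums, entries_under_000_prefix):
    LIST_OBJECT_PAGE_SIZE = 1000
    TRAVERSE_PREFIX_LEN = 3
    TRAVERSE_WEIGHT_MULTIPLIER = 5
    TRAVERSE_THRESHOLD_SIZE = 500000

    # Size estimation: the cache is assumed to be evenly distributed over
    # checksums, so the entry count under one prefix ("000") multiplied by the
    # number of prefixes approximates the full remote size.
    total_prefixes = 16 ** TRAVERSE_PREFIX_LEN  # 4096 for a 3-char prefix
    remote_size = total_prefixes * max(entries_under_000_prefix, 1)

    # Weight traversal by the number of list pages it would need; very large
    # remotes get an extra penalty for the overhead of huge lists/sets.
    traverse_pages = remote_size / LIST_OBJECT_PAGE_SIZE
    if remote_size > TRAVERSE_THRESHOLD_SIZE:
        traverse_weight = traverse_pages * TRAVERSE_WEIGHT_MULTIPLIER
    else:
        traverse_weight = traverse_pages

    return "object_exists" if num_checksums < traverse_weight else "traverse"

# 30 entries under the "000" prefix estimate a ~122880-file remote (30 * 4096):
# _sketch_pick_strategy(100, 30)  -> "object_exists" (few queries beat listing)
# _sketch_pick_strategy(5000, 30) -> "traverse" (listing ~123 pages is cheaper)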
def __init__(self, root_dir=None, scm=None, rev=None): from dvc.state import State, StateNoop from dvc.lock import make_lock from dvc.scm import SCM from dvc.cache import Cache from dvc.data_cloud import DataCloud from dvc.repo.experiments import Experiments from dvc.repo.metrics import Metrics from dvc.repo.plots import Plots from dvc.repo.params import Params from dvc.tree.local import LocalRemoteTree from dvc.utils.fs import makedirs from dvc.stage.cache import StageCache if scm: tree = scm.get_tree(rev) self.root_dir = self.find_root(root_dir, tree) self.scm = scm self.tree = scm.get_tree(rev, use_dvcignore=True, dvcignore_root=self.root_dir) self.state = StateNoop() else: root_dir = self.find_root(root_dir) self.root_dir = os.path.abspath(os.path.realpath(root_dir)) self.tree = LocalRemoteTree( self, {"url": self.root_dir}, use_dvcignore=True, dvcignore_root=self.root_dir, ) self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR) self.config = Config(self.dvc_dir, tree=self.tree) if not scm: no_scm = self.config["core"].get("no_scm", False) self.scm = SCM(self.root_dir, no_scm=no_scm) self.tmp_dir = os.path.join(self.dvc_dir, "tmp") self.index_dir = os.path.join(self.tmp_dir, "index") makedirs(self.index_dir, exist_ok=True) hardlink_lock = self.config["core"].get("hardlink_lock", False) self.lock = make_lock( os.path.join(self.tmp_dir, "lock"), tmp_dir=self.tmp_dir, hardlink_lock=hardlink_lock, friendly=True, ) self.cache = Cache(self) self.cloud = DataCloud(self) if not scm: # NOTE: storing state and link_state in the repository itself to # avoid any possible state corruption in 'shared cache dir' # scenario. self.state = State(self.cache.local) self.stage_cache = StageCache(self) self.metrics = Metrics(self) self.plots = Plots(self) self.params = Params(self) try: self.experiments = Experiments(self) except NotImplementedError: self.experiments = None self._ignore()
def __init__( self, root_dir=None, scm=None, rev=None, subrepos=False, uninitialized=False, config=None, url=None, repo_factory=None, ): from dvc.cache import Cache from dvc.config import Config from dvc.data_cloud import DataCloud from dvc.fs.local import LocalFileSystem from dvc.lock import LockNoop, make_lock from dvc.repo.live import Live from dvc.repo.metrics import Metrics from dvc.repo.params import Params from dvc.repo.plots import Plots from dvc.repo.stage import StageLoad from dvc.stage.cache import StageCache from dvc.state import State, StateNoop self.url = url self._fs_conf = { "repo_factory": repo_factory, } self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs( root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized) fs_kwargs = {"use_dvcignore": True, "dvcignore_root": self.root_dir} if scm: self._fs = scm.get_fs(rev, **fs_kwargs) else: self._fs = LocalFileSystem(self, {"url": self.root_dir}, **fs_kwargs) self.config = Config(self.dvc_dir, fs=self.fs, config=config) self._uninitialized = uninitialized self._scm = scm # used by RepoFileSystem to determine if it should traverse subrepos self.subrepos = subrepos self.cache = Cache(self) self.cloud = DataCloud(self) self.stage = StageLoad(self) if scm or not self.dvc_dir: self.lock = LockNoop() self.state = StateNoop() else: self.lock = make_lock( os.path.join(self.tmp_dir, "lock"), tmp_dir=self.tmp_dir, hardlink_lock=self.config["core"].get("hardlink_lock", False), friendly=True, ) # NOTE: storing state and link_state in the repository itself to # avoid any possible state corruption in 'shared cache dir' # scenario. self.state = State(self) self.stage_cache = StageCache(self) self._ignore() self.metrics = Metrics(self) self.plots = Plots(self) self.params = Params(self) self.live = Live(self) self.stage_collection_error_handler: Optional[Callable[ [str, Exception], None]] = None self._lock_depth = 0
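# Minimal usage sketch; the assumptions here are not taken from the snippets
# above: that this constructor belongs to dvc.repo.Repo and that the current
# working directory is inside an already-initialized DVC project.
from dvc.repo import Repo

repo = Repo()                   # discovers root_dir and the .dvc directory
print(repo.root_dir)
print(repo.config["core"])      # .dvc/config as loaded in __init__ above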
def state(self):
    from dvc.state import StateNoop

    return self.repo.state if self.repo else StateNoop()
class BaseRemoteTree: scheme = "base" REQUIRES = {} PATH_CLS = URLInfo JOBS = 4 * cpu_count() PARAM_RELPATH = "relpath" CHECKSUM_DIR_SUFFIX = ".dir" HASH_JOBS = max(1, min(4, cpu_count() // 2)) DEFAULT_VERIFY = False LIST_OBJECT_PAGE_SIZE = 1000 TRAVERSE_WEIGHT_MULTIPLIER = 5 TRAVERSE_PREFIX_LEN = 3 TRAVERSE_THRESHOLD_SIZE = 500000 CAN_TRAVERSE = True CACHE_MODE = None SHARED_MODE_MAP = {None: (None, None), "group": (None, None)} CHECKSUM_DIR_SUFFIX = ".dir" state = StateNoop() def __init__(self, repo, config): self.repo = repo self.config = config self._check_requires(config) shared = config.get("shared") self._file_mode, self._dir_mode = self.SHARED_MODE_MAP[shared] self.hash_jobs = (config.get("hash_jobs") or (self.repo and self.repo.config["core"].get("hash_jobs")) or self.HASH_JOBS) self.verify = config.get("verify", self.DEFAULT_VERIFY) @classmethod def get_missing_deps(cls): import importlib missing = [] for package, module in cls.REQUIRES.items(): try: importlib.import_module(module) except ImportError: missing.append(package) return missing def _check_requires(self, config): missing = self.get_missing_deps() if not missing: return url = config.get("url", f"{self.scheme}://") msg = ("URL '{}' is supported but requires these missing " "dependencies: {}. If you have installed dvc using pip, " "choose one of these options to proceed: \n" "\n" " 1) Install specific missing dependencies:\n" " pip install {}\n" " 2) Install dvc package that includes those missing " "dependencies: \n" " pip install 'dvc[{}]'\n" " 3) Install dvc package with all possible " "dependencies included: \n" " pip install 'dvc[all]'\n" "\n" "If you have installed dvc from a binary package and you " "are still seeing this message, please report it to us " "using https://github.com/iterative/dvc/issues. Thank you!" ).format(url, missing, " ".join(missing), self.scheme) raise RemoteMissingDepsError(msg) @classmethod def supported(cls, config): if isinstance(config, (str, bytes)): url = config else: url = config["url"] # NOTE: silently skipping remote, calling code should handle that parsed = urlparse(url) return parsed.scheme == cls.scheme @property def file_mode(self): return self._file_mode @property def dir_mode(self): return self._dir_mode @property def cache(self): return getattr(self.repo.cache, self.scheme) def open(self, path_info, mode="r", encoding=None): if hasattr(self, "_generate_download_url"): get_url = partial(self._generate_download_url, path_info) return open_url(get_url, mode=mode, encoding=encoding) raise RemoteActionNotImplemented("open", self.scheme) def exists(self, path_info): raise NotImplementedError def isdir(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. """ return False def isfile(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. 
""" return True def iscopy(self, path_info): """Check if this file is an independent copy.""" return False # We can't be sure by default def walk_files(self, path_info, **kwargs): """Return a generator with `PathInfo`s to all the files""" raise NotImplementedError def is_empty(self, path_info): return False def remove(self, path_info): raise RemoteActionNotImplemented("remove", self.scheme) def makedirs(self, path_info): """Optional: Implement only if the remote needs to create directories before copying/linking/moving data """ def move(self, from_info, to_info, mode=None): assert mode is None self.copy(from_info, to_info) self.remove(from_info) def copy(self, from_info, to_info): raise RemoteActionNotImplemented("copy", self.scheme) def copy_fobj(self, fobj, to_info): raise RemoteActionNotImplemented("copy_fobj", self.scheme) def symlink(self, from_info, to_info): raise RemoteActionNotImplemented("symlink", self.scheme) def hardlink(self, from_info, to_info): raise RemoteActionNotImplemented("hardlink", self.scheme) def reflink(self, from_info, to_info): raise RemoteActionNotImplemented("reflink", self.scheme) @staticmethod def protect(path_info): pass def is_protected(self, path_info): return False @staticmethod def unprotect(path_info): pass @classmethod def is_dir_hash(cls, hash_): if not hash_: return False return hash_.endswith(cls.CHECKSUM_DIR_SUFFIX) def get_hash(self, path_info, tree=None, **kwargs): assert isinstance(path_info, str) or path_info.scheme == self.scheme if not tree: tree = self if not tree.exists(path_info): return None if tree == self: hash_ = self.state.get(path_info) else: hash_ = None # If we have dir hash in state db, but dir cache file is lost, # then we need to recollect the dir via .get_dir_hash() call below, # see https://github.com/iterative/dvc/issues/2219 for context if (hash_ and self.is_dir_hash(hash_) and not tree.exists(self.cache.hash_to_path_info(hash_))): hash_ = None if hash_: return hash_ if tree.isdir(path_info): hash_ = self.get_dir_hash(path_info, tree, **kwargs) else: hash_ = tree.get_file_hash(path_info) if hash_ and self.exists(path_info): self.state.save(path_info, hash_) return hash_ def get_file_hash(self, path_info): raise NotImplementedError def get_dir_hash(self, path_info, tree, **kwargs): if not self.cache: raise RemoteCacheRequiredError(path_info) dir_info = self._collect_dir(path_info, tree, **kwargs) return self._save_dir_info(dir_info, path_info) def hash_to_path_info(self, hash_): return self.path_info / hash_[0:2] / hash_[2:] def path_to_hash(self, path): parts = self.PATH_CLS(path).parts[-2:] if not (len(parts) == 2 and parts[0] and len(parts[0]) == 2): raise ValueError(f"Bad cache file path '{path}'") return "".join(parts) def save_info(self, path_info, tree=None, **kwargs): return { self.PARAM_CHECKSUM: self.get_hash(path_info, tree=tree, **kwargs) } @staticmethod def _calculate_hashes(file_infos, tree): file_infos = list(file_infos) with Tqdm( total=len(file_infos), unit="md5", desc="Computing file/dir hashes (only done once)", ) as pbar: worker = pbar.wrap_fn(tree.get_file_hash) with ThreadPoolExecutor(max_workers=tree.hash_jobs) as executor: tasks = executor.map(worker, file_infos) hashes = dict(zip(file_infos, tasks)) return hashes def _collect_dir(self, path_info, tree, **kwargs): file_infos = set() for fname in tree.walk_files(path_info, **kwargs): if DvcIgnore.DVCIGNORE_FILE == fname.name: raise DvcIgnoreInCollectedDirError(fname.parent) file_infos.add(fname) hashes = {fi: self.state.get(fi) for fi in file_infos} 
not_in_state = {fi for fi, hash_ in hashes.items() if hash_ is None} new_hashes = self._calculate_hashes(not_in_state, tree) hashes.update(new_hashes) result = [ { self.PARAM_CHECKSUM: hashes[fi], # NOTE: this is lossy transformation: # "hey\there" -> "hey/there" # "hey/there" -> "hey/there" # The latter is fine filename on Windows, which # will transform to dir/file on back transform. # # Yes, this is a BUG, as long as we permit "/" in # filenames on Windows and "\" on Unix self.PARAM_RELPATH: fi.relative_to(path_info).as_posix(), } for fi in file_infos ] # Sorting the list by path to ensure reproducibility return sorted(result, key=itemgetter(self.PARAM_RELPATH)) def _save_dir_info(self, dir_info, path_info): hash_, tmp_info = self._get_dir_info_hash(dir_info) new_info = self.cache.hash_to_path_info(hash_) if self.cache.changed_cache_file(hash_): self.cache.tree.makedirs(new_info.parent) self.cache.tree.move(tmp_info, new_info, mode=self.cache.CACHE_MODE) if self.exists(path_info): self.state.save(path_info, hash_) self.state.save(new_info, hash_) return hash_ def _get_dir_info_hash(self, dir_info): tmp = tempfile.NamedTemporaryFile(delete=False).name with open(tmp, "w+") as fobj: json.dump(dir_info, fobj, sort_keys=True) tree = self.cache.tree from_info = PathInfo(tmp) to_info = tree.path_info / tmp_fname("") tree.upload(from_info, to_info, no_progress_bar=True) hash_ = tree.get_file_hash(to_info) + self.CHECKSUM_DIR_SUFFIX return hash_, to_info def upload(self, from_info, to_info, name=None, no_progress_bar=False): if not hasattr(self, "_upload"): raise RemoteActionNotImplemented("upload", self.scheme) if to_info.scheme != self.scheme: raise NotImplementedError if from_info.scheme != "local": raise NotImplementedError logger.debug("Uploading '%s' to '%s'", from_info, to_info) name = name or from_info.name self._upload( from_info.fspath, to_info, name=name, no_progress_bar=no_progress_bar, ) def download( self, from_info, to_info, name=None, no_progress_bar=False, file_mode=None, dir_mode=None, ): if not hasattr(self, "_download"): raise RemoteActionNotImplemented("download", self.scheme) if from_info.scheme != self.scheme: raise NotImplementedError if to_info.scheme == self.scheme != "local": self.copy(from_info, to_info) return 0 if to_info.scheme != "local": raise NotImplementedError if self.isdir(from_info): return self._download_dir(from_info, to_info, name, no_progress_bar, file_mode, dir_mode) return self._download_file(from_info, to_info, name, no_progress_bar, file_mode, dir_mode) def _download_dir(self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode): from_infos = list(self.walk_files(from_info)) to_infos = (to_info / info.relative_to(from_info) for info in from_infos) with Tqdm( total=len(from_infos), desc="Downloading directory", unit="Files", disable=no_progress_bar, ) as pbar: download_files = pbar.wrap_fn( partial( self._download_file, name=name, no_progress_bar=True, file_mode=file_mode, dir_mode=dir_mode, )) with ThreadPoolExecutor(max_workers=self.JOBS) as executor: futures = [ executor.submit(download_files, from_info, to_info) for from_info, to_info in zip(from_infos, to_infos) ] # NOTE: unlike pulling/fetching cache, where we need to # download everything we can, not raising an error here might # turn very ugly, as the user might think that he has # downloaded a complete directory, while having a partial one, # which might cause unexpected results in his pipeline. 
for future in as_completed(futures): # NOTE: executor won't let us raise until all futures that # it has are finished, so we need to cancel them ourselves # before re-raising. exc = future.exception() if exc: for entry in futures: entry.cancel() raise exc def _download_file(self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode): makedirs(to_info.parent, exist_ok=True, mode=dir_mode) logger.debug("Downloading '%s' to '%s'", from_info, to_info) name = name or to_info.name tmp_file = tmp_fname(to_info) self._download(from_info, tmp_file, name=name, no_progress_bar=no_progress_bar) move(tmp_file, to_info, mode=file_mode) def list_paths(self, prefix=None, progress_callback=None): if prefix: if len(prefix) > 2: path_info = self.path_info / prefix[:2] / prefix[2:] else: path_info = self.path_info / prefix[:2] else: path_info = self.path_info if progress_callback: for file_info in self.walk_files(path_info): progress_callback() yield file_info.path else: yield from self.walk_files(path_info) def list_hashes(self, prefix=None, progress_callback=None): """Iterate over hashes in this tree. If `prefix` is specified, only hashes which begin with `prefix` will be returned. """ for path in self.list_paths(prefix, progress_callback): try: yield self.path_to_hash(path) except ValueError: logger.debug("'%s' doesn't look like a cache file, skipping", path) def all(self, jobs=None, name=None): """Iterate over all hashes in this tree. Hashes will be fetched in parallel threads according to prefix (except for small remotes) and a progress bar will be displayed. """ logger.debug("Fetching all hashes from '{}'".format( name if name else "remote cache")) if not self.CAN_TRAVERSE: return self.list_hashes() remote_size, remote_hashes = self.estimate_remote_size(name=name) return self.list_hashes_traverse(remote_size, remote_hashes, jobs, name) def _hashes_with_limit(self, limit, prefix=None, progress_callback=None): count = 0 for hash_ in self.list_hashes(prefix, progress_callback): yield hash_ count += 1 if count > limit: logger.debug("`list_hashes()` returned max '{}' hashes, " "skipping remaining results".format(limit)) return def _max_estimation_size(self, hashes): # Max remote size allowed for us to use traverse method return max( self.TRAVERSE_THRESHOLD_SIZE, len(hashes) / self.TRAVERSE_WEIGHT_MULTIPLIER * self.LIST_OBJECT_PAGE_SIZE, ) def estimate_remote_size(self, hashes=None, name=None): """Estimate tree size based on number of entries beginning with "00..." prefix. """ prefix = "0" * self.TRAVERSE_PREFIX_LEN total_prefixes = pow(16, self.TRAVERSE_PREFIX_LEN) if hashes: max_hashes = self._max_estimation_size(hashes) else: max_hashes = None with Tqdm( desc="Estimating size of " + (f"cache in '{name}'" if name else "remote cache"), unit="file", ) as pbar: def update(n=1): pbar.update(n * total_prefixes) if max_hashes: hashes = self._hashes_with_limit(max_hashes / total_prefixes, prefix, update) else: hashes = self.list_hashes(prefix, update) remote_hashes = set(hashes) if remote_hashes: remote_size = total_prefixes * len(remote_hashes) else: remote_size = total_prefixes logger.debug(f"Estimated remote size: {remote_size} files") return remote_size, remote_hashes def list_hashes_traverse(self, remote_size, remote_hashes, jobs=None, name=None): """Iterate over all hashes found in this tree. Hashes are fetched in parallel according to prefix, except in cases where the remote size is very small. 
All hashes from the remote (including any from the size estimation step passed via the `remote_hashes` argument) will be returned. NOTE: For large remotes the list of hashes will be very big(e.g. 100M entries, md5 for each is 32 bytes, so ~3200Mb list) and we don't really need all of it at the same time, so it makes sense to use a generator to gradually iterate over it, without keeping all of it in memory. """ num_pages = remote_size / self.LIST_OBJECT_PAGE_SIZE if num_pages < 256 / self.JOBS: # Fetching prefixes in parallel requires at least 255 more # requests, for small enough remotes it will be faster to fetch # entire cache without splitting it into prefixes. # # NOTE: this ends up re-fetching hashes that were already # fetched during remote size estimation traverse_prefixes = [None] initial = 0 else: yield from remote_hashes initial = len(remote_hashes) traverse_prefixes = [f"{i:02x}" for i in range(1, 256)] if self.TRAVERSE_PREFIX_LEN > 2: traverse_prefixes += [ "{0:0{1}x}".format(i, self.TRAVERSE_PREFIX_LEN) for i in range(1, pow(16, self.TRAVERSE_PREFIX_LEN - 2)) ] with Tqdm( desc="Querying " + (f"cache in '{name}'" if name else "remote cache"), total=remote_size, initial=initial, unit="file", ) as pbar: def list_with_update(prefix): return list( self.list_hashes(prefix=prefix, progress_callback=pbar.update)) with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor: in_remote = executor.map( list_with_update, traverse_prefixes, ) yield from itertools.chain.from_iterable(in_remote) def list_hashes_exists(self, hashes, jobs=None, name=None): """Return list of the specified hashes which exist in this tree. Hashes will be queried individually. """ logger.debug("Querying {} hashes via object_exists".format( len(hashes))) with Tqdm( desc="Querying " + ("cache in " + name if name else "remote cache"), total=len(hashes), unit="file", ) as pbar: def exists_with_progress(path_info): ret = self.exists(path_info) pbar.update_msg(str(path_info)) return ret with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor: path_infos = map(self.hash_to_path_info, hashes) in_remote = executor.map(exists_with_progress, path_infos) ret = list(itertools.compress(hashes, in_remote)) return ret def _remove_unpacked_dir(self, hash_): pass
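# Illustrative sketch (hypothetical helper): the prefixes that
# list_hashes_traverse() above queries after size estimation, assuming
# TRAVERSE_PREFIX_LEN = 3. Cache files live at "<first 2 chars>/<rest>" of the
# hash (see hash_to_path_info), so listing by hex prefix maps directly onto
# cache subdirectories. The "00..." entries were already fetched during
# estimation, hence only the finer "001".."00f" prefixes are re-queried,
# alongside the remaining two-character prefixes.
def _sketch_traverse_prefixes(traverse_prefix_len=3):
    prefixes = [f"{i:02x}" for i in range(1, 256)]  # "01" .. "ff"
    if traverse_prefix_len > 2:
        prefixes += [
            "{0:0{1}x}".format(i, traverse_prefix_len)
            for i in range(1, 16 ** (traverse_prefix_len - 2))
        ]  # "001" .. "00f"
    return prefixes

# len(_sketch_traverse_prefixes()) == 255 + 15 == 270 prefixes to query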
class BaseTree: scheme = "base" REQUIRES: ClassVar[Dict[str, str]] = {} PATH_CLS = URLInfo # type: Any JOBS = 4 * cpu_count() CHECKSUM_DIR_SUFFIX = ".dir" HASH_JOBS = max(1, min(4, cpu_count() // 2)) LIST_OBJECT_PAGE_SIZE = 1000 TRAVERSE_WEIGHT_MULTIPLIER = 5 TRAVERSE_PREFIX_LEN = 3 TRAVERSE_THRESHOLD_SIZE = 500000 CAN_TRAVERSE = True # Needed for some providers, and http open() CHUNK_SIZE = 64 * 1024 * 1024 # 64 MiB PARAM_CHECKSUM: ClassVar[Optional[str]] = None state = StateNoop() def __init__(self, repo, config): self.repo = repo self.config = config self._check_requires() self.path_info = None @cached_property def jobs(self): return (self.config.get("jobs") or (self.repo and self.repo.config["core"].get("jobs")) or self.JOBS) @cached_property def hash_jobs(self): return (self.config.get("checksum_jobs") or (self.repo and self.repo.config["core"].get("checksum_jobs")) or self.HASH_JOBS) @classmethod def get_missing_deps(cls): import importlib missing = [] for package, module in cls.REQUIRES.items(): try: importlib.import_module(module) except ImportError: missing.append(package) return missing def _check_requires(self): from ..scheme import Schemes from ..utils import format_link from ..utils.pkg import PKG missing = self.get_missing_deps() if not missing: return url = self.config.get("url", f"{self.scheme}://") scheme = self.scheme if scheme == Schemes.WEBDAVS: scheme = Schemes.WEBDAV by_pkg = { "pip": f"pip install 'dvc[{scheme}]'", "conda": f"conda install -c conda-forge dvc-{scheme}", } cmd = by_pkg.get(PKG) if cmd: link = format_link("https://dvc.org/doc/install") hint = (f"To install dvc with those dependencies, run:\n" "\n" f"\t{cmd}\n" "\n" f"See {link} for more info.") else: link = format_link("https://github.com/iterative/dvc/issues") hint = f"Please report this bug to {link}. Thank you!" raise RemoteMissingDepsError( f"URL '{url}' is supported but requires these missing " f"dependencies: {missing}. {hint}") @classmethod def supported(cls, config): if isinstance(config, (str, bytes)): url = config else: url = config["url"] # NOTE: silently skipping remote, calling code should handle that parsed = urlparse(url) return parsed.scheme == cls.scheme @property def cache(self): return getattr(self.repo.cache, self.scheme) def open(self, path_info, mode: str = "r", encoding: str = None, **kwargs): if hasattr(self, "_generate_download_url"): # pylint:disable=no-member func = self._generate_download_url # type: ignore[attr-defined] get_url = partial(func, path_info) return open_url(get_url, mode=mode, encoding=encoding, **kwargs) raise RemoteActionNotImplemented("open", self.scheme) def exists(self, path_info, use_dvcignore=True) -> bool: raise NotImplementedError # pylint: disable=unused-argument def isdir(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. """ return False def isfile(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. """ return True def isexec(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between executable and non-executable file. """ return False def iscopy(self, path_info): """Check if this file is an independent copy.""" return False # We can't be sure by default def walk_files(self, path_info, **kwargs): """Return a generator with `PathInfo`s to all the files. Optional kwargs: prefix (bool): If true `path_info` will be treated as a prefix rather than directory path. 
""" raise NotImplementedError def is_empty(self, path_info): return False def getsize(self, path_info): return None def remove(self, path_info): raise RemoteActionNotImplemented("remove", self.scheme) def makedirs(self, path_info): """Optional: Implement only if the remote needs to create directories before copying/linking/moving data """ def move(self, from_info, to_info): self.copy(from_info, to_info) self.remove(from_info) def copy(self, from_info, to_info): raise RemoteActionNotImplemented("copy", self.scheme) def symlink(self, from_info, to_info): raise RemoteActionNotImplemented("symlink", self.scheme) def hardlink(self, from_info, to_info): raise RemoteActionNotImplemented("hardlink", self.scheme) def reflink(self, from_info, to_info): raise RemoteActionNotImplemented("reflink", self.scheme) # pylint: enable=unused-argument @classmethod def is_dir_hash(cls, hash_): if not hash_: return False return hash_.endswith(cls.CHECKSUM_DIR_SUFFIX) @use_state def get_hash(self, path_info, **kwargs): assert path_info and (isinstance(path_info, str) or path_info.scheme == self.scheme) if not self.exists(path_info): return None # pylint: disable=assignment-from-none hash_info = self.state.get(path_info) # If we have dir hash in state db, but dir cache file is lost, # then we need to recollect the dir via .get_dir_hash() call below, # see https://github.com/iterative/dvc/issues/2219 for context if (hash_info and hash_info.isdir and not self.cache.tree.exists( self.cache.tree.hash_to_path_info(hash_info.value))): hash_info = None if hash_info: assert hash_info.name == self.PARAM_CHECKSUM if hash_info.isdir: self.cache.set_dir_info(hash_info) return hash_info if self.isdir(path_info): hash_info = self.get_dir_hash(path_info, **kwargs) else: hash_info = self.get_file_hash(path_info) if hash_info and self.exists(path_info): self.state.save(path_info, hash_info) return hash_info def get_file_hash(self, path_info): raise NotImplementedError def hash_to_path_info(self, hash_): return self.path_info / hash_[0:2] / hash_[2:] def _calculate_hashes(self, file_infos): file_infos = list(file_infos) with Tqdm( total=len(file_infos), unit="md5", desc="Computing file/dir hashes (only done once)", ) as pbar: worker = pbar.wrap_fn(self.get_file_hash) with ThreadPoolExecutor(max_workers=self.hash_jobs) as executor: hash_infos = executor.map(worker, file_infos) return dict(zip(file_infos, hash_infos)) def _collect_dir(self, path_info, **kwargs): file_infos = set() for fname in self.walk_files(path_info, **kwargs): if DvcIgnore.DVCIGNORE_FILE == fname.name: raise DvcIgnoreInCollectedDirError(fname.parent) file_infos.add(fname) hash_infos = {fi: self.state.get(fi) for fi in file_infos} not_in_state = {fi for fi, hi in hash_infos.items() if hi is None} new_hash_infos = self._calculate_hashes(not_in_state) hash_infos.update(new_hash_infos) dir_info = DirInfo() for fi, hi in hash_infos.items(): # NOTE: this is lossy transformation: # "hey\there" -> "hey/there" # "hey/there" -> "hey/there" # The latter is fine filename on Windows, which # will transform to dir/file on back transform. 
# # Yes, this is a BUG, as long as we permit "/" in # filenames on Windows and "\" on Unix dir_info.trie[fi.relative_to(path_info).parts] = hi return dir_info @use_state def get_dir_hash(self, path_info, **kwargs): dir_info = self._collect_dir(path_info, **kwargs) hash_info = self.repo.cache.local.save_dir_info(dir_info) hash_info.size = dir_info.size return hash_info def upload( self, from_info, to_info, name=None, no_progress_bar=False, ): if not hasattr(self, "_upload"): raise RemoteActionNotImplemented("upload", self.scheme) if to_info.scheme != self.scheme: raise NotImplementedError if from_info.scheme != "local": raise NotImplementedError logger.debug("Uploading '%s' to '%s'", from_info, to_info) name = name or from_info.name self._upload( # noqa, pylint: disable=no-member from_info.fspath, to_info, name=name, no_progress_bar=no_progress_bar, ) def upload_fobj(self, fobj, to_info, no_progress_bar=False, **pbar_args): if not hasattr(self, "_upload_fobj"): raise RemoteActionNotImplemented("upload_fobj", self.scheme) with Tqdm.wrapattr(fobj, "read", disable=no_progress_bar, bytes=True, **pbar_args) as wrapped: self._upload_fobj(wrapped, to_info) # pylint: disable=no-member def download( self, from_info, to_info, name=None, no_progress_bar=False, jobs=None, **kwargs, ): if not hasattr(self, "_download"): raise RemoteActionNotImplemented("download", self.scheme) if from_info.scheme != self.scheme: raise NotImplementedError if to_info.scheme == self.scheme != "local": self.copy(from_info, to_info) return 0 if to_info.scheme != "local": raise NotImplementedError if self.isdir(from_info): return self._download_dir( from_info, to_info, name, no_progress_bar, jobs, **kwargs, ) return self._download_file( from_info, to_info, name, no_progress_bar, ) def _download_dir( self, from_info, to_info, name, no_progress_bar, jobs, **kwargs, ): from_infos = list(self.walk_files(from_info, **kwargs)) to_infos = (to_info / info.relative_to(from_info) for info in from_infos) with Tqdm( total=len(from_infos), desc="Downloading directory", unit="Files", disable=no_progress_bar, ) as pbar: download_files = pbar.wrap_fn( partial( self._download_file, name=name, no_progress_bar=True, )) max_workers = jobs or self.jobs with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = [ executor.submit(download_files, from_info, to_info) for from_info, to_info in zip(from_infos, to_infos) ] # NOTE: unlike pulling/fetching cache, where we need to # download everything we can, not raising an error here might # turn very ugly, as the user might think that he has # downloaded a complete directory, while having a partial one, # which might cause unexpected results in his pipeline. for future in as_completed(futures): # NOTE: executor won't let us raise until all futures that # it has are finished, so we need to cancel them ourselves # before re-raising. exc = future.exception() if exc: for entry in futures: entry.cancel() raise exc def _download_file( self, from_info, to_info, name, no_progress_bar, ): makedirs(to_info.parent, exist_ok=True) logger.debug("Downloading '%s' to '%s'", from_info, to_info) name = name or to_info.name tmp_file = tmp_fname(to_info) self._download( # noqa, pylint: disable=no-member from_info, tmp_file, name=name, no_progress_bar=no_progress_bar) move(tmp_file, to_info)
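# Tiny demonstration of the lossy relpath transformation noted in
# _collect_dir() above, using pathlib as a stand-in for PathInfo: on Windows
# both "hey\\there" and "hey/there" serialize to the same posix relpath, so the
# original separator cannot be recovered on the way back.
from pathlib import PureWindowsPath

print(PureWindowsPath("hey\\there").as_posix())  # "hey/there"
print(PureWindowsPath("hey/there").as_posix())   # "hey/there"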
class BaseFileSystem: scheme = "base" REQUIRES: ClassVar[Dict[str, str]] = {} PATH_CLS = URLInfo # type: Any JOBS = 4 * cpu_count() CHECKSUM_DIR_SUFFIX = ".dir" HASH_JOBS = max(1, min(4, cpu_count() // 2)) LIST_OBJECT_PAGE_SIZE = 1000 TRAVERSE_WEIGHT_MULTIPLIER = 5 TRAVERSE_PREFIX_LEN = 3 TRAVERSE_THRESHOLD_SIZE = 500000 CAN_TRAVERSE = True # Needed for some providers, and http open() CHUNK_SIZE = 64 * 1024 * 1024 # 64 MiB PARAM_CHECKSUM: ClassVar[Optional[str]] = None DETAIL_FIELDS: FrozenSet[str] = frozenset() state = StateNoop() def __init__(self, repo, config): self.repo = repo self.config = config self._check_requires() self.path_info = None @cached_property def jobs(self): return (self.config.get("jobs") or (self.repo and self.repo.config["core"].get("jobs")) or self.JOBS) @cached_property def hash_jobs(self): return (self.config.get("checksum_jobs") or (self.repo and self.repo.config["core"].get("checksum_jobs")) or self.HASH_JOBS) @classmethod def get_missing_deps(cls): import importlib missing = [] for package, module in cls.REQUIRES.items(): try: importlib.import_module(module) except ImportError: missing.append(package) return missing def _check_requires(self): from ..scheme import Schemes from ..utils import format_link from ..utils.pkg import PKG missing = self.get_missing_deps() if not missing: return url = self.config.get("url", f"{self.scheme}://") scheme = self.scheme if scheme == Schemes.WEBDAVS: scheme = Schemes.WEBDAV by_pkg = { "pip": f"pip install 'dvc[{scheme}]'", "conda": f"conda install -c conda-forge dvc-{scheme}", } cmd = by_pkg.get(PKG) if cmd: link = format_link("https://dvc.org/doc/install") hint = (f"To install dvc with those dependencies, run:\n" "\n" f"\t{cmd}\n" "\n" f"See {link} for more info.") else: link = format_link("https://github.com/iterative/dvc/issues") hint = f"Please report this bug to {link}. Thank you!" raise RemoteMissingDepsError( f"URL '{url}' is supported but requires these missing " f"dependencies: {missing}. {hint}") @property def cache(self): return getattr(self.repo.cache, self.scheme) def open(self, path_info, mode: str = "r", encoding: str = None, **kwargs): if hasattr(self, "_generate_download_url"): # pylint:disable=no-member func = self._generate_download_url # type: ignore[attr-defined] get_url = partial(func, path_info) return open_url(get_url, mode=mode, encoding=encoding, **kwargs) raise RemoteActionNotImplemented("open", self.scheme) def exists(self, path_info, use_dvcignore=True) -> bool: raise NotImplementedError # pylint: disable=unused-argument def isdir(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. """ return False def isfile(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. """ return True def isexec(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between executable and non-executable file. """ return False def iscopy(self, path_info): """Check if this file is an independent copy.""" return False # We can't be sure by default def walk_files(self, path_info, **kwargs): """Return a generator with `PathInfo`s to all the files. Optional kwargs: prefix (bool): If true `path_info` will be treated as a prefix rather than directory path. 
""" raise NotImplementedError def ls(self, path_info, detail=False, **kwargs): raise RemoteActionNotImplemented("ls", self.scheme) def is_empty(self, path_info): return False def info(self, path_info): raise NotImplementedError def getsize(self, path_info): return self.info(path_info).get("size") def remove(self, path_info): raise RemoteActionNotImplemented("remove", self.scheme) def makedirs(self, path_info): """Optional: Implement only if the remote needs to create directories before copying/linking/moving data """ def move(self, from_info, to_info): self.copy(from_info, to_info) self.remove(from_info) def copy(self, from_info, to_info): raise RemoteActionNotImplemented("copy", self.scheme) def symlink(self, from_info, to_info): raise RemoteActionNotImplemented("symlink", self.scheme) def hardlink(self, from_info, to_info): raise RemoteActionNotImplemented("hardlink", self.scheme) def reflink(self, from_info, to_info): raise RemoteActionNotImplemented("reflink", self.scheme) # pylint: enable=unused-argument @classmethod def is_dir_hash(cls, hash_): if not hash_: return False return hash_.endswith(cls.CHECKSUM_DIR_SUFFIX) def upload( self, from_info, to_info, name=None, no_progress_bar=False, ): if not hasattr(self, "_upload"): raise RemoteActionNotImplemented("upload", self.scheme) if to_info.scheme != self.scheme: raise NotImplementedError if from_info.scheme != "local": raise NotImplementedError logger.debug("Uploading '%s' to '%s'", from_info, to_info) name = name or from_info.name self._upload( # noqa, pylint: disable=no-member from_info.fspath, to_info, name=name, no_progress_bar=no_progress_bar, ) def upload_fobj(self, fobj, to_info, no_progress_bar=False, **pbar_args): if not hasattr(self, "_upload_fobj"): raise RemoteActionNotImplemented("upload_fobj", self.scheme) with Tqdm.wrapattr(fobj, "read", disable=no_progress_bar, bytes=True, **pbar_args) as wrapped: self._upload_fobj(wrapped, to_info) # pylint: disable=no-member def download( self, from_info, to_info, name=None, no_progress_bar=False, jobs=None, **kwargs, ): if not hasattr(self, "_download"): raise RemoteActionNotImplemented("download", self.scheme) if from_info.scheme != self.scheme: raise NotImplementedError if to_info.scheme == self.scheme != "local": self.copy(from_info, to_info) return 0 if to_info.scheme != "local": raise NotImplementedError if self.isdir(from_info): return self._download_dir( from_info, to_info, name, no_progress_bar, jobs, **kwargs, ) return self._download_file( from_info, to_info, name, no_progress_bar, ) def _download_dir( self, from_info, to_info, name, no_progress_bar, jobs, **kwargs, ): from_infos = list(self.walk_files(from_info, **kwargs)) to_infos = (to_info / info.relative_to(from_info) for info in from_infos) with Tqdm( total=len(from_infos), desc="Downloading directory", unit="Files", disable=no_progress_bar, ) as pbar: download_files = pbar.wrap_fn( partial( self._download_file, name=name, no_progress_bar=True, )) max_workers = jobs or self.jobs with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = [ executor.submit(download_files, from_info, to_info) for from_info, to_info in zip(from_infos, to_infos) ] # NOTE: unlike pulling/fetching cache, where we need to # download everything we can, not raising an error here might # turn very ugly, as the user might think that he has # downloaded a complete directory, while having a partial one, # which might cause unexpected results in his pipeline. 
for future in as_completed(futures): # NOTE: executor won't let us raise until all futures that # it has are finished, so we need to cancel them ourselves # before re-raising. exc = future.exception() if exc: for entry in futures: entry.cancel() raise exc def _download_file( self, from_info, to_info, name, no_progress_bar, ): makedirs(to_info.parent, exist_ok=True) logger.debug("Downloading '%s' to '%s'", from_info, to_info) name = name or to_info.name tmp_file = tmp_fname(to_info) self._download( # noqa, pylint: disable=no-member from_info, tmp_file, name=name, no_progress_bar=no_progress_bar) move(tmp_file, to_info)
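# Generic sketch of the fail-fast pattern used by _download_dir() above (the
# worker function and items are placeholders). The first failed future cancels
# the remaining ones before re-raising, so a partially downloaded directory is
# reported as an error instead of passing silently.
from concurrent.futures import ThreadPoolExecutor, as_completed

def _sketch_run_all_or_fail(worker, items, max_workers=4):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(worker, item) for item in items]
        for future in as_completed(futures):
            exc = future.exception()
            if exc:
                for entry in futures:
                    entry.cancel()
                raise exc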
def __init__( self, root_dir=None, scm=None, rev=None, subrepos=False, uninitialized=False, ): from dvc.cache import Cache from dvc.data_cloud import DataCloud from dvc.lock import LockNoop, make_lock from dvc.repo.experiments import Experiments from dvc.repo.metrics import Metrics from dvc.repo.params import Params from dvc.repo.plots import Plots from dvc.scm import SCM from dvc.stage.cache import StageCache from dvc.state import State, StateNoop from dvc.tree.local import LocalTree from dvc.utils.fs import makedirs try: tree = scm.get_tree(rev) if rev else None self.root_dir = self.find_root(root_dir, tree) self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR) self.tmp_dir = os.path.join(self.dvc_dir, "tmp") makedirs(self.tmp_dir, exist_ok=True) except NotDvcRepoError: if not uninitialized: raise self.root_dir = SCM(root_dir or os.curdir).root_dir self.dvc_dir = None self.tmp_dir = None tree_kwargs = dict(use_dvcignore=True, dvcignore_root=self.root_dir) if scm: self.tree = scm.get_tree(rev, **tree_kwargs) else: self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs) self.config = Config(self.dvc_dir, tree=self.tree) self._scm = scm # used by RepoTree to determine if it should traverse subrepos self.subrepos = subrepos self.cache = Cache(self) self.cloud = DataCloud(self) if scm or not self.dvc_dir: self.lock = LockNoop() self.state = StateNoop() else: self.lock = make_lock( os.path.join(self.tmp_dir, "lock"), tmp_dir=self.tmp_dir, hardlink_lock=self.config["core"].get("hardlink_lock", False), friendly=True, ) # NOTE: storing state and link_state in the repository itself to # avoid any possible state corruption in 'shared cache dir' # scenario. self.state = State(self) self.stage_cache = StageCache(self) try: self.experiments = Experiments(self) except NotImplementedError: self.experiments = None self._ignore() self.metrics = Metrics(self) self.plots = Plots(self) self.params = Params(self)
class RemoteBASE(object): scheme = "base" path_cls = URLInfo REQUIRES = {} JOBS = 4 * cpu_count() PARAM_RELPATH = "relpath" CHECKSUM_DIR_SUFFIX = ".dir" CHECKSUM_JOBS = max(1, min(4, cpu_count() // 2)) state = StateNoop() def __init__(self, repo, config): self.repo = repo deps_ok = all(self.REQUIRES.values()) if not deps_ok: missing = [k for k, v in self.REQUIRES.items() if v is None] url = config.get(Config.SECTION_REMOTE_URL, "{}://".format(self.scheme)) msg = ("URL '{}' is supported but requires these missing " "dependencies: {}. If you have installed dvc using pip, " "choose one of these options to proceed: \n" "\n" " 1) Install specific missing dependencies:\n" " pip install {}\n" " 2) Install dvc package that includes those missing " "dependencies: \n" " pip install dvc[{}]\n" " 3) Install dvc package with all possible " "dependencies included: \n" " pip install dvc[all]\n" "\n" "If you have installed dvc from a binary package and you " "are still seeing this message, please report it to us " "using https://github.com/iterative/dvc/issues. Thank you!" ).format(url, missing, " ".join(missing), self.scheme) raise RemoteMissingDepsError(msg) core = config.get(Config.SECTION_CORE, {}) self.checksum_jobs = core.get(Config.SECTION_CORE_CHECKSUM_JOBS, self.CHECKSUM_JOBS) self.protected = False self.no_traverse = config.get(Config.SECTION_REMOTE_NO_TRAVERSE) self._dir_info = {} def __repr__(self): return "{class_name}: '{path_info}'".format( class_name=type(self).__name__, path_info=self.path_info or "No path", ) @classmethod def supported(cls, config): if isinstance(config, basestring): url = config else: url = config[Config.SECTION_REMOTE_URL] # NOTE: silently skipping remote, calling code should handle that parsed = urlparse(url) return parsed.scheme == cls.scheme @property def cache(self): return getattr(self.repo.cache, self.scheme) def get_file_checksum(self, path_info): raise NotImplementedError def _calculate_checksums(self, file_infos): file_infos = list(file_infos) with ThreadPoolExecutor(max_workers=self.checksum_jobs) as executor: tasks = executor.map(self.get_file_checksum, file_infos) if len(file_infos) > LARGE_DIR_SIZE: msg = ("Computing md5 for a large number of files. " "This is only done once.") logger.info(msg) tasks = progress(tasks, total=len(file_infos)) checksums = { file_infos[index]: task for index, task in enumerate(tasks) } return checksums def _collect_dir(self, path_info): file_infos = set() for root, _dirs, files in self.walk(path_info): if DvcIgnore.DVCIGNORE_FILE in files: raise DvcIgnoreInCollectedDirError(root) file_infos.update(path_info / root / fname for fname in files) checksums = {fi: self.state.get(fi) for fi in file_infos} not_in_state = { fi for fi, checksum in checksums.items() if checksum is None } new_checksums = self._calculate_checksums(not_in_state) checksums.update(new_checksums) result = [ { self.PARAM_CHECKSUM: checksums[fi], # NOTE: this is lossy transformation: # "hey\there" -> "hey/there" # "hey/there" -> "hey/there" # The latter is fine filename on Windows, which # will transform to dir/file on back transform. 
# # Yes, this is a BUG, as long as we permit "/" in # filenames on Windows and "\" on Unix self.PARAM_RELPATH: fi.relative_to(path_info).as_posix(), } for fi in file_infos ] # Sorting the list by path to ensure reproducibility return sorted(result, key=itemgetter(self.PARAM_RELPATH)) def get_dir_checksum(self, path_info): dir_info = self._collect_dir(path_info) checksum, tmp_info = self._get_dir_info_checksum(dir_info) new_info = self.cache.checksum_to_path_info(checksum) if self.cache.changed_cache_file(checksum): self.cache.makedirs(new_info.parent) self.cache.move(tmp_info, new_info) self.state.save(path_info, checksum) self.state.save(new_info, checksum) return checksum def _get_dir_info_checksum(self, dir_info): tmp = tempfile.NamedTemporaryFile(delete=False).name with open(tmp, "w+") as fobj: json.dump(dir_info, fobj, sort_keys=True) from_info = PathInfo(tmp) to_info = self.cache.path_info / tmp_fname("") self.cache.upload(from_info, to_info, no_progress_bar=True) checksum = self.get_file_checksum(to_info) + self.CHECKSUM_DIR_SUFFIX return checksum, to_info def get_dir_cache(self, checksum): assert checksum dir_info = self._dir_info.get(checksum) if dir_info: return dir_info dir_info = self.load_dir_cache(checksum) self._dir_info[checksum] = dir_info return dir_info def load_dir_cache(self, checksum): path_info = self.checksum_to_path_info(checksum) fobj = tempfile.NamedTemporaryFile(delete=False) path = fobj.name to_info = PathInfo(path) self.cache.download(path_info, to_info, no_progress_bar=True) try: with open(path, "r") as fobj: d = json.load(fobj) except ValueError: logger.exception("Failed to load dir cache '{}'".format(path_info)) return [] finally: os.unlink(path) if not isinstance(d, list): msg = "dir cache file format error '{}' [skipping the file]" logger.error(msg.format(relpath(path))) return [] for info in d: # NOTE: here is a BUG, see comment to .as_posix() below relative_path = PathInfo.from_posix(info[self.PARAM_RELPATH]) info[self.PARAM_RELPATH] = relative_path.fspath return d @classmethod def is_dir_checksum(cls, checksum): return checksum.endswith(cls.CHECKSUM_DIR_SUFFIX) def get_checksum(self, path_info): if not self.exists(path_info): return None checksum = self.state.get(path_info) # If we have dir checksum in state db, but dir cache file is lost, # then we need to recollect the dir via .get_dir_checksum() call below, # see https://github.com/iterative/dvc/issues/2219 for context if (checksum and self.is_dir_checksum(checksum) and not self.exists(self.cache.checksum_to_path_info(checksum))): checksum = None if checksum: return checksum if self.isdir(path_info): checksum = self.get_dir_checksum(path_info) else: checksum = self.get_file_checksum(path_info) if checksum: self.state.save(path_info, checksum) return checksum def save_info(self, path_info): assert path_info.scheme == self.scheme return {self.PARAM_CHECKSUM: self.get_checksum(path_info)} def changed(self, path_info, checksum_info): """Checks if data has changed. A file is considered changed if: - It doesn't exist on the working directory (was unlinked) - Checksum is not computed (saving a new file) - The checkusm stored in the State is different from the given one - There's no file in the cache Args: path_info: dict with path information. checksum: expected checksum for this data. Returns: bool: True if data has changed, False otherwise. 
""" logger.debug("checking if '{}'('{}') has changed.".format( path_info, checksum_info)) if not self.exists(path_info): logger.debug("'{}' doesn't exist.".format(path_info)) return True checksum = checksum_info.get(self.PARAM_CHECKSUM) if checksum is None: logger.debug("checksum for '{}' is missing.".format(path_info)) return True if self.changed_cache(checksum): logger.debug("cache for '{}'('{}') has changed.".format( path_info, checksum)) return True actual = self.save_info(path_info)[self.PARAM_CHECKSUM] if checksum != actual: logger.debug( "checksum '{}'(actual '{}') for '{}' has changed.".format( checksum, actual, path_info)) return True logger.debug("'{}' hasn't changed.".format(path_info)) return False def link(self, from_info, to_info): self.copy(from_info, to_info) def _save_file(self, path_info, checksum, save_link=True): assert checksum cache_info = self.checksum_to_path_info(checksum) if self.changed_cache(checksum): self.move(path_info, cache_info) else: self.remove(path_info) self.link(cache_info, path_info) if save_link: self.state.save_link(path_info) # we need to update path and cache, since in case of reflink, # or copy cache type moving original file results in updates on # next executed command, which causes md5 recalculation self.state.save(path_info, checksum) self.state.save(cache_info, checksum) def _save_dir(self, path_info, checksum): cache_info = self.checksum_to_path_info(checksum) dir_info = self.get_dir_cache(checksum) for entry in dir_info: entry_info = path_info / entry[self.PARAM_RELPATH] entry_checksum = entry[self.PARAM_CHECKSUM] self._save_file(entry_info, entry_checksum, save_link=False) self.state.save_link(path_info) self.state.save(cache_info, checksum) self.state.save(path_info, checksum) def is_empty(self, path_info): return False def isfile(self, path_info): raise NotImplementedError def isdir(self, path_info): return False def walk(self, path_info): raise NotImplementedError @staticmethod def protect(path_info): pass def save(self, path_info, checksum_info): if path_info.scheme != self.scheme: raise RemoteActionNotImplemented( "save {} -> {}".format(path_info.scheme, self.scheme), self.scheme, ) checksum = checksum_info[self.PARAM_CHECKSUM] if not self.changed_cache(checksum): self._checkout(path_info, checksum) return self._save(path_info, checksum) def _save(self, path_info, checksum): to_info = self.checksum_to_path_info(checksum) logger.info("Saving '{}' to '{}'.".format(path_info, to_info)) if self.isdir(path_info): self._save_dir(path_info, checksum) return self._save_file(path_info, checksum) def upload(self, from_info, to_info, name=None, no_progress_bar=False): if not hasattr(self, "_upload"): raise RemoteActionNotImplemented("upload", self.scheme) if to_info.scheme != self.scheme: raise NotImplementedError if from_info.scheme != "local": raise NotImplementedError logger.debug("Uploading '{}' to '{}'".format(from_info, to_info)) name = name or from_info.name if not no_progress_bar: progress.update_target(name, 0, None) try: self._upload( from_info.fspath, to_info, name=name, no_progress_bar=no_progress_bar, ) except Exception: msg = "failed to upload '{}' to '{}'" logger.exception(msg.format(from_info, to_info)) return 1 # 1 fail if not no_progress_bar: progress.finish_target(name) return 0 def download( self, from_info, to_info, name=None, no_progress_bar=False, file_mode=None, dir_mode=None, ): if not hasattr(self, "_download"): raise RemoteActionNotImplemented("download", self.scheme) if from_info.scheme != self.scheme: raise 
NotImplementedError if to_info.scheme == self.scheme != "local": self.copy(from_info, to_info) return 0 if to_info.scheme != "local": raise NotImplementedError logger.debug("Downloading '{}' to '{}'".format(from_info, to_info)) name = name or to_info.name if not no_progress_bar: # real progress is not always available, # lets at least show start and finish progress.update_target(name, 0, None) makedirs(to_info.parent, exist_ok=True, mode=dir_mode) tmp_file = tmp_fname(to_info) try: self._download(from_info, tmp_file, name=name, no_progress_bar=no_progress_bar) except Exception: msg = "failed to download '{}' to '{}'" logger.exception(msg.format(from_info, to_info)) return 1 # 1 fail move(tmp_file, to_info, mode=file_mode) if not no_progress_bar: progress.finish_target(name) return 0 def open(self, path_info, mode="r", encoding=None): raise RemoteActionNotImplemented("open", self.scheme) def remove(self, path_info): raise RemoteActionNotImplemented("remove", self.scheme) def move(self, from_info, to_info): self.copy(from_info, to_info) self.remove(from_info) def copy(self, from_info, to_info): raise RemoteActionNotImplemented("copy", self.scheme) def exists(self, path_info): raise NotImplementedError def path_to_checksum(self, path): return "".join(self.path_cls(path).parts[-2:]) def checksum_to_path_info(self, checksum): return self.path_info / checksum[0:2] / checksum[2:] def list_cache_paths(self): raise NotImplementedError def all(self): # NOTE: The list might be way too big(e.g. 100M entries, md5 for each # is 32 bytes, so ~3200Mb list) and we don't really need all of it at # the same time, so it makes sense to use a generator to gradually # iterate over it, without keeping all of it in memory. return (self.path_to_checksum(path) for path in self.list_cache_paths()) def gc(self, cinfos): used = self.extract_used_local_checksums(cinfos) if self.scheme != "": used |= { info[self.PARAM_CHECKSUM] for info in cinfos.get(self.scheme, []) } removed = False for checksum in self.all(): if checksum in used: continue path_info = self.checksum_to_path_info(checksum) self.remove(path_info) removed = True return removed def changed_cache_file(self, checksum): """Compare the given checksum with the (corresponding) actual one. - Use `State` as a cache for computed checksums + The entries are invalidated by taking into account the following: * mtime * inode * size * checksum - Remove the file from cache if it doesn't match the actual checksum """ cache_info = self.checksum_to_path_info(checksum) actual = self.get_checksum(cache_info) logger.debug("cache '{}' expected '{}' actual '{}'".format( str(cache_info), checksum, actual)) if not checksum or not actual: return True if actual.split(".")[0] == checksum.split(".")[0]: return False if self.exists(cache_info): logger.warning("corrupted cache file '{}'.".format(cache_info)) self.remove(cache_info) return True def _changed_dir_cache(self, checksum): if self.changed_cache_file(checksum): return True if not self._changed_unpacked_dir(checksum): return False for entry in self.get_dir_cache(checksum): entry_checksum = entry[self.PARAM_CHECKSUM] if self.changed_cache_file(entry_checksum): return True self._update_unpacked_dir(checksum) return False def changed_cache(self, checksum): if self.is_dir_checksum(checksum): return self._changed_dir_cache(checksum) return self.changed_cache_file(checksum) def cache_exists(self, checksums, jobs=None): """Check if the given checksums are stored in the remote. 
There are two ways of performing this check: - Traverse: Get a list of all the files in the remote (traversing the cache directory) and compare it with the given checksums. - No traverse: For each given checksum, run the `exists` method and filter the checksums that aren't on the remote. This is done in parallel threads. It also shows a progress bar when performing the check. The reason for such an odd logic is that most of the remotes take much shorter time to just retrieve everything they have under a certain prefix (e.g. s3, gs, ssh, hdfs). Other remotes that can check if particular file exists much quicker, use their own implementation of cache_exists (see http, local). Returns: A list with checksums that were found in the remote """ progress_callback = ProgressCallback(len(checksums)) def exists_with_progress(chunks): return self.batch_exists(chunks, callback=progress_callback) if self.no_traverse and hasattr(self, "batch_exists"): with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor: path_infos = [self.checksum_to_path_info(x) for x in checksums] chunks = to_chunks(path_infos, num_chunks=self.JOBS) results = executor.map(exists_with_progress, chunks) in_remote = itertools.chain.from_iterable(results) ret = list(itertools.compress(checksums, in_remote)) progress_callback.finish("") return ret return list(set(checksums) & set(self.all())) def already_cached(self, path_info): current = self.get_checksum(path_info) if not current: return False return not self.changed_cache(current) def safe_remove(self, path_info, force=False): if not self.exists(path_info): return if not force and not self.already_cached(path_info): msg = ("file '{}' is going to be removed." " Are you sure you want to proceed?".format(str(path_info))) if not prompt.confirm(msg): raise ConfirmRemoveError(str(path_info)) self.remove(path_info) def _checkout_file(self, path_info, checksum, force, progress_callback=None): cache_info = self.checksum_to_path_info(checksum) if self.exists(path_info): msg = "data '{}' exists. Removing before checkout." 
logger.warning(msg.format(str(path_info))) self.safe_remove(path_info, force=force) self.link(cache_info, path_info) self.state.save_link(path_info) self.state.save(path_info, checksum) if progress_callback: progress_callback.update(str(path_info)) def makedirs(self, path_info): raise NotImplementedError def _checkout_dir(self, path_info, checksum, force, progress_callback=None): # Create dir separately so that dir is created # even if there are no files in it if not self.exists(path_info): self.makedirs(path_info) dir_info = self.get_dir_cache(checksum) logger.debug("Linking directory '{}'.".format(path_info)) for entry in dir_info: relative_path = entry[self.PARAM_RELPATH] entry_checksum = entry[self.PARAM_CHECKSUM] entry_cache_info = self.checksum_to_path_info(entry_checksum) entry_info = path_info / relative_path entry_checksum_info = {self.PARAM_CHECKSUM: entry_checksum} if self.changed(entry_info, entry_checksum_info): if self.exists(entry_info): self.safe_remove(entry_info, force=force) self.link(entry_cache_info, entry_info) self.state.save(entry_info, entry_checksum) if progress_callback: progress_callback.update(str(entry_info)) self._remove_redundant_files(path_info, dir_info, force) self.state.save_link(path_info) self.state.save(path_info, checksum) def _remove_redundant_files(self, path_info, dir_info, force): existing_files = set(path_info / root / fname for root, _, files in self.walk(path_info) for fname in files) needed_files = { path_info / entry[self.PARAM_RELPATH] for entry in dir_info } for path in existing_files - needed_files: self.safe_remove(path, force) def checkout(self, path_info, checksum_info, force=False, progress_callback=None): if path_info.scheme not in ["local", self.scheme]: raise NotImplementedError checksum = checksum_info.get(self.PARAM_CHECKSUM) if not checksum: logger.warning("No checksum info found for '{}'. " "It won't be created.".format(str(path_info))) self.safe_remove(path_info, force=force) return if not self.changed(path_info, checksum_info): msg = "Data '{}' didn't change." logger.debug(msg.format(str(path_info))) return if self.changed_cache(checksum): msg = "Cache '{}' not found. File '{}' won't be created." logger.warning(msg.format(checksum, str(path_info))) self.safe_remove(path_info, force=force) return msg = "Checking out '{}' with cache '{}'." logger.debug(msg.format(str(path_info), checksum)) self._checkout(path_info, checksum, force, progress_callback) def _checkout(self, path_info, checksum, force=False, progress_callback=None): if not self.is_dir_checksum(checksum): return self._checkout_file(path_info, checksum, force, progress_callback=progress_callback) return self._checkout_dir(path_info, checksum, force, progress_callback=progress_callback) @staticmethod def unprotect(path_info): pass def _get_unpacked_dir_names(self, checksums): return set() def extract_used_local_checksums(self, cinfos): from dvc.remote import RemoteLOCAL used = {info[RemoteLOCAL.PARAM_CHECKSUM] for info in cinfos["local"]} unpacked = self._get_unpacked_dir_names(used) return used | unpacked def _changed_unpacked_dir(self, checksum): return True def _update_unpacked_dir(self, checksum): pass
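# --- Illustrative aside (not part of the original source) ---------------------
# The path_to_checksum()/checksum_to_path_info() pair above defines the
# content-addressed cache layout: the first two characters of the checksum
# become a subdirectory and the remainder becomes the file name. Directory
# objects use the same layout and are distinguished only by the ".dir" suffix
# on their checksum (see is_dir_checksum()). A standalone sketch of the round
# trip; checksum_to_cache_path/cache_path_to_checksum and the sample cache_dir
# are hypothetical helpers, not DVC API:
import hashlib
import os


def checksum_to_cache_path(cache_dir, checksum):
    # Mirrors checksum_to_path_info(): "d41d8cd9..." -> <cache_dir>/d4/1d8cd9...
    return os.path.join(cache_dir, checksum[:2], checksum[2:])


def cache_path_to_checksum(path):
    # Mirrors path_to_checksum(): join the last two path components back into
    # the original hex digest.
    parts = os.path.normpath(path).split(os.sep)
    return "".join(parts[-2:])


md5 = hashlib.md5(b"example content").hexdigest()
cache_path = checksum_to_cache_path("/tmp/dvc-cache", md5)
assert cache_path_to_checksum(cache_path) == md5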
class BaseTree: scheme = "base" REQUIRES = {} PATH_CLS = URLInfo JOBS = 4 * cpu_count() CHECKSUM_DIR_SUFFIX = ".dir" HASH_JOBS = max(1, min(4, cpu_count() // 2)) DEFAULT_VERIFY = False LIST_OBJECT_PAGE_SIZE = 1000 TRAVERSE_WEIGHT_MULTIPLIER = 5 TRAVERSE_PREFIX_LEN = 3 TRAVERSE_THRESHOLD_SIZE = 500000 CAN_TRAVERSE = True CACHE_MODE = None SHARED_MODE_MAP = {None: (None, None), "group": (None, None)} PARAM_CHECKSUM = None state = StateNoop() def __init__(self, repo, config): self.repo = repo self.config = config self._check_requires(config) shared = config.get("shared") self._file_mode, self._dir_mode = self.SHARED_MODE_MAP[shared] self.verify = config.get("verify", self.DEFAULT_VERIFY) self.path_info = None @cached_property def jobs(self): return (self.config.get("jobs") or (self.repo and self.repo.config["core"].get("jobs")) or self.JOBS) @cached_property def hash_jobs(self): return (self.config.get("checksum_jobs") or (self.repo and self.repo.config["core"].get("checksum_jobs")) or self.HASH_JOBS) @classmethod def get_missing_deps(cls): import importlib missing = [] for package, module in cls.REQUIRES.items(): try: importlib.import_module(module) except ImportError: missing.append(package) return missing def _check_requires(self, config): missing = self.get_missing_deps() if not missing: return url = config.get("url", f"{self.scheme}://") msg = ("URL '{}' is supported but requires these missing " "dependencies: {}. If you have installed dvc using pip, " "choose one of these options to proceed: \n" "\n" " 1) Install specific missing dependencies:\n" " pip install {}\n" " 2) Install dvc package that includes those missing " "dependencies: \n" " pip install 'dvc[{}]'\n" " 3) Install dvc package with all possible " "dependencies included: \n" " pip install 'dvc[all]'\n" "\n" "If you have installed dvc from a binary package and you " "are still seeing this message, please report it to us " "using https://github.com/iterative/dvc/issues. Thank you!" ).format(url, missing, " ".join(missing), self.scheme) raise RemoteMissingDepsError(msg) @classmethod def supported(cls, config): if isinstance(config, (str, bytes)): url = config else: url = config["url"] # NOTE: silently skipping remote, calling code should handle that parsed = urlparse(url) return parsed.scheme == cls.scheme @property def file_mode(self): return self._file_mode @property def dir_mode(self): return self._dir_mode @property def cache(self): return getattr(self.repo.cache, self.scheme) def open(self, path_info, mode="r", encoding=None): if hasattr(self, "_generate_download_url"): func = self._generate_download_url # noqa,pylint:disable=no-member get_url = partial(func, path_info) return open_url(get_url, mode=mode, encoding=encoding) raise RemoteActionNotImplemented("open", self.scheme) def exists(self, path_info, use_dvcignore=True): raise NotImplementedError # pylint: disable=unused-argument def isdir(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. """ return False def isfile(self, path_info): """Optional: Overwrite only if the remote has a way to distinguish between a directory and a file. """ return True def iscopy(self, path_info): """Check if this file is an independent copy.""" return False # We can't be sure by default def walk_files(self, path_info, **kwargs): """Return a generator with `PathInfo`s to all the files. Optional kwargs: prefix (bool): If true `path_info` will be treated as a prefix rather than directory path. 
""" raise NotImplementedError def is_empty(self, path_info): return False def remove(self, path_info): raise RemoteActionNotImplemented("remove", self.scheme) def makedirs(self, path_info): """Optional: Implement only if the remote needs to create directories before copying/linking/moving data """ def move(self, from_info, to_info, mode=None): assert mode is None self.copy(from_info, to_info) self.remove(from_info) def copy(self, from_info, to_info): raise RemoteActionNotImplemented("copy", self.scheme) def copy_fobj(self, fobj, to_info): raise RemoteActionNotImplemented("copy_fobj", self.scheme) def symlink(self, from_info, to_info): raise RemoteActionNotImplemented("symlink", self.scheme) def hardlink(self, from_info, to_info): raise RemoteActionNotImplemented("hardlink", self.scheme) def reflink(self, from_info, to_info): raise RemoteActionNotImplemented("reflink", self.scheme) @staticmethod def protect(path_info): pass def is_protected(self, path_info): return False # pylint: enable=unused-argument @staticmethod def unprotect(path_info): pass @classmethod def is_dir_hash(cls, hash_): if not hash_: return False return hash_.endswith(cls.CHECKSUM_DIR_SUFFIX) @use_state def get_hash(self, path_info, **kwargs): assert path_info and (isinstance(path_info, str) or path_info.scheme == self.scheme) if not self.exists(path_info): return None # pylint: disable=assignment-from-none hash_info = self.state.get(path_info) # If we have dir hash in state db, but dir cache file is lost, # then we need to recollect the dir via .get_dir_hash() call below, # see https://github.com/iterative/dvc/issues/2219 for context if (hash_info and hash_info.isdir and not self.cache.tree.exists( self.cache.tree.hash_to_path_info(hash_info.value))): hash_info = None if hash_info: assert hash_info.name == self.PARAM_CHECKSUM if hash_info.isdir: self.cache.set_dir_info(hash_info) return hash_info if self.isdir(path_info): hash_info = self.get_dir_hash(path_info, **kwargs) else: hash_info = self.get_file_hash(path_info) if hash_info and self.exists(path_info): self.state.save(path_info, hash_info) return hash_info def get_file_hash(self, path_info): raise NotImplementedError def hash_to_path_info(self, hash_): return self.path_info / hash_[0:2] / hash_[2:] def _calculate_hashes(self, file_infos): file_infos = list(file_infos) with Tqdm( total=len(file_infos), unit="md5", desc="Computing file/dir hashes (only done once)", ) as pbar: worker = pbar.wrap_fn(self.get_file_hash) with ThreadPoolExecutor(max_workers=self.hash_jobs) as executor: hash_infos = executor.map(worker, file_infos) return dict(zip(file_infos, hash_infos)) def _collect_dir(self, path_info, **kwargs): file_infos = set() for fname in self.walk_files(path_info, **kwargs): if DvcIgnore.DVCIGNORE_FILE == fname.name: raise DvcIgnoreInCollectedDirError(fname.parent) file_infos.add(fname) hash_infos = {fi: self.state.get(fi) for fi in file_infos} not_in_state = {fi for fi, hi in hash_infos.items() if hi is None} new_hash_infos = self._calculate_hashes(not_in_state) hash_infos.update(new_hash_infos) dir_info = DirInfo() for fi, hi in hash_infos.items(): # NOTE: this is lossy transformation: # "hey\there" -> "hey/there" # "hey/there" -> "hey/there" # The latter is fine filename on Windows, which # will transform to dir/file on back transform. 
# # Yes, this is a BUG, as long as we permit "/" in # filenames on Windows and "\" on Unix dir_info.trie[fi.relative_to(path_info).parts] = hi return dir_info @use_state def get_dir_hash(self, path_info, **kwargs): dir_info = self._collect_dir(path_info, **kwargs) hash_info = self.repo.cache.local.save_dir_info(dir_info) hash_info.size = dir_info.size return hash_info def upload(self, from_info, to_info, name=None, no_progress_bar=False): if not hasattr(self, "_upload"): raise RemoteActionNotImplemented("upload", self.scheme) if to_info.scheme != self.scheme: raise NotImplementedError if from_info.scheme != "local": raise NotImplementedError logger.debug("Uploading '%s' to '%s'", from_info, to_info) name = name or from_info.name self._upload( # noqa, pylint: disable=no-member from_info.fspath, to_info, name=name, no_progress_bar=no_progress_bar, ) def download( self, from_info, to_info, name=None, no_progress_bar=False, file_mode=None, dir_mode=None, ): if not hasattr(self, "_download"): raise RemoteActionNotImplemented("download", self.scheme) if from_info.scheme != self.scheme: raise NotImplementedError if to_info.scheme == self.scheme != "local": self.copy(from_info, to_info) return 0 if to_info.scheme != "local": raise NotImplementedError if self.isdir(from_info): return self._download_dir(from_info, to_info, name, no_progress_bar, file_mode, dir_mode) return self._download_file(from_info, to_info, name, no_progress_bar, file_mode, dir_mode) def _download_dir(self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode): from_infos = list(self.walk_files(from_info)) to_infos = (to_info / info.relative_to(from_info) for info in from_infos) with Tqdm( total=len(from_infos), desc="Downloading directory", unit="Files", disable=no_progress_bar, ) as pbar: download_files = pbar.wrap_fn( partial( self._download_file, name=name, no_progress_bar=True, file_mode=file_mode, dir_mode=dir_mode, )) with ThreadPoolExecutor(max_workers=self.jobs) as executor: futures = [ executor.submit(download_files, from_info, to_info) for from_info, to_info in zip(from_infos, to_infos) ] # NOTE: unlike pulling/fetching cache, where we need to # download everything we can, not raising an error here might # turn very ugly, as the user might think that he has # downloaded a complete directory, while having a partial one, # which might cause unexpected results in his pipeline. for future in as_completed(futures): # NOTE: executor won't let us raise until all futures that # it has are finished, so we need to cancel them ourselves # before re-raising. exc = future.exception() if exc: for entry in futures: entry.cancel() raise exc def _download_file(self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode): makedirs(to_info.parent, exist_ok=True, mode=dir_mode) logger.debug("Downloading '%s' to '%s'", from_info, to_info) name = name or to_info.name tmp_file = tmp_fname(to_info) self._download( # noqa, pylint: disable=no-member from_info, tmp_file, name=name, no_progress_bar=no_progress_bar) move(tmp_file, to_info, mode=file_mode)
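# --- Illustrative aside (not part of the original source) ---------------------
# _download_dir() above fans the per-file transfers out to a thread pool and
# cancels the outstanding futures as soon as one of them fails, so a partially
# downloaded directory surfaces as an error instead of a silent success.
# A minimal standalone sketch of that cancel-on-first-error pattern;
# download_all/download_one and the shutil-based demo are hypothetical
# stand-ins, not DVC API:
import os
import shutil
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed


def download_all(pairs, download_one, jobs=4):
    # `pairs` is an iterable of (src, dst); download_one(src, dst) performs a
    # single transfer. The first failure cancels whatever has not started yet
    # and is then re-raised, mirroring the loop in _download_dir().
    with ThreadPoolExecutor(max_workers=jobs) as executor:
        futures = [executor.submit(download_one, src, dst) for src, dst in pairs]
        for future in as_completed(futures):
            exc = future.exception()
            if exc:
                for entry in futures:
                    entry.cancel()
                raise exc


src_dir, dst_dir = tempfile.mkdtemp(), tempfile.mkdtemp()
for name in ("a.txt", "b.txt"):
    with open(os.path.join(src_dir, name), "w") as fobj:
        fobj.write(name)
download_all(
    [(os.path.join(src_dir, n), os.path.join(dst_dir, n)) for n in ("a.txt", "b.txt")],
    shutil.copyfile,
)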
def __init__( self, root_dir=None, fs=None, rev=None, subrepos=False, uninitialized=False, config=None, url=None, repo_factory=None, ): from dvc.config import Config from dvc.data.db import ODBManager from dvc.data_cloud import DataCloud from dvc.fs.git import GitFileSystem from dvc.fs.local import localfs from dvc.lock import LockNoop, make_lock from dvc.repo.live import Live from dvc.repo.metrics import Metrics from dvc.repo.params import Params from dvc.repo.plots import Plots from dvc.repo.stage import StageLoad from dvc.scm import SCM from dvc.stage.cache import StageCache from dvc.state import State, StateNoop self.url = url self._fs_conf = {"repo_factory": repo_factory} self._fs = fs or localfs self._scm = None if rev and not fs: self._scm = SCM(root_dir or os.curdir) self._fs = GitFileSystem(scm=self._scm, rev=rev) self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs( root_dir=root_dir, fs=self.fs, uninitialized=uninitialized) self.config = Config(self.dvc_dir, fs=self.fs, config=config) self._uninitialized = uninitialized # used by RepoFileSystem to determine if it should traverse subrepos self.subrepos = subrepos self.cloud = DataCloud(self) self.stage = StageLoad(self) if isinstance(self.fs, GitFileSystem) or not self.dvc_dir: self.lock = LockNoop() self.state = StateNoop() self.odb = ODBManager(self) else: self.lock = make_lock( os.path.join(self.tmp_dir, "lock"), tmp_dir=self.tmp_dir, hardlink_lock=self.config["core"].get("hardlink_lock", False), friendly=True, ) state_db_dir = self._get_database_dir("state") self.state = State(self.root_dir, state_db_dir, self.dvcignore) self.odb = ODBManager(self) self.stage_cache = StageCache(self) self._ignore() self.metrics = Metrics(self) self.plots = Plots(self) self.params = Params(self) self.live = Live(self) self.stage_collection_error_handler: Optional[Callable[ [str, Exception], None]] = None self._lock_depth = 0
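# --- Illustrative aside (not part of the original source) ---------------------
# Hedged usage sketch for the constructor above; exact module paths and
# behaviour depend on the DVC release this snippet was taken from, and both
# calls assume the current directory is an initialized git+DVC repository.
# When `rev` is passed and no `fs` is given, the repo is backed by a
# GitFileSystem, so the branch above falls back to LockNoop()/StateNoop();
# without `rev` it works on the working tree with a real State and file lock.
from dvc.repo import Repo

repo = Repo(".")                      # working tree: State + lock under tmp_dir
print(repo.root_dir)

repo_at_rev = Repo(".", rev="HEAD")   # read-only view of a committed revision
print(type(repo_at_rev.fs).__name__)  # GitFileSystem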