def _changed(path_info, fs, obj, cache):
    """Report whether the data at `path_info` no longer matches `obj`."""
    logger.trace("checking if '%s'('%s') has changed.", path_info, obj)

    # A missing or corrupted cache object counts as changed.
    try:
        check(cache, obj)
    except (FileNotFoundError, ObjectFormatError):
        logger.debug(
            "cache for '%s'('%s') has changed.", path_info, obj.hash_info
        )
        return True

    # Re-hash the workspace copy; a vanished path counts as changed too.
    try:
        workspace_hash = stage(
            cache, path_info, fs, obj.hash_info.name
        ).hash_info
    except FileNotFoundError:
        logger.debug("'%s' doesn't exist.", path_info)
        return True

    if obj.hash_info == workspace_hash:
        logger.trace("'%s' hasn't changed.", path_info)
        return False

    logger.debug(
        "hash value '%s' for '%s' has changed (actual '%s').",
        obj.hash_info,
        workspace_hash,
        path_info,
    )
    return True
def _collect_used_dir_cache(
    self, remote=None, force=False, jobs=None, filter_info=None
) -> Optional["Tree"]:
    """Fetch dir cache and return used object IDs for this out.

    Returns None when proceeding without the directory cache (forced or
    user-confirmed); raises CollectCacheError when the user declines.
    """
    # Best-effort: try to pull the directory cache from the remote first.
    try:
        self.get_dir_cache(jobs=jobs, remote=remote)
    except DvcException:
        logger.debug(f"failed to pull cache for '{self}'")

    # Verify the directory object itself. A corrupted object raises
    # ObjectFormatError, which must be treated the same as a missing one
    # (previously only FileNotFoundError was handled here, unlike the
    # sibling collect_used_dir_cache/get_dir_cache implementations, so a
    # corrupted cache entry escaped as an unhandled exception).
    try:
        objects.check(self.odb, self.odb.get(self.hash_info))
    except (FileNotFoundError, ObjectFormatError):
        msg = (
            "Missing cache for directory '{}'. "
            "Cache for files inside will be lost. "
            "Would you like to continue? Use '-f' to force."
        )
        if not force and not prompt.confirm(msg.format(self.path_info)):
            raise CollectCacheError(
                "unable to fully collect used cache"
                " without cache for directory '{}'".format(self)
            )
        return None

    obj = self.get_obj()
    # Narrow the tree to the requested sub-path, if any.
    if filter_info and filter_info != self.path_info:
        prefix = filter_info.relative_to(self.path_info).parts
        obj = obj.filter(prefix)
    return obj
def collect_used_dir_cache(
    self, remote=None, force=False, jobs=None, filter_info=None
) -> Dict[Optional["ObjectDB"], Set["HashFile"]]:
    """Fetch dir cache and return used objects for this out."""
    # Pulling the dir cache is best-effort; failures are only logged.
    try:
        self.get_dir_cache(jobs=jobs, remote=remote)
    except DvcException:
        logger.debug(f"failed to pull cache for '{self}'")

    try:
        objects.check(self.odb, self.odb.get(self.hash_info))
    except (FileNotFoundError, ObjectFormatError):
        prompt_msg = (
            "Missing cache for directory '{}'. "
            "Cache for files inside will be lost. "
            "Would you like to continue? Use '-f' to force."
        )
        # With force, or explicit user consent, proceed without the
        # directory cache; otherwise abort collection.
        if force or prompt.confirm(prompt_msg.format(self.path_info)):
            return {}
        raise CollectCacheError(
            "unable to fully collect used cache"
            " without cache for directory '{}'".format(self)
        )

    tree = self.get_obj(filter_info=filter_info, copy=True)
    self._set_obj_names(tree)
    return {None: {tree}}
def collect_used_dir_cache(
    self, remote=None, force=False, jobs=None, filter_info=None
):
    """Get a list of `info`s related to the given directory.

    - Pull the directory entry from the remote cache if it was changed.

    Example:

        Given the following commands:

        $ echo "foo" > directory/foo
        $ echo "bar" > directory/bar
        $ dvc add directory

        It will return a NamedCache like:

        nc = NamedCache()
        nc.add(self.scheme, 'c157a79031e1', 'directory/foo')
        nc.add(self.scheme, 'd3b07384d113', 'directory/bar')
    """
    used = NamedCache()

    # Best-effort pull; a failure here just means we check the local odb.
    try:
        self.get_dir_cache(jobs=jobs, remote=remote)
    except DvcException:
        logger.debug(f"failed to pull cache for '{self}'")

    try:
        objects.check(self.odb, self.odb.get(self.hash_info))
    except (FileNotFoundError, ObjectFormatError):
        msg = (
            "Missing cache for directory '{}'. "
            "Cache for files inside will be lost. "
            "Would you like to continue? Use '-f' to force."
        )
        if not force and not prompt.confirm(msg.format(self.path_info)):
            raise CollectCacheError(
                "unable to fully collect used cache"
                " without cache for directory '{}'".format(self)
            )
        return used

    root = str(self.path_info)
    filter_path = str(filter_info) if filter_info else None

    def _selected(entry_path):
        # No filter means every entry is used; otherwise keep the exact
        # match and anything strictly under the filtered directory.
        if not filter_path:
            return True
        return entry_path == filter_path or entry_path.startswith(
            filter_path + os.sep
        )

    for entry_key, entry_obj in self.obj:
        entry_path = os.path.join(root, *entry_key)
        if _selected(entry_path):
            used.add(self.scheme, entry_obj.hash_info.value, entry_path)

    return used
def checkout(
    path_info,
    fs,
    obj,
    cache,
    force=False,
    progress_callback=None,
    relink=False,
    quiet=False,
):
    """Materialize `obj` from `cache` at `path_info`.

    Raises CheckoutError when the workspace copy had to be removed but
    could not be (re)created.
    """
    if path_info.scheme not in ("local", cache.fs.scheme):
        raise NotImplementedError

    failed_path = None
    unchanged = False

    if not obj:
        # Nothing to check out — clean up whatever is at the destination.
        if not quiet:
            logger.warning(
                "No file hash info found for '%s'. It won't be created.",
                path_info,
            )
        _remove(path_info, fs, cache, force=force)
        failed_path = path_info
    elif not relink and not _changed(path_info, fs, obj, cache):
        logger.trace("Data '%s' didn't change.", path_info)
        unchanged = True
    else:
        try:
            check(cache, obj)
        except (FileNotFoundError, ObjectFormatError):
            # Cache object missing/corrupted: remove the workspace copy
            # and report the failure.
            if not quiet:
                logger.warning(
                    "Cache '%s' not found. File '%s' won't be created.",
                    obj.hash_info,
                    path_info,
                )
            _remove(path_info, fs, cache, force=force)
            failed_path = path_info

    if failed_path or unchanged:
        if progress_callback and obj:
            progress_callback(
                str(path_info),
                len(obj),
            )
        if failed_path:
            raise CheckoutError([failed_path])
        return

    logger.debug("Checking out '%s' with cache '%s'.", path_info, obj)

    return _checkout(
        path_info,
        fs,
        obj,
        cache,
        force,
        progress_callback,
        relink,
    )
def changed_cache(self, filter_info=None):
    """Return True when the cached object for this out is missing/unusable."""
    # Outs without caching or without a recorded hash are always "changed".
    if not (self.use_cache and self.hash_info):
        return True

    obj = self.get_obj(filter_info=filter_info)
    if not obj:
        return True

    try:
        objects.check(self.odb, obj)
    except (FileNotFoundError, ObjectFormatError):
        return True
    return False
def test_staging_file(tmp_dir, dvc):
    from dvc.objects import check
    from dvc.objects.stage import stage
    from dvc.objects.transfer import transfer

    tmp_dir.gen("foo", "foo")
    workspace_fs = LocalFileSystem()
    local_odb = dvc.odb.local

    # Staging places the object in the staging odb only.
    staging_odb, staged = stage(local_odb, tmp_dir / "foo", workspace_fs, "md5")
    assert not local_odb.exists(staged.hash_info)
    assert staging_odb.exists(staged.hash_info)
    with pytest.raises(FileNotFoundError):
        check(local_odb, staged)
    check(staging_odb, staged)

    # Moving the object into the local odb removes it from staging.
    transfer(staging_odb, local_odb, {staged.hash_info}, move=True)
    check(local_odb, staged)
    with pytest.raises(FileNotFoundError):
        check(staging_odb, staged)

    cache_path = local_odb.hash_to_path_info(staged.hash_info.value)
    assert workspace_fs.exists(cache_path)
def get_dir_cache(self, **kwargs):
    """Load (pulling if necessary) the dir cache object for this out.

    Sets and returns `self.obj` (None if it could not be loaded).
    Raises DvcException when this out is not a directory checksum.
    """
    if not self.is_dir_checksum:
        raise DvcException("cannot get dir cache for file checksum")

    obj = self.odb.get(self.hash_info)
    # Pull when the object is missing OR corrupted — ObjectFormatError
    # previously went unhandled here (unlike the sibling version of this
    # method), so a corrupted local dir object would never be re-fetched
    # from the remote.
    try:
        objects.check(self.odb, obj)
    except (FileNotFoundError, ObjectFormatError):
        self.repo.cloud.pull([obj], **kwargs)

    try:
        self.obj = objects.load(self.odb, self.hash_info)
    except (FileNotFoundError, ObjectFormatError):
        self.obj = None

    return self.obj
def _remove(path_info, fs, cache, force=False):
    """Remove `path_info`, prompting first unless forced or cache-backed."""
    if not fs.exists(path_info):
        return

    if force:
        fs.remove(path_info)
        return

    # Only prompt when the current data is not safely stored in the cache.
    workspace_hash = stage(cache, path_info, fs, fs.PARAM_CHECKSUM).hash_info
    try:
        check(cache, load(cache, workspace_hash))
    except (FileNotFoundError, ObjectFormatError):
        msg = (
            f"file/directory '{path_info}' is going to be removed. "
            "Are you sure you want to proceed?"
        )
        if not prompt.confirm(msg):
            raise ConfirmRemoveError(str(path_info))

    fs.remove(path_info)
def get_dir_cache(self, **kwargs):
    """Load the dir cache object for this out, pulling it when needed."""
    if not self.is_dir_checksum:
        raise DvcException("cannot get dir cache for file checksum")

    try:
        objects.check(self.odb, self.odb.get(self.hash_info))
    except (FileNotFoundError, ObjectFormatError):
        # Not available (or unusable) locally — fetch it from the remote.
        self.repo.cloud.pull(
            NamedCache.make("local", self.hash_info.value, str(self)),
            show_checksums=False,
            **kwargs,
        )

    try:
        self.obj = objects.load(self.odb, self.hash_info)
    except (FileNotFoundError, ObjectFormatError):
        self.obj = None

    return self.obj
def test_get_hash_dirty_file(tmp_dir, dvc):
    from dvc.objects import check
    from dvc.objects.errors import ObjectFormatError
    from dvc.objects.stage import get_file_hash

    tmp_dir.dvc_gen("file", "file")
    cached_hash = HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac")

    (tmp_dir / "file").write_text("something")
    dirty_hash = HashInfo("md5", "437b930db84b8079c2dd804a71936b5f")

    clean_staging()

    # file is modified in workspace
    # get_file_hash(file) should return workspace hash, not DVC cached hash
    fs = RepoFileSystem(repo=dvc)
    assert fs.info(PathInfo(tmp_dir) / "file").get("md5") is None
    staging, _, staged_obj = stage(
        dvc.odb.local, PathInfo(tmp_dir) / "file", fs, "md5"
    )
    assert staged_obj.hash_info == dirty_hash
    check(staging, staged_obj)

    # file is removed in workspace
    # any staged object referring to modified workspace obj is now invalid
    (tmp_dir / "file").unlink()
    with pytest.raises(ObjectFormatError):
        check(staging, staged_obj)

    # get_file_hash(file) should return DVC cached hash
    assert fs.info(PathInfo(tmp_dir) / "file")["md5"] == cached_hash.value
    _, hash_info = get_file_hash(
        PathInfo(tmp_dir) / "file", fs, "md5", state=dvc.state
    )
    assert hash_info == cached_hash

    # tmp_dir/file can be staged even though it is missing in workspace since
    # repofs will use the DVC cached hash (and refer to the local cache object)
    _, _, restaged_obj = stage(
        dvc.odb.local, PathInfo(tmp_dir) / "file", fs, "md5"
    )
    assert restaged_obj.hash_info == cached_hash