def _paths_checksums():
    """
    Build a dictionary of checksums addressed by relpaths, collected from
    the outputs of every stage in the current tree.

    Directory outputs are keyed with a trailing path separator so that they
    can be told apart from file outputs:

        directory: "data/"
        file:      "data"

    On a working tree the checksum is obtained from the local cache via
    ``get_checksum(output.path_info)`` and outputs whose ``exists`` is falsy
    are left out. On any other tree the checksum stored on the output is
    used and every output is included.
    """
    # Resolved once up front; each helper below branches on it.
    working_tree = is_working_tree(self.tree)

    def _key_for(output):
        if output.is_dir_checksum:
            # Joining with an empty component appends the separator.
            return os.path.join(str(output), "")
        return str(output)

    def _checksum_for(output):
        if not working_tree:
            return output.checksum
        return self.cache.local.get_checksum(output.path_info)

    def _is_present(output):
        return output.exists if working_tree else True

    checksums = {}
    for stage in self.stages:
        for output in stage.outs:
            if not _is_present(output):
                continue
            key = _key_for(output)
            checksums[key] = _checksum_for(output)
    return checksums
def _save_file(self, path_info, checksum, save_link=True, tree=None):
    """Save a single file into the cache under ``checksum``.

    Args:
        path_info: location of the file to save.
        checksum: checksum of the file; asserted to be truthy.
        save_link: when true, ``path_info`` is recorded through
            ``self.state.save_link`` (only in the branch without ``tree``).
        tree: optional tree to read the file from. When given, the content
            is only copied into the cache; ``path_info`` itself is not
            moved, removed or linked.

    NOTE(review): the block nesting was reconstructed from a
    whitespace-collapsed source. Confirm that the ``save_link`` step belongs
    to the no-tree branch and that the final cache ``state.save`` is
    unconditional.
    """
    assert checksum

    cache_info = self.checksum_to_path_info(checksum)
    if tree:
        # Reading from a tree: stream the content into the cache, but only
        # when changed_cache() reports the cache entry as changed.
        if self.changed_cache(checksum):
            with tree.open(path_info, mode="rb") as fobj:
                self.copy_fobj(fobj, cache_info)
    else:
        if self.changed_cache(checksum):
            # Move the file into the cache, then link it back into place.
            self.move(path_info, cache_info, mode=self.CACHE_MODE)
            self.link(cache_info, path_info)
        elif self.iscopy(path_info) and self._cache_is_copy(path_info):
            # Default relink procedure involves unneeded copy
            self.unprotect(path_info)
        else:
            # Replace the existing file with a fresh link to the cache.
            self.remove(path_info)
            self.link(cache_info, path_info)

        if save_link:
            self.state.save_link(path_info)

    # we need to update path and cache, since in case of reflink,
    # or copy cache type moving original file results in updates on
    # next executed command, which causes md5 recalculation
    if not tree or is_working_tree(tree):
        self.state.save(path_info, checksum)
    self.state.save(cache_info, checksum)
def get_mtime_and_size(path, tree):
    """Return an ``(mtime, size)`` pair of strings describing ``path``.

    For a regular path, ``size`` is its ``st_size`` and ``mtime`` is its
    ``st_mtime`` converted with ``nanotime.timestamp`` and truncated to int.

    For a directory (``tree`` is asserted to be a working tree), ``size`` is
    the sum of ``st_size`` over the files yielded by
    ``tree.walk_files(path)`` and ``mtime`` is ``dict_md5`` of the
    ``{file path: st_mtime}`` mapping. Files whose ``os.stat`` fails with
    ``ENOENT`` are skipped; any other ``OSError`` propagates.
    """
    if not os.path.isdir(fspath_py35(path)):
        file_stat = os.stat(fspath_py35(path))
        total_size = file_stat.st_size
        stamp = int(nanotime.timestamp(file_stat.st_mtime))
    else:
        assert is_working_tree(tree)

        total_size = 0
        mtimes_by_file = {}
        for entry in tree.walk_files(path):
            try:
                entry_stat = os.stat(entry)
            except OSError as exc:
                # NOTE: broken symlink case.
                if exc.errno == errno.ENOENT:
                    continue
                raise
            total_size += entry_stat.st_size
            mtimes_by_file[entry] = entry_stat.st_mtime

        # A digest over every file's mtime is used instead of
        # max(mtime(f) for f in non_ignored_files), because the maximum
        # alone cannot detect file changes and moves.
        stamp = dict_md5(mtimes_by_file)

    # The state of files handled by dvc is stored in the db as TEXT, so both
    # values are returned as strings for later comparison with stored ones.
    return str(stamp), str(total_size)
def _save_dir(
    self, path_info, checksum, save_link=True, tree=None, **kwargs
):
    """Save a directory into the cache and record it in the state.

    Args:
        path_info: location of the directory to save.
        checksum: checksum of the directory. Only used when ``tree`` is not
            given; with a ``tree`` it is replaced by the value returned
            from ``self._save_dir_info``.
        save_link: when true, ``path_info`` is recorded through
            ``self.state.save_link`` (only in the branch without ``tree``).
        tree: optional tree to collect the directory contents from.
        **kwargs: forwarded to ``self._collect_dir`` when ``tree`` is given.

    Returns:
        dict: ``{self.PARAM_CHECKSUM: checksum}`` with the final checksum.

    NOTE(review): the block nesting was reconstructed from a
    whitespace-collapsed source. Confirm that the per-entry loop and the
    ``save_link`` step belong to the no-tree branch.
    """
    if tree:
        # presumably save_tree=True makes _collect_dir save the files while
        # collecting them -- verify against _collect_dir.
        dir_info = self._collect_dir(
            path_info, tree=tree, save_tree=True, **kwargs
        )
        checksum = self._save_dir_info(dir_info)
    else:
        dir_info = self.get_dir_cache(checksum)

        # Save every entry individually; the link is recorded once for the
        # whole directory below, hence save_link=False per file.
        for entry in Tqdm(
            dir_info, desc="Saving " + path_info.name, unit="file"
        ):
            entry_info = path_info / entry[self.PARAM_RELPATH]
            entry_checksum = entry[self.PARAM_CHECKSUM]
            self._save_file(entry_info, entry_checksum, save_link=False)

        if save_link:
            self.state.save_link(path_info)

    cache_info = self.checksum_to_path_info(checksum)
    self.state.save(cache_info, checksum)
    # The path itself is only recorded when no tree was given or the tree
    # is a working tree.
    if not tree or is_working_tree(tree):
        self.state.save(path_info, checksum)
    return {self.PARAM_CHECKSUM: checksum}
def __init__(self, stage, path, *args, **kwargs):
    """Initialise the output for ``stage`` at ``path``.

    When a stage is given and ``path_isin`` reports ``path`` as being inside
    the stage's repo root, the path is rewritten with ``relpath`` against
    the stage's ``wdir`` before it reaches the parent initialiser.

    After the parent initialiser has run, the repo's tree is adopted as
    ``self.tree`` if the output is in the repo, a repo is set and that tree
    is a working tree.
    """
    if stage:
        if path_isin(path, stage.repo.root_dir):
            path = relpath(path, stage.wdir)

    super().__init__(stage, path, *args, **kwargs)

    if not (self.is_in_repo and self.repo):
        return
    if is_working_tree(self.repo.tree):
        self.tree = self.repo.tree
def work_tree(self):
    """Return the cache's own working tree, or ``None`` if it is not needed.

    ``self._work_tree`` is returned only when a repo is set and the repo's
    current tree is not a working tree; in every other case the result is
    ``None``.
    """
    # When using repo.brancher, repo.tree may switch between WorkingTree and
    # GitTree arbitrarily. While repo.tree is a GitTree, the local cache has
    # to use its own WorkingTree instance.
    repo_tree_suffices = not self.repo or is_working_tree(self.repo.tree)
    return None if repo_tree_suffices else self._work_tree
def tree(self, tree):
    """Install ``tree`` as the current tree and reset dependent state.

    A tree that is not already a ``CleanTree`` is wrapped in one. The
    wrapper is given ``root=None`` when ``tree`` is a working tree or its
    ``tree_root`` equals ``self.root_dir``; otherwise it is given
    ``self.root_dir``.
    """
    no_explicit_root = (
        is_working_tree(tree) or tree.tree_root == self.root_dir
    )
    root = None if no_explicit_root else self.root_dir

    if not isinstance(tree, CleanTree):
        tree = CleanTree(tree, root)
    self._tree = tree

    # The graph cache was built from the previous tree, so it is no longer
    # valid.
    self._reset()
def _save_file(
    self, path_info, checksum, save_link=True, tree=None, **kwargs
):
    """Save a single file into the cache under ``checksum``.

    Args:
        path_info: location of the file to save.
        checksum: checksum of the file; asserted to be truthy.
        save_link: when true, ``path_info`` is recorded through
            ``self.state.save_link`` (only in the branch without ``tree``).
        tree: optional tree to read the file from. When given, the content
            is at most copied into the cache; ``path_info`` itself is not
            moved, removed or linked.
        **kwargs: only ``download_callback`` is read here; if present and
            truthy it is called as ``callback(1)``.

    Returns:
        dict: ``{self.PARAM_CHECKSUM: checksum}``.

    NOTE(review): the block nesting was reconstructed from a
    whitespace-collapsed source. Confirm that the ``download_callback``
    call sits inside the ``changed_cache`` branch (after the ``with``
    block) and that the ``save_link`` step belongs to the no-tree branch.
    """
    assert checksum

    cache_info = self.checksum_to_path_info(checksum)
    if tree:
        if self.changed_cache(checksum):
            with tree.open(path_info, mode="rb") as fobj:
                # if tree has fetch enabled, DVC out will be fetched on
                # open and we do not need to read/copy any data
                if not (
                    tree.isdvc(path_info, strict=False) and tree.fetch
                ):
                    self.tree.copy_fobj(fobj, cache_info)
            # Report one processed file to the optional progress callback.
            callback = kwargs.get("download_callback")
            if callback:
                callback(1)
    else:
        if self.changed_cache(checksum):
            # Move the file into the cache, then link it back into place.
            self.tree.move(path_info, cache_info, mode=self.CACHE_MODE)
            self.link(cache_info, path_info)
        elif self.tree.iscopy(path_info) and self._cache_is_copy(
            path_info
        ):
            # Default relink procedure involves unneeded copy
            self.unprotect(path_info)
        else:
            # Replace the existing file with a fresh link to the cache.
            self.tree.remove(path_info)
            self.link(cache_info, path_info)

        if save_link:
            self.state.save_link(path_info)

    # we need to update path and cache, since in case of reflink,
    # or copy cache type moving original file results in updates on
    # next executed command, which causes md5 recalculation
    if not tree or is_working_tree(tree):
        self.state.save(path_info, checksum)
    self.state.save(cache_info, checksum)
    return {self.PARAM_CHECKSUM: checksum}
def _unprotect_dir(self, path):
    """Unprotect every file that the repo's tree yields under ``path``.

    The repo's tree is asserted to be a working tree; each file name from
    ``walk_files(path)`` is passed to ``RemoteLOCAL._unprotect_file``.
    """
    assert is_working_tree(self.repo.tree)

    file_names = self.repo.tree.walk_files(path)
    for file_name in file_names:
        RemoteLOCAL._unprotect_file(file_name)
def walk_files(self, path_info):
    """Lazily yield a ``PathInfo`` for each file under ``path_info``.

    File names come from the repo tree's ``walk_files``. The repo's tree is
    asserted to be a working tree; since this is a generator, the assertion
    runs when iteration starts, not when the method is called.
    """
    assert is_working_tree(self.repo.tree)
    yield from map(PathInfo, self.repo.tree.walk_files(path_info))
def get_rev(self):
    """Return the revision that the current tree corresponds to.

    On a working tree the revision is asked from the SCM. Otherwise it is
    the ``rev`` attribute of ``self.tree.tree`` when the tree has a ``tree``
    attribute (presumably a wrapper around another tree -- verify), and of
    ``self.tree`` itself when it does not.
    """
    if not is_working_tree(self.tree):
        wraps_inner_tree = hasattr(self.tree, "tree")
        return self.tree.tree.rev if wraps_inner_tree else self.tree.rev
    return self.scm.get_rev()
def exists(self, path_info):
    """Return the repo tree's ``exists`` result for ``path_info``.

    ``path_info`` must be either a ``str`` or an object whose ``scheme`` is
    ``"local"``, and the repo's tree must be a working tree; both conditions
    are enforced with assertions. The path is converted with
    ``fspath_py35`` before it is handed to the tree.
    """
    assert is_working_tree(self.repo.tree)
    assert isinstance(path_info, str) or path_info.scheme == "local"

    local_path = fspath_py35(path_info)
    return self.repo.tree.exists(local_path)