def reflink(self, from_info, to_info):
    tmp_info = to_info.parent / tmp_fname(to_info.name)
    System.reflink(from_info, tmp_info)
    # NOTE: reflink has its own separate inode, so you can set permissions
    # that are different from the source.
    os.chmod(fspath_py35(tmp_info), self._file_mode)
    os.rename(fspath_py35(tmp_info), fspath_py35(to_info))

def copyfile(src, dest, no_progress_bar=False, name=None):
    """Copy file with progress bar"""
    from dvc.exceptions import DvcException
    from dvc.progress import Tqdm
    from dvc.system import System

    src = fspath_py35(src)
    dest = fspath_py35(dest)

    name = name if name else os.path.basename(dest)
    total = os.stat(src).st_size

    if os.path.isdir(dest):
        dest = os.path.join(dest, os.path.basename(src))

    try:
        System.reflink(src, dest)
    except DvcException:
        with Tqdm(
            desc=name, disable=no_progress_bar, total=total, bytes=True
        ) as pbar:
            with open(src, "rb") as fsrc, open(dest, "wb+") as fdest:
                while True:
                    buf = fsrc.read(LOCAL_CHUNK_SIZE)
                    if not buf:
                        break
                    fdest.write(buf)
                    pbar.update(len(buf))

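# Usage sketch for copyfile() above (hypothetical paths): it tries a
# zero-copy reflink first and falls back to a chunked copy with a
# progress bar when the filesystem does not support reflinks.
copyfile("data/raw.csv", "backup/raw.csv", name="raw.csv")
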
def erepo_dir(tmp_path_factory, monkeypatch):
    from dvc.repo import Repo
    from dvc.remote.config import RemoteConfig

    path = TmpDir(fspath_py35(tmp_path_factory.mktemp("erepo")))

    # Chdir for git and dvc to work locally
    with monkeypatch.context() as m:
        m.chdir(fspath_py35(path))

        _git_init()
        path.dvc = Repo.init()
        path.scm = path.dvc.scm
        path.dvc_gen(REPO_TEMPLATE, commit="init repo")

        rconfig = RemoteConfig(path.dvc.config)
        rconfig.add("upstream", path.dvc.cache.local.cache_dir, default=True)
        path.scm_add([path.dvc.config.config_file], commit="add remote")

        path.dvc_gen("version", "master")
        path.scm_add([".gitignore", "version.dvc"], commit="master")

        path.scm.checkout("branch", create_new=True)
        (path / "version").unlink()  # For mac ???
        path.dvc_gen("version", "branch")
        path.scm_add([".gitignore", "version.dvc"], commit="branch")

        path.scm.checkout("master")
        path.dvc.close()

    return path

def test_update_rev(tmp_dir, dvc, scm, git_dir):
    with git_dir.chdir():
        git_dir.scm_gen({"foo": "foo"}, commit="first")

    dvc.imp(fspath(git_dir), "foo")
    assert (tmp_dir / "foo.dvc").exists()

    with git_dir.chdir(), git_dir.branch("branch1", new=True):
        git_dir.scm_gen({"foo": "foobar"}, commit="branch1 commit")
        branch1_head = git_dir.scm.get_rev()

    with git_dir.chdir(), git_dir.branch("branch2", new=True):
        git_dir.scm_gen({"foo": "foobar foo"}, commit="branch2 commit")
        branch2_head = git_dir.scm.get_rev()

    stage = dvc.update("foo.dvc", rev="branch1")
    assert stage.deps[0].def_repo == {
        "url": fspath(git_dir),
        "rev": "branch1",
        "rev_lock": branch1_head,
    }
    with open(fspath_py35(tmp_dir / "foo")) as f:
        assert "foobar" == f.read()

    stage = dvc.update("foo.dvc", rev="branch2")
    assert stage.deps[0].def_repo == {
        "url": fspath(git_dir),
        "rev": "branch2",
        "rev_lock": branch2_head,
    }
    with open(fspath_py35(tmp_dir / "foo")) as f:
        assert "foobar foo" == f.read()

def copy(self, from_info, to_info):
    tmp_info = to_info.parent / tmp_fname(to_info.name)
    try:
        System.copy(from_info, tmp_info)
        os.rename(fspath_py35(tmp_info), fspath_py35(to_info))
    except Exception:
        self.remove(tmp_info)
        raise

def _read_params(repo, configs, rev):
    res = {}
    for config in configs:
        if not repo.tree.exists(fspath_py35(config)):
            continue

        with repo.tree.open(fspath_py35(config), "r") as fobj:
            try:
                res[str(config)] = yaml.safe_load(fobj)
            except yaml.YAMLError:
                logger.debug(
                    "failed to read '%s' on '%s'", config, rev, exc_info=True
                )
                continue

    return res

def is_protected(self, path_info):
    if not self.exists(path_info):
        return False

    mode = os.stat(fspath_py35(path_info)).st_mode

    return stat.S_IMODE(mode) == self.CACHE_MODE

def get(self, path_info):
    """Gets the checksum for the specified path info. Checksum will be
    retrieved from the state database if available.

    Args:
        path_info (dict): path info to get the checksum for.

    Returns:
        str or None: checksum for the specified path info or None if it
        doesn't exist in the state database.
    """
    assert path_info.scheme == "local"
    path = fspath_py35(path_info)

    if not os.path.exists(path):
        return None

    actual_mtime, actual_size = get_mtime_and_size(path, self.repo.tree)
    actual_inode = get_inode(path)

    existing_record = self.get_state_record_for_inode(actual_inode)
    if not existing_record:
        return None

    mtime, size, checksum, _ = existing_record
    if self._file_metadata_changed(actual_mtime, mtime, actual_size, size):
        return None

    self._update_state_record_timestamp_for_inode(actual_inode)
    return checksum

@contextmanager
def chdir(self):
    # NOTE: the generator must be wrapped in @contextmanager (from
    # contextlib) to be usable as `with path.chdir(): ...`.
    old = os.getcwd()
    try:
        os.chdir(fspath_py35(self))
        yield
    finally:
        os.chdir(old)

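# Usage sketch: chdir() above is consumed as a context manager (as the
# erepo_dir fixture below does), restoring the previous working
# directory even if the body raises.
with path.chdir():
    _git_init()  # illustrative; any cwd-sensitive work goes here
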
def save(self, path_info, checksum):
    """Save checksum for the specified path info.

    Args:
        path_info (dict): path_info to save checksum for.
        checksum (str): checksum to save.
    """
    assert path_info.scheme == "local"
    assert checksum is not None
    assert os.path.exists(fspath_py35(path_info))

    actual_mtime, actual_size = get_mtime_and_size(
        path_info, self.repo.tree
    )
    actual_inode = get_inode(path_info)

    existing_record = self.get_state_record_for_inode(actual_inode)
    if not existing_record:
        self._insert_new_state_record(
            actual_inode, actual_mtime, actual_size, checksum
        )
        return

    self._update_state_for_path_changed(
        actual_inode, actual_mtime, actual_size, checksum
    )

def test_ignore_external(tmp_dir, scm, dvc, tmp_path_factory):
    tmp_dir.gen(".dvcignore", "*.backup\ntmp")
    ext_dir = TmpDir(fspath_py35(tmp_path_factory.mktemp("external_dir")))
    ext_dir.gen({"y.backup": "y", "tmp": "ext tmp"})

    remote = RemoteLOCAL(dvc, {})
    result = {relpath(f, ext_dir) for f in remote.walk_files(ext_dir)}
    assert result == {"y.backup", "tmp"}

def _upload(
    self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs
):
    makedirs(to_info.parent, exist_ok=True)

    tmp_file = tmp_fname(to_info)
    copyfile(
        from_file, tmp_file, name=name, no_progress_bar=no_progress_bar
    )
    os.rename(tmp_file, fspath_py35(to_info))

def test_pipeline_file_target_ops(tmp_dir, dvc, local_remote, run_copy):
    tmp_dir.dvc_gen("foo", "foo")
    run_copy("foo", "bar", single_stage=True)

    tmp_dir.dvc_gen("lorem", "lorem")
    run_copy("lorem", "lorem2", name="copy-lorem-lorem2")

    tmp_dir.dvc_gen("ipsum", "ipsum")
    run_copy("ipsum", "baz", name="copy-ipsum-baz")

    outs = ["foo", "bar", "lorem", "ipsum", "baz", "lorem2"]

    remove(dvc.stage_cache.cache_dir)

    dvc.push()
    # each output is a copy of another one, so only 3 unique cache files
    assert len(recurse_list_dir(fspath_py35(local_remote))) == 3

    clean(outs, dvc)
    assert set(dvc.pull(["dvc.yaml"])["added"]) == {"lorem2", "baz"}

    clean(outs, dvc)
    assert set(dvc.pull()["added"]) == set(outs)

    # clean everything in remote and push
    clean(local_remote.iterdir())
    dvc.push(["dvc.yaml:copy-ipsum-baz"])
    assert len(recurse_list_dir(fspath_py35(local_remote))) == 1

    clean(local_remote.iterdir())
    dvc.push(["dvc.yaml"])
    assert len(recurse_list_dir(fspath_py35(local_remote))) == 2

    with pytest.raises(StageNotFound):
        dvc.push(["dvc.yaml:StageThatDoesNotExist"])
    with pytest.raises(StageNotFound):
        dvc.pull(["dvc.yaml:StageThatDoesNotExist"])

def _read_metrics(repo, metrics, rev):
    tree = RepoTree(repo)

    res = {}
    for metric in metrics:
        if not tree.exists(fspath_py35(metric)):
            continue

        with tree.open(fspath_py35(metric), "r") as fobj:
            try:
                # NOTE this also supports JSON
                val = yaml.safe_load(fobj)
            except yaml.YAMLError:
                logger.debug(
                    "failed to read '%s' on '%s'", metric, rev, exc_info=True
                )
                continue

        val = _extract_metrics(val)
        if val:
            res[str(metric)] = val

    return res

def makedirs(path, exist_ok=False, mode=None):
    path = fspath_py35(path)

    if mode is None:
        os.makedirs(path, exist_ok=exist_ok)
        return

    # Utilize umask to set proper permissions: since Python 3.7 the `mode`
    # argument of `makedirs` no longer affects the file permission bits of
    # newly-created intermediate-level directories.
    umask = os.umask(0o777 - mode)
    try:
        os.makedirs(path, exist_ok=exist_ok)
    finally:
        os.umask(umask)

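# Usage sketch (hypothetical path): create nested cache directories
# with group-writable 0o775 permissions; unlike os.makedirs(mode=...),
# the umask trick above applies to the intermediate directories too.
makedirs("cache/ab/cdef", exist_ok=True, mode=0o775)
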
def verify_metric(self):
    if not self.metric:
        return

    path = fspath_py35(self.path_info)
    if not os.path.exists(path):
        return

    if os.path.isdir(path):
        msg = "directory '{}' cannot be used as metrics."
        raise DvcException(msg.format(self.path_info))

    if not istextfile(path):
        msg = "binary file '{}' cannot be used as metrics."
        raise DvcException(msg.format(self.path_info))

def protect(path_info):
    path = fspath_py35(path_info)
    mode = stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH

    try:
        os.chmod(path, mode)
    except OSError as exc:
        # In a shared cache scenario, we might not own the cache file, so
        # we need to check if the cache file is already protected.
        if exc.errno not in [errno.EPERM, errno.EACCES]:
            raise

        actual = os.stat(path).st_mode
        if actual & mode != mode:
            raise

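# Usage sketch (hypothetical path): mark a cache file read-only for
# user, group and others; an EPERM/EACCES from chmod is tolerated only
# when the file already carries the read-only bits.
protect("cache/ab/cdef1234567890")
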
def _get_info(self):
    if not self.exists:
        return {}

    with open(fspath_py35(self.path_info), "r") as fobj:
        try:
            config = yaml.safe_load(fobj)
        except yaml.YAMLError as exc:
            raise BadParamFileError(
                "Unable to read parameters from '{}'".format(self)
            ) from exc

    ret = {}
    for param in self.params:
        ret[param] = dpath.util.get(config, param, separator=".")

    return ret

def _parse_path(self, remote, path):
    parsed = urlparse(path)
    if parsed.scheme == "remote":
        p = remote.path_info / parsed.path.lstrip("/")
    else:
        # NOTE: we can get a path either from the command line or from a
        # .dvc file, so we should expect both posix and windows style
        # paths. PathInfo accepts both, i.e. / works everywhere, \ only
        # on win.
        #
        # FIXME: if we have a Windows path containing / or a posix one
        # with \ then we have bug #2059 and can't really handle that.
        p = self.REMOTE.path_cls(path)
        if not p.is_absolute():
            p = self.stage.wdir / p

    abs_p = os.path.abspath(os.path.normpath(fspath_py35(p)))
    return self.REMOTE.path_cls(abs_p)

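# Illustrative inputs _parse_path() above is expected to handle
# (hypothetical remote name and paths):
#   "remote://upstream/data.csv" -> joined onto the "upstream" remote's
#                                   path_info
#   "../data.csv"                -> resolved against stage.wdir into an
#                                   absolute local path
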
def test_isdir_isfile(tmp_dir, dvc):
    tmp_dir.gen({"datafile": "data", "datadir": {"foo": "foo", "bar": "bar"}})

    tree = DvcTree(dvc)
    assert not tree.isdir("datadir")
    assert not tree.isfile("datadir")
    assert not tree.isdir("datafile")
    assert not tree.isfile("datafile")

    dvc.add(["datadir", "datafile"])
    shutil.rmtree(fspath_py35(tmp_dir / "datadir"))
    (tmp_dir / "datafile").unlink()

    assert tree.isdir("datadir")
    assert not tree.isfile("datadir")
    assert not tree.isdir("datafile")
    assert tree.isfile("datafile")

def file_md5(fname):
    """ get the (md5 hexdigest, md5 digest) of a file """
    from dvc.progress import Tqdm
    from dvc.istextfile import istextfile

    fname = fspath_py35(fname)

    if os.path.exists(fname):
        hash_md5 = hashlib.md5()
        binary = not istextfile(fname)
        size = os.path.getsize(fname)
        no_progress_bar = True
        if size >= LARGE_FILE_SIZE:
            no_progress_bar = False
            msg = (
                "Computing md5 for a large file '{}'. "
                "This is only done once."
            )
            logger.info(msg.format(relpath(fname)))
        name = relpath(fname)

        with Tqdm(
            desc=name,
            disable=no_progress_bar,
            total=size,
            bytes=True,
            leave=False,
        ) as pbar:
            with open(fname, "rb") as fobj:
                while True:
                    data = fobj.read(LOCAL_CHUNK_SIZE)
                    if not data:
                        break

                    if binary:
                        chunk = data
                    else:
                        chunk = dos2unix(data)

                    hash_md5.update(chunk)
                    pbar.update(len(data))

        return (hash_md5.hexdigest(), hash_md5.digest())

    return (None, None)

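# Usage sketch (hypothetical file): file_md5() returns (None, None) for
# a missing file, so callers can tell "no checksum" apart from a real
# digest without catching an exception.
hexdigest, digest = file_md5("data/raw.csv")
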
def save_link(self, path_info):
    """Adds the specified path to the list of links created by dvc. This
    list is later used on `dvc checkout` to cleanup old links.

    Args:
        path_info (dict): path info to add to the list of links.
    """
    assert path_info.scheme == "local"

    if not os.path.exists(fspath_py35(path_info)):
        return

    mtime, _ = get_mtime_and_size(path_info, self.repo.tree)
    inode = get_inode(path_info)
    relative_path = relpath(path_info, self.root_dir)

    cmd = "REPLACE INTO {}(path, inode, mtime) VALUES (?, ?, ?)".format(
        self.LINK_STATE_TABLE
    )
    self._execute(cmd, (relative_path, self._to_sqlite(inode), mtime))

def protect(self, path_info):
    path = fspath_py35(path_info)
    mode = self.CACHE_MODE

    try:
        os.chmod(path, mode)
    except OSError as exc:
        # There is nothing we need to do in case of a read-only file
        # system.
        if exc.errno == errno.EROFS:
            return

        # In a shared cache scenario, we might not own the cache file, so
        # we need to check if the cache file is already protected.
        if exc.errno not in [errno.EPERM, errno.EACCES]:
            raise

        actual = stat.S_IMODE(os.stat(path).st_mode)
        if actual != mode:
            raise

def erepo_dir(tmp_path_factory, monkeypatch):
    from dvc.repo import Repo
    from dvc.remote.config import RemoteConfig

    path = TmpDir(fspath_py35(tmp_path_factory.mktemp("erepo")))

    # Chdir for git and dvc to work locally
    with path.chdir():
        _git_init()
        path.dvc = Repo.init()
        path.scm = path.dvc.scm
        path.scm.commit("init dvc")

        rconfig = RemoteConfig(path.dvc.config)
        rconfig.add("upstream", path.dvc.cache.local.cache_dir, default=True)
        path.scm_add([path.dvc.config.config_file], commit="add remote")

        path.dvc.close()

    return path

def _open_cached(self, out, remote=None, mode="r", encoding=None):
    if out.isdir():
        raise ValueError("Can't open a dir")

    cache_file = self.cache.local.checksum_to_path_info(out.checksum)
    cache_file = fspath_py35(cache_file)

    if os.path.exists(cache_file):
        return open(cache_file, mode=mode, encoding=encoding)

    try:
        remote_obj = self.cloud.get_remote(remote)
        remote_info = remote_obj.checksum_to_path_info(out.checksum)
        return remote_obj.open(remote_info, mode=mode, encoding=encoding)
    except RemoteActionNotImplemented:
        with self.state:
            cache_info = out.get_used_cache(remote=remote)
            self.cloud.pull(cache_info, remote=remote)

        return open(cache_file, mode=mode, encoding=encoding)

def isfile(path_info):
    return os.path.isfile(fspath_py35(path_info))


def exists(path_info):
    assert path_info.scheme == "local"
    return os.path.lexists(fspath_py35(path_info))


def isdir(path_info):
    return os.path.isdir(fspath_py35(path_info))

def open(path_info, mode="r", encoding=None):
    import io

    # NOTE: this helper shadows the builtin open(), so go through
    # io.open (an alias of the builtin) to avoid infinite recursion.
    return io.open(fspath_py35(path_info), mode=mode, encoding=encoding)

def getsize(path_info):
    return os.path.getsize(fspath_py35(path_info))