def erepo_dir(tmp_path_factory, monkeypatch):
    """Create a temporary "external" git+dvc repository for tests.

    The repo is built under a pytest tmp dir and contains:
    - the files from REPO_TEMPLATE committed as "init repo",
    - a default remote "upstream" that points at the repo's own local cache,
    - a "version" file with content "master" on master and "branch" on a
      branch named "branch" (master is checked out on return).

    Returns the TmpDir path object with ``.dvc`` and ``.scm`` attached.
    """
    from dvc.repo import Repo
    from dvc.remote.config import RemoteConfig

    path = TmpDir(fspath_py35(tmp_path_factory.mktemp("erepo")))

    # Chdir for git and dvc to work locally
    monkeypatch.chdir(fspath_py35(path))

    _git_init()
    path.dvc = Repo.init()
    path.scm = path.dvc.scm
    path.dvc_gen(REPO_TEMPLATE, commit="init repo")

    # Use the repo's own local cache dir as the default remote so tests can
    # push/pull without any external storage.
    rconfig = RemoteConfig(path.dvc.config)
    rconfig.add("upstream", path.dvc.cache.local.cache_dir, default=True)
    path.scm_add([path.dvc.config.config_file], commit="add remote")

    path.dvc_gen("version", "master")
    path.scm_add([".gitignore", "version.dvc"], commit="master")

    path.scm.checkout("branch", create_new=True)
    # For mac ??? -- NOTE(review): presumably removes the checked-out file so
    # dvc_gen can recreate it on macOS; confirm before changing.
    (path / "version").unlink()
    path.dvc_gen("version", "branch")
    path.scm_add([".gitignore", "version.dvc"], commit="branch")

    path.scm.checkout("master")
    path.dvc.close()
    monkeypatch.undo()  # Undo chdir
    return path
def upload(self, from_infos, to_infos, names=None, no_progress_bar=False):
    """Upload local files to local destinations, one by one.

    Args:
        from_infos: list of source path infos (scheme must be "local").
        to_infos: list of destination path infos (scheme must be "local").
        names: optional list of display names for progress reporting.
        no_progress_bar: disable per-file progress reporting.

    Failures are logged and the remaining transfers continue.
    """
    names = self._verify_path_args(to_infos, from_infos, names)

    for from_info, to_info, name in zip(from_infos, to_infos, names):
        if to_info.scheme != "local":
            raise NotImplementedError

        if from_info.scheme != "local":
            raise NotImplementedError

        logger.debug("Uploading '{}' to '{}'".format(from_info, to_info))

        if not name:
            name = from_info.name

        makedirs(fspath_py35(to_info.parent), exist_ok=True)

        # Copy to a temporary name first, then rename, so a partially
        # written file never appears at the destination path.
        tmp_file = tmp_fname(to_info)
        try:
            copyfile(
                fspath_py35(from_info),
                tmp_file,
                name=name,
                no_progress_bar=no_progress_bar,
            )
            os.rename(tmp_file, fspath_py35(to_info))
        except Exception:
            logger.exception(
                "failed to upload '{}' to '{}'".format(from_info, to_info)
            )
            # Fix: don't leave a stale partially-copied temporary file
            # behind when the copy or rename fails (best-effort cleanup).
            try:
                os.unlink(tmp_file)
            except OSError:
                pass
def copyfile(src, dest, no_progress_bar=False, name=None):
    """Copy ``src`` to ``dest``, showing a progress bar.

    A reflink (copy-on-write clone) is attempted first; when the
    filesystem does not support it, a chunked byte copy is performed.
    """
    from dvc.exceptions import DvcException
    from dvc.progress import Tqdm
    from dvc.system import System

    src = fspath_py35(src)
    dest = fspath_py35(dest)

    display_name = name if name else os.path.basename(dest)
    size = os.stat(src).st_size

    # Copying into a directory means copying under the source's basename.
    if os.path.isdir(dest):
        dest = os.path.join(dest, os.path.basename(src))

    try:
        System.reflink(src, dest)
        return
    except DvcException:
        pass

    with Tqdm(
        desc=display_name, disable=no_progress_bar, total=size, bytes=True
    ) as pbar:
        with open(src, "rb") as fsrc, open(dest, "wb+") as fdest:
            for chunk in iter(lambda: fsrc.read(LOCAL_CHUNK_SIZE), b""):
                fdest.write(chunk)
                pbar.update(len(chunk))
def download(
    self,
    from_infos,
    to_infos,
    names=None,
    no_progress_bar=False,
    resume=False,
):
    """Download files from OSS to local paths.

    Failures are logged as warnings and the remaining transfers continue.
    ``resume`` is accepted for interface compatibility but not used here.
    """
    names = self._verify_path_args(from_infos, to_infos, names)

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info.scheme != self.scheme:
            raise NotImplementedError

        if to_info.scheme != "local":
            raise NotImplementedError

        logger.debug("Downloading '{}' to '{}'".format(from_info, to_info))

        # Download into a temporary file first; only a complete download
        # is moved into the destination path.
        tmp_file = tmp_fname(to_info)
        if not name:
            name = to_info.name

        cb = None if no_progress_bar else Callback(name)

        makedirs(fspath_py35(to_info.parent), exist_ok=True)

        try:
            self.oss_service.get_object_to_file(
                from_info.path, tmp_file, progress_callback=cb
            )
        except Exception:
            logger.warning("failed to download '{}'".format(from_info))
        else:
            move(tmp_file, fspath_py35(to_info))
        finally:
            # Finish the progress target even on failure so it does not
            # linger in the progress output.
            if not no_progress_bar:
                progress.finish_target(name)
def relpath(path, start=os.curdir):
    """Like ``os.path.relpath``, but tolerant of Windows cross-drive paths.

    On Windows there is no relative path between locations on different
    drives; in that case the path is returned unchanged.
    """
    path = fspath_py35(path)
    start = fspath_py35(os.path.abspath(start))

    if os.name == "nt":
        abs_path = os.path.abspath(path)
        # No common prefix at all means a different drive -> no relpath.
        if not os.path.commonprefix([start, abs_path]):
            return path

    return os.path.relpath(path, start)
def download(
    self,
    from_infos,
    to_infos,
    no_progress_bar=False,
    names=None,
    resume=False,
):
    """Download many files from this remote to local destinations.

    Same-scheme (non-local) transfers are delegated to ``self.copy``.
    Failed downloads are logged and skipped; a single transfer context is
    shared across all transfers.

    Raises:
        RemoteActionNotImplemented: if the remote defines no ``_download``.
    """
    if not hasattr(self, "_download"):
        raise RemoteActionNotImplemented("download", self.scheme)

    names = self._verify_path_args(from_infos, to_infos, names)

    with self.transfer_context() as ctx:
        for to_info, from_info, name in zip(to_infos, from_infos, names):
            if from_info.scheme != self.scheme:
                raise NotImplementedError

            # remote -> same remote: delegate to copy() instead of
            # round-tripping the data through the local machine.
            if to_info.scheme == self.scheme != "local":
                self.copy(from_info, to_info, ctx=ctx)
                continue

            if to_info.scheme != "local":
                raise NotImplementedError

            msg = "Downloading '{}' to '{}'".format(from_info, to_info)
            logger.debug(msg)

            # Download into a temporary file first; only a complete
            # download is moved into place.
            tmp_file = tmp_fname(to_info)
            if not name:
                name = to_info.name

            if not no_progress_bar:
                # real progress is not always available,
                # lets at least show start and finish
                progress.update_target(name, 0, None)

            makedirs(fspath_py35(to_info.parent), exist_ok=True)

            try:
                self._download(
                    from_info,
                    tmp_file,
                    name=name,
                    ctx=ctx,
                    resume=resume,
                    no_progress_bar=no_progress_bar,
                )
            except Exception:
                msg = "failed to download '{}' to '{}'"
                logger.exception(msg.format(from_info, to_info))
                continue

            move(tmp_file, fspath_py35(to_info))

            if not no_progress_bar:
                progress.finish_target(name)
def _upload(
    self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs
):
    """Copy ``from_file`` into place at ``to_info`` atomically.

    The data is first written to a temporary name next to the target and
    then renamed, so readers never observe a half-written file.
    """
    dest = fspath_py35(to_info)
    makedirs(fspath_py35(to_info.parent), exist_ok=True)

    tmp_file = tmp_fname(to_info)
    copyfile(from_file, tmp_file, name=name, no_progress_bar=no_progress_bar)
    os.rename(tmp_file, dest)
def list_cache_paths(self):
    """Return path infos for every file in the local cache.

    The cache is laid out as two directory levels; non-directory entries
    at the top level are skipped.
    """
    assert self.path_info is not None

    root = fspath_py35(self.path_info)
    paths = []
    for entry in os.listdir(root):
        subdir = self.path_info / entry
        subdir_path = fspath_py35(subdir)
        if os.path.isdir(subdir_path):
            paths.extend(subdir / cache for cache in os.listdir(subdir_path))
    return paths
def download(
    self,
    from_infos,
    to_infos,
    no_progress_bar=False,
    names=None,
    resume=False,
):
    """Download files from S3 to local paths (or copy within S3).

    Failed downloads are logged and skipped. ``resume`` is accepted for
    interface compatibility but not used here.
    """
    names = self._verify_path_args(from_infos, to_infos, names)

    s3 = self.s3

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info.scheme != "s3":
            raise NotImplementedError

        if to_info.scheme == "s3":
            # s3 -> s3: delegate to copy() instead of round-tripping the
            # data through the local machine.
            self.copy(from_info, to_info, s3=s3)
            continue

        if to_info.scheme != "local":
            raise NotImplementedError

        msg = "Downloading '{}' to '{}'".format(from_info, to_info)
        logger.debug(msg)

        # Download into a temporary file first; only a complete download
        # is moved into the destination path.
        tmp_file = tmp_fname(to_info)
        if not name:
            name = to_info.name

        makedirs(fspath_py35(to_info.parent), exist_ok=True)

        try:
            if no_progress_bar:
                cb = None
            else:
                # The object's total size is needed up front so the
                # progress callback can report a percentage.
                total = s3.head_object(
                    Bucket=from_info.bucket, Key=from_info.path
                )["ContentLength"]
                cb = Callback(name, total)

            s3.download_file(
                from_info.bucket, from_info.path, tmp_file, Callback=cb
            )
        except Exception:
            msg = "failed to download '{}'".format(from_info)
            logger.exception(msg)
            continue

        move(tmp_file, fspath_py35(to_info))

        if not no_progress_bar:
            progress.finish_target(name)
def download(
    self,
    from_infos,
    to_infos,
    no_progress_bar=False,
    names=None,
    resume=False,
):
    """Download files from Google Cloud Storage to local paths.

    gs -> gs transfers are delegated to ``self.copy``. Failed downloads
    are logged and skipped. ``resume`` is accepted for interface
    compatibility but not used here.
    """
    names = self._verify_path_args(from_infos, to_infos, names)

    gs = self.gs

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info.scheme != "gs":
            raise NotImplementedError

        if to_info.scheme == "gs":
            self.copy(from_info, to_info, gs=gs)
            continue

        if to_info.scheme != "local":
            raise NotImplementedError

        msg = "Downloading '{}' to '{}'".format(from_info, to_info)
        logger.debug(msg)

        # Download into a temporary file first; only a complete download
        # is moved into the destination path.
        tmp_file = tmp_fname(to_info)
        if not name:
            name = to_info.name

        if not no_progress_bar:
            # percent_cb is not available for download_to_filename, so
            # lets at least update progress at checkpoints (start, finish)
            progress.update_target(name, 0, None)

        makedirs(fspath_py35(to_info.parent), exist_ok=True)

        try:
            bucket = gs.bucket(from_info.bucket)
            # NOTE(review): get_blob returns None for a missing object; the
            # resulting AttributeError is caught below as a generic failure
            # -- confirm that is the intended handling.
            blob = bucket.get_blob(from_info.path)
            blob.download_to_filename(tmp_file)
        except Exception:
            msg = "failed to download '{}' to '{}'"
            logger.exception(msg.format(from_info, to_info))
            continue

        move(tmp_file, fspath_py35(to_info))

        if not no_progress_bar:
            progress.finish_target(name)
def get(self, path_info):
    """Gets the checksum for the specified path info. Checksum will be
    retrieved from the state database if available.

    Args:
        path_info (dict): path info to get the checksum for.

    Returns:
        str or None: checksum for the specified path info or None if it
        doesn't exist in the state database.
    """
    assert path_info.scheme == "local"
    path = fspath_py35(path_info)

    # A path that no longer exists cannot have a usable cached checksum.
    if not os.path.exists(path):
        return None

    actual_mtime, actual_size = get_mtime_and_size(path, self.repo.dvcignore)
    actual_inode = get_inode(path)

    record = self.get_state_record_for_inode(actual_inode)
    if not record:
        return None

    mtime, size, checksum, _ = record
    # Stale record: the file changed since the checksum was stored.
    if self._file_metadata_changed(actual_mtime, mtime, actual_size, size):
        return None

    # Mark the record as recently used.
    self._update_state_record_timestamp_for_inode(actual_inode)
    return checksum
def save_link(self, path_info):
    """Adds the specified path to the list of links created by dvc.

    This list is later used on `dvc checkout` to cleanup old links.

    Args:
        path_info (dict): path info to add to the list of links.
    """
    assert path_info.scheme == "local"
    path = fspath_py35(path_info)

    if not os.path.exists(path):
        return

    # Fix: pass dvcignore, consistent with the other state methods
    # (get/save), so ignored entries do not affect directory mtime/size.
    mtime, _ = get_mtime_and_size(path, self.repo.dvcignore)
    inode = get_inode(path)
    relative_path = relpath(path, self.root_dir)

    # NOTE(review): values are interpolated directly into the SQL text; a
    # path containing a double quote would break the statement. Prefer a
    # parameterized query if self._execute supports placeholders.
    cmd = (
        "REPLACE INTO {}(path, inode, mtime) "
        'VALUES ("{}", {}, "{}")'.format(
            self.LINK_STATE_TABLE,
            relative_path,
            self._to_sqlite(inode),
            mtime,
        )
    )
    self._execute(cmd)
def open(self, path, remote=None, mode="r", encoding=None):
    """Opens a specified resource as a file descriptor.

    Tries, in order: the local cache file, streaming straight from the
    remote, and finally pulling the cache entry and opening it locally.

    Args:
        path: path to a tracked output inside this repo.
        remote: optional name of the remote to stream/pull from.
        mode: open mode, passed through to the underlying open call.
        encoding: text encoding, passed through as well.

    Raises:
        ValueError: if the path is a tracked directory.
        OutputFileMissingError: if the data could not be fetched.
    """
    out, = self.find_outs_by_path(path)
    if out.isdir():
        raise ValueError("Can't open a dir")

    cache_file = self.cache.local.checksum_to_path_info(out.checksum)
    cache_file = fspath_py35(cache_file)

    if os.path.exists(cache_file):
        return _open(cache_file, mode=mode, encoding=encoding)

    try:
        remote_obj = self.cloud.get_remote(remote)
        remote_info = remote_obj.checksum_to_path_info(out.checksum)
        return remote_obj.open(remote_info, mode=mode, encoding=encoding)
    except RemoteActionNotImplemented:
        with self.state:
            cache_info = out.get_used_cache(remote=remote)
            self.cloud.pull(cache_info, remote=remote)

        # Since pull may just skip with a warning, we need to check it here
        if not os.path.exists(cache_file):
            raise OutputFileMissingError(relpath(path, self.root_dir))

        return _open(cache_file, mode=mode, encoding=encoding)
def save(self, path_info, checksum):
    """Save checksum for the specified path info.

    Inserts a fresh state row for the file's inode, or updates the
    existing row when one is already present.

    Args:
        path_info (dict): path_info to save checksum for.
        checksum (str): checksum to save.
    """
    assert path_info.scheme == "local"
    assert checksum is not None

    path = fspath_py35(path_info)
    assert os.path.exists(path)

    mtime, size = get_mtime_and_size(path, self.repo.dvcignore)
    inode = get_inode(path)

    if self.get_state_record_for_inode(inode):
        self._update_state_for_path_changed(inode, mtime, size, checksum)
    else:
        self._insert_new_state_record(inode, mtime, size, checksum)
def upload(self, from_infos, to_infos, names=None, no_progress_bar=False):
    """Upload local files to S3.

    Args:
        from_infos: list of local source path infos.
        to_infos: list of "s3" scheme destination path infos.
        names: optional display names for progress reporting.
        no_progress_bar: disable progress reporting.

    Failed uploads are logged and skipped; remaining transfers continue.
    """
    names = self._verify_path_args(to_infos, from_infos, names)

    s3 = self.s3

    for from_info, to_info, name in zip(from_infos, to_infos, names):
        if to_info.scheme != "s3":
            raise NotImplementedError

        if from_info.scheme != "local":
            raise NotImplementedError

        logger.debug("Uploading '{}' to '{}'".format(from_info, to_info))

        if not name:
            name = from_info.name

        # The file's total size lets the callback report a percentage.
        total = os.path.getsize(fspath_py35(from_info))
        cb = None if no_progress_bar else Callback(name, total)

        try:
            s3.upload_file(
                from_info.fspath,
                to_info.bucket,
                to_info.path,
                Callback=cb,
                ExtraArgs=self.extra_args,
            )
        except Exception:
            msg = "failed to upload '{}'".format(from_info)
            logger.exception(msg)
            continue

        # Fix: only finish the progress target when one was started;
        # matches the guard used by the download counterpart.
        if not no_progress_bar:
            progress.finish_target(name)
def _download(
    self, from_info, to_file, name=None, no_progress_bar=False, **_kwargs
):
    """Fetch a single file from the local "remote" by plain file copy."""
    src = fspath_py35(from_info)
    copyfile(src, to_file, name=name, no_progress_bar=no_progress_bar)
def download(self, from_info, to_info, name=None, no_progress_bar=False):
    """Download a single file from this remote to a local path.

    Same-scheme (non-local) transfers are handled by ``self.copy``.

    Returns:
        int: 0 on success, 1 on failure (the error is logged, not raised).

    Raises:
        RemoteActionNotImplemented: if the remote defines no ``_download``.
    """
    if not hasattr(self, "_download"):
        raise RemoteActionNotImplemented("download", self.scheme)

    if from_info.scheme != self.scheme:
        raise NotImplementedError

    if to_info.scheme == self.scheme and self.scheme != "local":
        self.copy(from_info, to_info)
        return 0

    if to_info.scheme != "local":
        raise NotImplementedError

    logger.debug("Downloading '{}' to '{}'".format(from_info, to_info))

    display_name = name or to_info.name

    # Real progress is not always available,
    # so at least show start and finish.
    if not no_progress_bar:
        progress.update_target(display_name, 0, None)

    makedirs(fspath_py35(to_info.parent), exist_ok=True)

    # Download into a temporary file, then move into place.
    partial = tmp_fname(to_info)
    try:
        self._download(
            from_info,
            partial,
            name=display_name,
            no_progress_bar=no_progress_bar,
        )
    except Exception:
        logger.exception(
            "failed to download '{}' to '{}'".format(from_info, to_info)
        )
        return 1  # 1 fail

    move(partial, fspath_py35(to_info))

    if not no_progress_bar:
        progress.finish_target(display_name)

    return 0
def makedirs(path, exist_ok=False, mode=None):
    """``os.makedirs`` wrapper that can force a permission ``mode``.

    Args:
        path: directory path (any os.PathLike / PathInfo accepted).
        exist_ok: do not raise if the directory already exists.
        mode: permission bits for every created directory level; None
            keeps the default process behavior.
    """
    path = fspath_py35(path)

    if mode is None:
        _makedirs(path, exist_ok=exist_ok)
        return

    # Fix: the previous os.umask(0) + mode= combination made every
    # *intermediate* directory world-writable (0o777) on Python >= 3.7,
    # where the `mode` argument of makedirs no longer affects the
    # permission bits of intermediate-level directories. Drive the
    # permissions through the umask instead so every created level gets
    # `mode`.
    umask = os.umask(0o777 - mode)
    try:
        _makedirs(path, exist_ok=exist_ok)
    finally:
        os.umask(umask)
def download(
    self,
    from_infos,
    to_infos,
    names=None,
    no_progress_bar=False,
    resume=False,
):
    """Download files over HTTP(S) to local paths.

    Returns:
        int: number of failed downloads; failures are logged, not raised.
    """
    # NOTE(review): arguments are passed as (to_infos, from_infos) here,
    # while the other remotes call _verify_path_args(from_infos, to_infos).
    # Harmless if the helper only validates lengths and fills in names,
    # but confirm before relying on it.
    names = self._verify_path_args(to_infos, from_infos, names)
    fails = 0

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info.scheme != self.scheme:
            raise NotImplementedError

        if to_info.scheme != "local":
            raise NotImplementedError

        msg = "Downloading '{}' to '{}'".format(from_info, to_info)
        logger.debug(msg)

        if not name:
            name = to_info.name

        makedirs(fspath_py35(to_info.parent), exist_ok=True)

        # Content-Length is needed for a percentage bar; without it the
        # transfer runs without a progress callback.
        total = self._content_length(from_info.url)

        if no_progress_bar or not total:
            cb = None
        else:
            cb = ProgressBarCallback(name, total)

        try:
            self._download_to(
                from_info.url, to_info.fspath, callback=cb, resume=resume
            )
        except Exception:
            fails += 1
            msg = "failed to download '{}'".format(from_info)
            logger.exception(msg)
            continue

        if not no_progress_bar:
            progress.finish_target(name)

    return fails
def makedirs(path, exist_ok=False, mode=None):
    """``os.makedirs`` wrapper that can force a permission ``mode``.

    Args:
        path: directory path (any os.PathLike / PathInfo accepted).
        exist_ok: do not raise if the directory already exists.
        mode: permission bits for every created directory level; None
            keeps the default process behavior.
    """
    path = fspath_py35(path)

    if mode is None:
        _makedirs(path, exist_ok=exist_ok)
        return

    # Drive the permissions through the umask: since Python 3.7 the `mode`
    # argument of `makedirs` no longer affects the file permission bits of
    # newly-created intermediate-level directories, so masking out the
    # complement of `mode` is the only way to apply it to every level.
    umask = os.umask(0o777 - mode)
    try:
        _makedirs(path, exist_ok=exist_ok)
    finally:
        os.umask(umask)
def dvc_walk(top, dvcignore, topdown=True, onerror=None, followlinks=False):
    """Proxy for `os.walk` that filters entries through a DvcIgnoreFilter.

    Yields (root, dirs, files) triples exactly like `os.walk`; when
    ``dvcignore`` is provided, ignored dirs/files are pruned in place so a
    top-down walk does not descend into them.
    """
    walker = os.walk(
        fspath_py35(top),
        topdown=topdown,
        onerror=onerror,
        followlinks=followlinks,
    )
    for root, dirs, files in walker:
        if dvcignore:
            # In-place slice assignment so os.walk sees the pruned dirs.
            dirs[:], files[:] = dvcignore(root, dirs, files)
        yield root, dirs, files
def move(src, dst, mode=None):
    """Atomically move src to dst and chmod it with mode.

    Moving is performed in two stages to make the whole operation atomic in
    case src and dst are on different filesystems and actual physical
    copying of data is happening.
    """
    src = fspath_py35(src)
    dst = os.path.abspath(fspath_py35(dst))
    staging = "{}.{}".format(dst, str(uuid()))

    if os.path.islink(src):
        # Materialize the link's target, then drop the link itself.
        shutil.copy(os.readlink(src), staging)
        os.unlink(src)
    else:
        shutil.move(src, staging)

    if mode is not None:
        os.chmod(staging, mode)

    shutil.move(staging, dst)
def protect(path_info):
    """Make a cache file read-only (0o444).

    In a shared-cache setup we may not own the file; a permission error is
    tolerated as long as the file already carries the read-only bits.
    """
    path = fspath_py35(path_info)
    readonly = stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH

    try:
        os.chmod(path, readonly)
    except OSError as exc:
        if exc.errno not in (errno.EPERM, errno.EACCES):
            raise
        # Not ours to chmod -- accept it only if already protected.
        if os.stat(path).st_mode & readonly != readonly:
            raise
def verify_metric(self):
    """Validate that a metric output is a text file.

    No-op for non-metric outputs and for paths that do not exist yet.

    Raises:
        DvcException: if the path is a directory or a binary file.
    """
    if not self.metric:
        return

    path = fspath_py35(self.path_info)
    if not os.path.exists(path):
        return

    if os.path.isdir(path):
        raise DvcException(
            "directory '{}' cannot be used as metrics.".format(self.path_info)
        )
    if not istextfile(path):
        raise DvcException(
            "binary file '{}' cannot be used as metrics.".format(self.path_info)
        )
def _get_cache_type(self, path_info):
    """Probe that the preferred cache link type works for ``path_info``.

    Creates a small test file inside the cache and tries to link it to a
    throwaway hidden name next to ``path_info``. On success the currently
    preferred cache type is confirmed (and cached) and returned; if
    ``self.link`` raises, the exception propagates after cleanup.
    """
    if self.cache_type_confirmed:
        return self.cache_types[0]

    # Throwaway hidden name next to the target, unique per probe.
    workspace_file = path_info.with_name("." + uuid())
    test_cache_file = self.path_info / ".cache_type_test_file"

    if not self.exists(test_cache_file):
        with open(fspath_py35(test_cache_file), "wb") as fobj:
            fobj.write(bytes(1))  # a single zero byte

    try:
        self.link(test_cache_file, workspace_file)
    finally:
        # NOTE(review): remove() is presumably tolerant of the workspace
        # file being absent when link() failed -- confirm.
        self.remove(workspace_file)
        self.remove(test_cache_file)

    self.cache_type_confirmed = True
    return self.cache_types[0]
def _parse_path(self, remote, path):
    """Resolve ``path`` to an absolute path-info of this output's REMOTE.

    A ``remote://...`` URL is resolved against the given remote's root;
    anything else is treated as a plain path and made absolute relative to
    the stage's working directory when needed.
    """
    parsed = urlparse(path)
    if parsed.scheme == "remote":
        info = remote.path_info / parsed.path.lstrip("/")
    else:
        # NOTE: a path can come either from the command line or from a
        # .dvc file, so expect both posix and windows style paths.
        # PathInfo accepts both, i.e. / works everywhere, \ only on win.
        #
        # FIXME: a Windows path containing / (or a posix one with \)
        # hits bug #2059 and can't really be handled here.
        info = self.REMOTE.path_cls(path)
        if not info.is_absolute():
            info = self.stage.wdir / info

    normalized = os.path.abspath(os.path.normpath(fspath_py35(info)))
    return self.REMOTE.path_cls(normalized)
def file_md5(fname):
    """Get the (md5 hexdigest, md5 digest) of a file.

    Text files are passed through dos2unix per chunk before hashing;
    binary files are hashed as-is.

    Args:
        fname: path to the file (any os.PathLike accepted).

    Returns:
        tuple: (hexdigest, digest) of the contents, or (None, None) when
        the file does not exist.
    """
    from dvc.progress import Tqdm
    from dvc.istextfile import istextfile

    fname = fspath_py35(fname)

    if not os.path.exists(fname):
        return (None, None)

    hash_md5 = hashlib.md5()
    binary = not istextfile(fname)
    size = os.path.getsize(fname)
    name = relpath(fname)  # Fix: computed once, reused for log and bar.

    no_progress_bar = True
    if size >= LARGE_FILE_SIZE:
        no_progress_bar = False
        msg = "Computing md5 for a large file '{}'. This is only done once."
        logger.info(msg.format(name))

    # NOTE(review): dos2unix is applied per chunk, so a CRLF pair split
    # across a chunk boundary is not normalized -- confirm whether this
    # is acceptable before relying on chunk-size invariance for text.
    with Tqdm(
        desc=name,
        disable=no_progress_bar,
        total=size,
        bytes=True,
        leave=False,
    ) as pbar:
        with open(fname, "rb") as fobj:
            while True:
                data = fobj.read(LOCAL_CHUNK_SIZE)
                if not data:
                    break
                chunk = data if binary else dos2unix(data)
                hash_md5.update(chunk)
                pbar.update(len(data))

    return (hash_md5.hexdigest(), hash_md5.digest())
def _open_cached(self, out, remote=None, mode="r", encoding=None):
    """Open an output's content via its local cache file.

    Falls back to streaming directly from the remote, and as a last
    resort pulls the cache entry and opens it locally.

    Raises:
        ValueError: if the output is a directory.
        OutputFileMissingError: if the file is still missing after pull.
    """
    from dvc.exceptions import OutputFileMissingError

    if out.isdir():
        raise ValueError("Can't open a dir")

    cache_file = self.cache.local.checksum_to_path_info(out.checksum)
    cache_file = fspath_py35(cache_file)

    if os.path.exists(cache_file):
        return _open(cache_file, mode=mode, encoding=encoding)

    try:
        remote_obj = self.cloud.get_remote(remote)
        remote_info = remote_obj.checksum_to_path_info(out.checksum)
        return remote_obj.open(remote_info, mode=mode, encoding=encoding)
    except RemoteActionNotImplemented:
        with self.state:
            cache_info = out.get_used_cache(remote=remote)
            self.cloud.pull(cache_info, remote=remote)

        # Fix: pull may just skip with a warning, so verify the cache file
        # actually materialized instead of failing later with a raw
        # FileNotFoundError (consistent with Repo.open).
        if not os.path.exists(cache_file):
            raise OutputFileMissingError(str(out))

        return _open(cache_file, mode=mode, encoding=encoding)
def open(self, path_info, mode="r", encoding=None):
    """Open a local file for reading; only read modes are supported."""
    assert mode in {"r", "rt", "rb"}

    local_path = fspath_py35(path_info)
    return open(local_path, mode=mode, encoding=encoding)
def get_file_checksum(self, path_info):
    """Return the md5 hexdigest of the file at ``path_info``."""
    hexdigest, _ = file_md5(fspath_py35(path_info))
    return hexdigest