def _upload_fobj(self, fobj, to_info):
    self.makedirs(to_info.parent)
    # Write into a temp file first, then rename, so the destination
    # appears atomically and never in a half-written state.
    tmp_info = to_info.parent / tmp_fname("")
    try:
        copy_fobj_to_file(fobj, tmp_info)
        os.rename(tmp_info, to_info)
    except Exception:
        self.remove(tmp_info)
        raise
def copy(self, from_info, to_info):
    tmp_info = to_info.parent / tmp_fname(to_info.name)
    try:
        System.copy(from_info, tmp_info)
        os.chmod(tmp_info, self.file_mode)
        os.rename(tmp_info, to_info)
    except Exception:
        self.remove(tmp_info)
        raise
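# The two helpers above share the write-to-temp-then-rename pattern used
# throughout this section. A minimal standalone sketch of that pattern,
# using only the standard library (`atomic_copy` is a hypothetical name,
# not part of the source):
import os
import shutil
import uuid


def atomic_copy(src, dest):
    # Create the temp file next to the destination: os.replace is only
    # atomic when both paths live on the same filesystem.
    tmp = "{}.{}.tmp".format(dest, uuid.uuid4().hex)
    try:
        shutil.copyfile(src, tmp)
        os.replace(tmp, dest)  # atomically swap into place
    except Exception:
        # Never leave a half-written temp file behind.
        if os.path.exists(tmp):
            os.unlink(tmp)
        raise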
def __init__(self, **config):
    from fsspec.utils import infer_storage_options

    super().__init__(**config)

    self.url = config["url"]
    opts = infer_storage_options(self.url)
    if not opts["host"]:
        raise DvcException(
            "Empty GDrive URL '{}'. Learn more at {}".format(
                config["url"],
                format_link("https://man.dvc.org/remote/add"),
            )
        )

    self._bucket = opts["host"]
    self._path = opts["path"].lstrip("/")
    self._trash_only = config.get("gdrive_trash_only")
    self._use_service_account = config.get("gdrive_use_service_account")
    self._service_account_user_email = config.get(
        "gdrive_service_account_user_email"
    )
    self._service_account_json_file_path = config.get(
        "gdrive_service_account_json_file_path"
    )
    self._client_id = config.get("gdrive_client_id")
    self._client_secret = config.get("gdrive_client_secret")
    self._validate_config()

    tmp_dir = config["gdrive_credentials_tmp_dir"]
    assert tmp_dir

    self._gdrive_service_credentials_path = tmp_fname(
        os.path.join(tmp_dir, "")
    )
    self._gdrive_user_credentials_path = (
        tmp_fname(os.path.join(tmp_dir, ""))
        if os.getenv(GDriveFileSystem.GDRIVE_CREDENTIALS_DATA)
        else config.get(
            "gdrive_user_credentials_file",
            os.path.join(tmp_dir, self.DEFAULT_USER_CREDENTIALS_FILE),
        )
    )
def download(
    self,
    from_infos,
    to_infos,
    no_progress_bar=False,
    names=None,
    resume=False,
):
    names = self._verify_path_args(from_infos, to_infos, names)

    s3 = self.s3

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info.scheme != "s3":
            raise NotImplementedError

        if to_info.scheme == "s3":
            self.copy(from_info, to_info, s3=s3)
            continue

        if to_info.scheme != "local":
            raise NotImplementedError

        msg = "Downloading '{}/{}' to '{}'".format(
            from_info.bucket, from_info.path, to_info.path
        )
        logger.debug(msg)

        tmp_file = tmp_fname(to_info.path)
        if not name:
            name = os.path.basename(to_info.path)

        makedirs(os.path.dirname(to_info.path), exist_ok=True)

        try:
            if no_progress_bar:
                cb = None
            else:
                total = s3.head_object(
                    Bucket=from_info.bucket, Key=from_info.path
                )["ContentLength"]
                cb = Callback(name, total)

            s3.download_file(
                from_info.bucket, from_info.path, tmp_file, Callback=cb
            )
        except Exception:
            msg = "failed to download '{}/{}'".format(
                from_info.bucket, from_info.path
            )
            logger.exception(msg)
            continue

        move(tmp_file, to_info.path)

        if not no_progress_bar:
            progress.finish_target(name)
def download(
    self,
    from_infos,
    to_infos,
    no_progress_bar=False,
    names=None,
    resume=False,
):
    names = self._verify_path_args(from_infos, to_infos, names)

    gs = self.gs

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info["scheme"] != "gs":
            raise NotImplementedError

        if to_info["scheme"] == "gs":
            self.copy(from_info, to_info, gs=gs)
            continue

        if to_info["scheme"] != "local":
            raise NotImplementedError

        msg = "Downloading '{}/{}' to '{}'".format(
            from_info["bucket"], from_info["path"], to_info["path"]
        )
        logger.debug(msg)

        tmp_file = tmp_fname(to_info["path"])
        if not name:
            name = os.path.basename(to_info["path"])

        if not no_progress_bar:
            # percent_cb is not available for download_to_filename, so
            # let's at least update progress at checkpoints (start, finish)
            progress.update_target(name, 0, None)

        makedirs(os.path.dirname(to_info["path"]), exist_ok=True)

        try:
            bucket = gs.bucket(from_info["bucket"])
            blob = bucket.get_blob(from_info["path"])
            blob.download_to_filename(tmp_file)
        except Exception:
            msg = "failed to download '{}/{}' to '{}'"
            logger.exception(
                msg.format(
                    from_info["bucket"], from_info["path"], to_info["path"]
                )
            )
            continue

        move(tmp_file, to_info["path"])

        if not no_progress_bar:
            progress.finish_target(name)
def copy_fobj(self, fobj, to_info):
    self.makedirs(to_info.parent)
    tmp_info = to_info.parent / tmp_fname(to_info.name)
    try:
        copy_fobj_to_file(fobj, tmp_info)
        os.chmod(tmp_info, self.file_mode)
        os.rename(tmp_info, to_info)
    except Exception:
        self.remove(tmp_info)
        raise
def put_file(self, from_file, to_info, callback=DEFAULT_CALLBACK, **kwargs):
    parent = self.path.parent(to_info)
    makedirs(parent, exist_ok=True)
    tmp_file = self.path.join(parent, tmp_fname())
    copyfile(from_file, tmp_file, callback=callback)
    os.replace(tmp_file, to_info)
def _upload(
    self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs
):
    makedirs(to_info.parent, exist_ok=True)

    tmp_file = tmp_fname(to_info)
    copyfile(from_file, tmp_file, name=name, no_progress_bar=no_progress_bar)

    os.rename(tmp_file, fspath_py35(to_info))
def upload(self, src, dest, no_progress_bar=False, progress_title=None):
    self.makedirs(posixpath.dirname(dest))
    tmp_file = tmp_fname(dest)

    if not progress_title:
        progress_title = posixpath.basename(dest)

    with Tqdm(
        desc=progress_title, disable=no_progress_bar, bytes=True
    ) as pbar:
        self.sftp.put(src, tmp_file, callback=pbar.update_to)

    self.sftp.rename(tmp_file, dest)
def client(self):
    import pydata_google_auth
    from gdrivefs import GoogleDriveFileSystem

    tmp_path = tmp_fname()
    with open(tmp_path, "w") as stream:
        stream.write(os.getenv(GDriveFileSystem.GDRIVE_CREDENTIALS_DATA))

    GoogleDriveFileSystem._connect_cache = partial(
        pydata_google_auth.load_user_credentials, tmp_path
    )

    return GoogleDriveFileSystem(token="cache")
def _get_dir_info_checksum(self, dir_info):
    tmp = tempfile.NamedTemporaryFile(delete=False).name
    with open(tmp, "w+") as fobj:
        json.dump(dir_info, fobj, sort_keys=True)

    from_info = PathInfo(tmp)
    to_info = self.cache.path_info / tmp_fname("")

    self.cache.upload(from_info, to_info, no_progress_bar=True)

    checksum = self.get_file_checksum(to_info) + self.CHECKSUM_DIR_SUFFIX
    return checksum, to_info
def __init__(self, **config):
    super().__init__(**config)

    self.path_info = self.PATH_CLS(config["url"])

    if not self.path_info.bucket:
        raise DvcException(
            "Empty GDrive URL '{}'. Learn more at {}".format(
                config["url"],
                format_link("https://man.dvc.org/remote/add"),
            )
        )

    self._bucket = self.path_info.bucket
    self._path = self.path_info.path
    self._trash_only = config.get("gdrive_trash_only")
    self._use_service_account = config.get("gdrive_use_service_account")
    self._service_account_user_email = config.get(
        "gdrive_service_account_user_email"
    )
    self._service_account_json_file_path = config.get(
        "gdrive_service_account_json_file_path"
    )
    self._client_id = config.get("gdrive_client_id")
    self._client_secret = config.get("gdrive_client_secret")
    self._validate_config()

    tmp_dir = config["gdrive_credentials_tmp_dir"]
    assert tmp_dir

    self._gdrive_service_credentials_path = tmp_fname(
        os.path.join(tmp_dir, "")
    )
    self._gdrive_user_credentials_path = (
        tmp_fname(os.path.join(tmp_dir, ""))
        if os.getenv(GDriveFileSystem.GDRIVE_CREDENTIALS_DATA)
        else config.get(
            "gdrive_user_credentials_file",
            os.path.join(tmp_dir, self.DEFAULT_USER_CREDENTIALS_FILE),
        )
    )
def _upload_file(path_info, fs, odb, upload_odb):
    from dvc.utils import tmp_fname
    from dvc.utils.stream import HashedStreamReader

    tmp_info = upload_odb.path_info / tmp_fname()
    with fs.open(path_info, mode="rb", chunk_size=fs.CHUNK_SIZE) as stream:
        # Wrap the stream so the hash is computed while uploading,
        # avoiding a second pass over the data.
        stream = HashedStreamReader(stream)
        size = fs.getsize(path_info)
        upload_odb.fs.upload(stream, tmp_info, desc=path_info.name, total=size)

    odb.add(tmp_info, upload_odb.fs, stream.hash_info)
    return path_info, odb.get(stream.hash_info)
def copy(self, from_info, to_info, **_kwargs):
    with self.hdfs(to_info) as hdfs:
        # NOTE: this is how `hadoop fs -cp` works too: it copies through
        # your local machine.
        with hdfs.open(from_info.path, "rb") as from_fobj:
            tmp_info = to_info.parent / tmp_fname(to_info.name)
            try:
                with hdfs.open(tmp_info.path, "wb") as tmp_fobj:
                    tmp_fobj.upload(from_fobj)
                hdfs.rename(tmp_info.path, to_info.path)
            except Exception:
                self.remove(tmp_info)
                raise
def _upload(self, from_file, to_info, **_kwargs):
    self.hadoop_fs(
        "mkdir -p {}".format(to_info.parent.url), user=to_info.user
    )

    tmp_file = tmp_fname(to_info.url)

    self.hadoop_fs(
        "copyFromLocal {} {}".format(from_file, tmp_file),
        user=to_info.user,
    )

    self.hadoop_fs(
        "mv {} {}".format(tmp_file, to_info.url), user=to_info.user
    )
def _download_file(self, from_info, to_info, name, no_progress_bar):
    makedirs(to_info.parent, exist_ok=True)

    logger.debug("Downloading '%s' to '%s'", from_info, to_info)
    name = name or to_info.name

    tmp_file = tmp_fname(to_info)

    self._download(  # noqa, pylint: disable=no-member
        from_info, tmp_file, name=name, no_progress_bar=no_progress_bar
    )

    move(tmp_file, to_info)
def upload(self, src, dest, no_progress_bar=False, progress_title=None):
    self.makedirs(posixpath.dirname(dest))
    tmp_file = tmp_fname(dest)

    if no_progress_bar:
        self.sftp.put(src, tmp_file)
    else:
        if not progress_title:
            progress_title = posixpath.basename(dest)

        self.sftp.put(src, tmp_file, callback=create_cb(progress_title))
        progress.finish_target(progress_title)

    self.sftp.rename(tmp_file, dest)
def _upload_file(path_info, fs, odb):
    from dvc.utils import tmp_fname
    from dvc.utils.stream import HashedStreamReader

    tmp_info = odb.path_info / tmp_fname()
    with fs.open(path_info, mode="rb", chunk_size=fs.CHUNK_SIZE) as stream:
        stream = HashedStreamReader(stream)
        odb.fs.upload_fobj(
            stream, tmp_info, desc=path_info.name, size=fs.getsize(path_info)
        )

    obj = HashFile(tmp_info, odb.fs, stream.hash_info)
    return path_info, obj
def download(
    self,
    from_info,
    to_info,
    name=None,
    no_progress_bar=False,
    file_mode=None,
    dir_mode=None,
):
    if not hasattr(self, "_download"):
        raise RemoteActionNotImplemented("download", self.scheme)

    if from_info.scheme != self.scheme:
        raise NotImplementedError

    if to_info.scheme == self.scheme != "local":
        self.copy(from_info, to_info)
        return 0

    if to_info.scheme != "local":
        raise NotImplementedError

    logger.debug("Downloading '{}' to '{}'".format(from_info, to_info))

    name = name or to_info.name

    if not no_progress_bar:
        # real progress is not always available,
        # lets at least show start and finish
        progress.update_target(name, 0, None)

    makedirs(to_info.parent, exist_ok=True, mode=dir_mode)
    tmp_file = tmp_fname(to_info)

    try:
        self._download(
            from_info, tmp_file, name=name, no_progress_bar=no_progress_bar
        )
    except Exception:
        msg = "failed to download '{}' to '{}'"
        logger.exception(msg.format(from_info, to_info))
        return 1  # 1 fail

    move(tmp_file, to_info, mode=file_mode)

    if not no_progress_bar:
        progress.finish_target(name)

    return 0
def init_drive(self): self.client_id = self.config.get(Config.SECTION_GDRIVE_CLIENT_ID, None) self.client_secret = self.config.get( Config.SECTION_GDRIVE_CLIENT_SECRET, None) if not self.client_id or not self.client_secret: raise DvcException("Please specify Google Drive's client id and " "secret in DVC's config. Learn more at " "https://man.dvc.org/remote/add.") self.gdrive_user_credentials_path = ( tmp_fname(".dvc/tmp/") if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA) else self.config.get( Config.SECTION_GDRIVE_USER_CREDENTIALS_FILE, self.DEFAULT_USER_CREDENTIALS_FILE, ))
def copy(self, from_info, to_info, **_kwargs):
    # NOTE: hdfs.copy_file is not supported yet in pyarrow
    with self.hdfs(to_info) as hdfs:
        # NOTE: this is how `hadoop fs -cp` works too: it copies through
        # your local machine.
        with closing(hdfs.open_input_stream(from_info.path)) as from_fobj:
            tmp_info = to_info.parent / tmp_fname(to_info.name)
            try:
                with closing(
                    hdfs.open_output_stream(tmp_info.path)
                ) as tmp_fobj:
                    shutil.copyfileobj(from_fobj, tmp_fobj)
                hdfs.move(tmp_info.path, to_info.path)
            except Exception:
                self.remove(tmp_info)
                raise
def _get_dir_info_hash(self, dir_info):
    # Sorting the list by path to ensure reproducibility
    dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

    tmp = tempfile.NamedTemporaryFile(delete=False).name
    with open(tmp, "w+") as fobj:
        json.dump(dir_info, fobj, sort_keys=True)

    tree = self.cache.tree

    from_info = PathInfo(tmp)
    to_info = tree.path_info / tmp_fname("")

    tree.upload(from_info, to_info, no_progress_bar=True)

    typ, hash_ = tree.get_file_hash(to_info)
    return typ, hash_ + self.CHECKSUM_DIR_SUFFIX, to_info
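# Hedged sketch of the directory-hash idea behind _get_dir_info_hash above
# (illustrative only; the function and entry names are assumptions, and the
# real code hashes the uploaded file via the remote's hashing, though DVC's
# CHECKSUM_DIR_SUFFIX is ".dir"): serialize the listing deterministically,
# hash it like a regular file, and tag the result so directory hashes are
# distinguishable from plain file hashes.
import hashlib
import json


def dir_listing_hash(entries):
    # entries: list of {"relpath": ..., "md5": ...} dicts
    payload = json.dumps(
        sorted(entries, key=lambda e: e["relpath"]), sort_keys=True
    ).encode()
    return hashlib.md5(payload).hexdigest() + ".dir"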
def _download_file(
    self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode
):
    makedirs(to_info.parent, exist_ok=True, mode=dir_mode)

    logger.debug("Downloading '%s' to '%s'", from_info, to_info)
    name = name or to_info.name

    tmp_file = tmp_fname(to_info)

    self._download(
        from_info, tmp_file, name=name, no_progress_bar=no_progress_bar
    )

    move(tmp_file, to_info, mode=file_mode)
def _get_dir_info_checksum(self, dir_info, path_info):
    to_info = copy(path_info)
    to_info.path = self.cache.ospath.join(self.cache.prefix, tmp_fname(""))

    tmp = tempfile.NamedTemporaryFile(delete=False).name
    with open(tmp, "w+") as fobj:
        json.dump(dir_info, fobj, sort_keys=True)

    from_info = PathLOCAL(path=tmp)
    self.cache.upload([from_info], [to_info], no_progress_bar=True)

    checksum = self.get_file_checksum(to_info) + self.CHECKSUM_DIR_SUFFIX
    from_info = copy(to_info)
    to_info.path = self.cache.checksum_to_path(checksum)
    return checksum, from_info, to_info
def digest(self):
    from dvc.fs.memory import MemoryFileSystem
    from dvc.path_info import PathInfo
    from dvc.utils import tmp_fname

    memfs = MemoryFileSystem()
    path_info = PathInfo(tmp_fname(""))
    with memfs.open(path_info, "wb") as fobj:
        fobj.write(self.as_bytes())

    self.fs = memfs
    self.path_info = path_info

    self.hash_info = get_file_hash(path_info, memfs, "md5")
    self.hash_info.value += ".dir"
    self.hash_info.size = self.size
    self.hash_info.nfiles = len(self)
from contextlib import contextmanager, suppress


@contextmanager
def as_atomic(fs, to_info):
    from dvc.utils import tmp_fname

    tmp_info = fs.path.join(fs.path.parent(to_info), tmp_fname())
    try:
        yield tmp_info
    except BaseException:
        # Handle stuff like KeyboardInterrupt
        # as well as other errors that might
        # arise during file transfer.
        with suppress(FileNotFoundError):
            fs.remove(tmp_info)
        raise
    else:
        fs.move(tmp_info, to_info)
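# Hedged usage sketch (not from the source): with the decorator in place,
# callers stage writes through the yielded temporary path and rely on the
# `else` branch to move the file into place only on success. `fs` and the
# paths below are hypothetical.
#
#     with as_atomic(fs, "remote/dir/data.bin") as tmp_path:
#         fs.upload("local/data.bin", tmp_path)
#     # success: temp file moved to "remote/dir/data.bin"
#     # failure (incl. KeyboardInterrupt): temp file removed, error re-raised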
def digest(self, hash_info: Optional["HashInfo"] = None):
    from dvc.fs.memory import MemoryFileSystem
    from dvc.utils import tmp_fname

    memfs = MemoryFileSystem()
    fs_path = "memory://{}".format(tmp_fname(""))
    with memfs.open(fs_path, "wb") as fobj:
        fobj.write(self.as_bytes())

    self.fs = memfs
    self.fs_path = fs_path

    if hash_info:
        self.hash_info = hash_info
    else:
        _, self.hash_info = get_file_hash(fs_path, memfs, "md5")
        assert self.hash_info.value
        self.hash_info.value += ".dir"
def _upload_file(from_fs_path, fs, odb, upload_odb):
    from dvc.utils import tmp_fname
    from dvc.utils.stream import HashedStreamReader

    fs_path = upload_odb.fs.path
    tmp_info = fs_path.join(upload_odb.fs_path, tmp_fname())
    with fs.open(from_fs_path, mode="rb", chunk_size=fs.CHUNK_SIZE) as stream:
        stream = HashedStreamReader(stream)
        size = fs.getsize(from_fs_path)
        upload_odb.fs.upload(
            stream, tmp_info, desc=fs_path.name(from_fs_path), total=size
        )

    odb.add(tmp_info, upload_odb.fs, stream.hash_info)
    meta = Meta(size=size)
    return from_fs_path, meta, odb.get(stream.hash_info)
def download(
    self,
    from_infos,
    to_infos,
    no_progress_bar=False,
    names=None,
    resume=False,
):
    names = self._verify_path_args(from_infos, to_infos, names)

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info["scheme"] != self.scheme:
            raise NotImplementedError

        if to_info["scheme"] != "local":
            raise NotImplementedError

        bucket = from_info["bucket"]
        path = from_info["path"]

        logger.debug(
            "Downloading '{}/{}' to '{}'".format(
                bucket, path, to_info["path"]
            )
        )

        tmp_file = tmp_fname(to_info["path"])
        if not name:
            name = os.path.basename(to_info["path"])

        cb = None if no_progress_bar else Callback(name)

        makedirs(os.path.dirname(to_info["path"]), exist_ok=True)

        try:
            self.blob_service.get_blob_to_path(
                bucket, path, tmp_file, progress_callback=cb
            )
        except Exception:
            msg = "failed to download '{}/{}'".format(bucket, path)
            logger.warning(msg)
        else:
            move(tmp_file, to_info["path"])

            if not no_progress_bar:
                progress.finish_target(name)
def init_drive(self): self.client_id = self.config.get(Config.SECTION_GDRIVE_CLIENT_ID, None) self.client_secret = self.config.get( Config.SECTION_GDRIVE_CLIENT_SECRET, None) if not self.client_id or not self.client_secret: raise DvcException( "Please specify Google Drive's client id and " "secret in DVC's config. Learn more at " "{}.".format(format_link("https://man.dvc.org/remote/add"))) self.gdrive_user_credentials_path = ( tmp_fname(os.path.join(self.repo.tmp_dir, "")) if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA) else self.config.get( Config.SECTION_GDRIVE_USER_CREDENTIALS_FILE, os.path.join(self.repo.tmp_dir, self.DEFAULT_USER_CREDENTIALS_FILE), ))