def download(
    self,
    from_infos,
    to_infos,
    names=None,
    no_progress_bar=False,
    resume=False,
):
    """Download files from the OSS remote into local paths.

    Args:
        from_infos: remote path-info objects; must match ``self.scheme``.
        to_infos: local path-info objects the files are written to.
        names: optional progress-target names, normalized by
            ``_verify_path_args``.
        no_progress_bar: when True, no progress callback is created.
        resume: accepted for interface compatibility; not used here.
    """
    names = self._verify_path_args(from_infos, to_infos, names)

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info.scheme != self.scheme:
            raise NotImplementedError

        if to_info.scheme != "local":
            raise NotImplementedError

        logger.debug(
            "Downloading '{}' to '{}'".format(from_info, to_info)
        )

        # Download into a temporary file first so a failed transfer
        # never leaves a partial file at the destination path.
        tmp_file = tmp_fname(to_info)
        if not name:
            name = to_info.name

        cb = None if no_progress_bar else Callback(name)

        makedirs(fspath_py35(to_info.parent), exist_ok=True)

        try:
            self.oss_service.get_object_to_file(
                from_info.path, tmp_file, progress_callback=cb
            )
        except Exception:
            # Best-effort semantics: log the failure and keep going
            # with the remaining files.
            logger.warning("failed to download '{}'".format(from_info))
        else:
            # Promote the temp file only on success.
            move(tmp_file, fspath_py35(to_info))
        finally:
            if not no_progress_bar:
                progress.finish_target(name)
def upload(self, from_infos, to_infos, names=None):
    """Upload local files to the S3 remote, one progress target per file."""
    names = self._verify_path_args(to_infos, from_infos, names)
    client = self.s3

    for src, dst, target in zip(from_infos, to_infos, names):
        if dst['scheme'] != 's3':
            raise NotImplementedError
        if src['scheme'] != 'local':
            raise NotImplementedError

        Logger.debug("Uploading '{}' to '{}/{}'".format(
            src['path'], dst['bucket'], dst['key']))

        target = target or os.path.basename(src['path'])
        callback = Callback(target, os.path.getsize(src['path']))

        try:
            client.upload_file(src['path'], dst['bucket'], dst['key'],
                               Callback=callback)
        except Exception as exc:
            Logger.error("Failed to upload '{}'".format(src['path']), exc)
            continue

        progress.finish_target(target)
def upload(self, paths, path_infos, names=None):
    """Upload local files to GCS blobs."""
    names = self._verify_path_args(path_infos, paths, names)
    client = self.gs

    for local_path, info, target in zip(paths, path_infos, names):
        if info['scheme'] != 'gs':
            raise NotImplementedError

        Logger.debug("Uploading '{}' to '{}/{}'".format(
            local_path, info['bucket'], info['key']))

        target = target or os.path.basename(local_path)

        # This API exposes no per-byte callback; mark the target started.
        progress.update_target(target, 0, None)

        try:
            blob = client.bucket(info['bucket']).blob(info['key'])
            blob.upload_from_filename(local_path)
        except Exception as exc:
            Logger.error(
                "Failed to upload '{}' to '{}/{}'".format(
                    local_path, info['bucket'], info['key']), exc)
            continue

        progress.finish_target(target)
def status(self, checksum_infos, remote, jobs=None, show_checksums=False):
    """Group checksums and mark each entry's local/remote presence."""
    logger.info("Preparing to collect status from {}".format(remote.url))

    title = "Collecting information"
    progress.set_n_total(1)
    progress.update_target(title, 0, 100)
    progress.update_target(title, 10, 100)

    grouped = self._group(checksum_infos, show_checksums=show_checksums)
    checksums = list(grouped.keys())

    progress.update_target(title, 30, 100)
    on_remote = list(remote.cache_exists(checksums))

    progress.update_target(title, 90, 100)
    on_local = self.cache_exists(checksums)

    progress.finish_target(title)

    for md5, info in grouped.items():
        presence = (md5 in on_local, md5 in on_remote)
        info["status"] = STATUS_MAP[presence]

    return grouped
def copyfile(src, dest):
    '''Copy file with progress bar.

    Args:
        src: path of the file to copy.
        dest: destination file path, or a directory (the file is then
            copied into it under the source's basename).
    '''
    copied = 0
    name = os.path.basename(src)
    total = os.stat(src).st_size

    # When dest is a directory, copy into it under the source's basename.
    if os.path.isdir(dest):
        dest = os.path.join(dest, name)

    # Context managers guarantee both handles are closed even if a
    # read/write raises; the original open()/close() pair leaked the
    # descriptors on any exception in the copy loop.
    with open(src, 'rb') as fsrc, open(dest, 'wb+') as fdest:
        while True:
            buf = fsrc.read(LOCAL_CHUNK_SIZE)
            if not buf:
                break
            fdest.write(buf)
            copied += len(buf)
            progress.update_target(name, copied, total)

    progress.finish_target(name)
def upload(self, from_infos, to_infos, names=None):
    """Upload local files to the S3 remote."""
    names = self._verify_path_args(to_infos, from_infos, names)
    client = self.s3

    for src, dst, target in zip(from_infos, to_infos, names):
        if dst["scheme"] != "s3":
            raise NotImplementedError
        if src["scheme"] != "local":
            raise NotImplementedError

        logger.debug("Uploading '{}' to '{}/{}'".format(
            src["path"], dst["bucket"], dst["path"]))

        target = target or os.path.basename(src["path"])
        callback = Callback(target, os.path.getsize(src["path"]))

        try:
            client.upload_file(
                src["path"],
                dst["bucket"],
                dst["path"],
                Callback=callback,
            )
        except Exception:
            logger.error("failed to upload '{}'".format(src["path"]))
            continue

        progress.finish_target(target)
def upload(self, from_infos, to_infos, names=None):
    """Upload local files to the Azure remote."""
    names = self._verify_path_args(to_infos, from_infos, names)

    for src, dst, target in zip(from_infos, to_infos, names):
        if dst["scheme"] != self.scheme:
            raise NotImplementedError
        if src["scheme"] != "local":
            raise NotImplementedError

        container, blob_path = dst["bucket"], dst["path"]
        logger.debug(
            "Uploading '{}' to '{}/{}'".format(
                src["path"], container, blob_path
            )
        )

        target = target or os.path.basename(src["path"])

        try:
            self.blob_service.create_blob_from_path(
                container,
                blob_path,
                src["path"],
                progress_callback=Callback(target),
            )
        except Exception:
            logger.warning("failed to upload '{}'".format(src["path"]))
        else:
            # Only mark the target finished on a successful upload.
            progress.finish_target(target)
def push(self, data_item):
    """ push, aws version

    Upload data_item's cache file to the S3 bucket, skipping the
    upload when the remote key already exists with a matching
    checksum. Returns data_item on success, None on failure.
    """
    # Bucket key is derived from the cache file's path/checksum.
    aws_key = self.cache_file_key(data_item.resolved_cache.dvc)
    bucket = self._get_bucket_aws(self.storage_bucket)

    key = bucket.get_key(aws_key)
    if key:
        Logger.debug(
            'File already uploaded to the cloud. Checksum validation...')

        # Remote copy already valid: nothing to do.
        if self._cmp_checksum(key, data_item.resolved_cache.dvc):
            Logger.debug('File checksum matches. No uploading is needed.')
            return data_item

        Logger.debug('Checksum miss-match. Re-uploading is required.')

    key = bucket.new_key(aws_key)

    try:
        self._push_multipart(key, data_item.resolved_cache.relative)
    except Exception as exc:
        Logger.error('Failed to upload "{}": {}'.format(
            data_item.resolved_cache.relative, exc))
        return None

    # The multipart helper drives the progress bar; close it out here.
    progress.finish_target(
        os.path.basename(data_item.resolved_cache.relative))

    return data_item
def _import(self, bucket_name, key, fname, data_item):
    """Fetch one cache blob from GCS into fname.

    Returns data_item on success (or when the local file already
    matches the remote checksum), None on failure.
    """
    bucket = self._get_bucket_gc(bucket_name)
    target = os.path.basename(fname)
    tmp_file = self.tmp_file(fname)

    blob = bucket.get_blob(key)
    if not blob:
        Logger.error('File "{}" does not exist in the cloud'.format(key))
        return None

    if self._cmp_checksum(blob, fname):
        Logger.debug('File "{}" matches with "{}".'.format(fname, key))
        return data_item

    Logger.debug('Downloading cache file from gc "{}/{}"'.format(
        bucket.name, key))

    # percent_cb is not available for download_to_filename, so
    # lets at least update progress at keypoints(start, finish)
    progress.update_target(target, 0, None)

    try:
        blob.download_to_filename(tmp_file)
        os.rename(tmp_file, fname)
    except Exception as exc:
        Logger.error('Failed to download "{}": {}'.format(key, exc))
        return None

    progress.finish_target(target)
    Logger.debug('Downloading completed')

    return data_item
def upload(self, from_infos, to_infos, names=None):
    """Upload local files to GCS."""
    names = self._verify_path_args(to_infos, from_infos, names)
    client = self.gs

    for src, dst, target in zip(from_infos, to_infos, names):
        if dst['scheme'] != 'gs':
            raise NotImplementedError
        if src['scheme'] != 'local':
            raise NotImplementedError

        Logger.debug("Uploading '{}' to '{}/{}'".format(src['path'],
                                                        dst['bucket'],
                                                        dst['key']))

        target = target or os.path.basename(src['path'])

        # No byte-level callback with this API; just mark the start.
        progress.update_target(target, 0, None)

        try:
            blob = client.bucket(dst['bucket']).blob(dst['key'])
            blob.upload_from_filename(src['path'])
        except Exception as exc:
            msg = "Failed to upload '{}' to '{}/{}'"
            Logger.warn(msg.format(src['path'],
                                   dst['bucket'],
                                   dst['key']), exc)
            continue

        progress.finish_target(target)
def _import(self, bucket_name, key_name, fname, data_item):
    """Download one cache file from S3 into ``fname``.

    Returns ``data_item`` on success (or when the local file already
    matches the remote checksum), ``None`` on any failure.
    """
    bucket = self._get_bucket_aws(bucket_name)

    # Download into a temp file; rename into place only on success.
    tmp_file = self.tmp_file(fname)
    name = os.path.basename(fname)

    key = bucket.get_key(key_name)
    if not key:
        Logger.error(
            'File "{}" does not exist in the cloud'.format(key_name))
        return None

    # Skip the transfer entirely when the local copy is already valid.
    if self._cmp_checksum(key, fname):
        Logger.debug('File "{}" matches with "{}".'.format(
            fname, key_name))
        return data_item

    Logger.debug('Downloading cache file from S3 "{}/{}" to "{}"'.format(
        bucket.name, key_name, fname))

    # Resumable handler lets an interrupted download continue where it
    # left off, tracked through a sidecar file tied to the temp file.
    res_h = ResumableDownloadHandler(
        tracker_file_name=self._download_tracker(tmp_file),
        num_retries=10)

    try:
        key.get_contents_to_filename(tmp_file,
                                     cb=create_cb(name),
                                     res_download_handler=res_h)
        os.rename(tmp_file, fname)
    except Exception as exc:
        Logger.error('Failed to download "{}": {}'.format(key_name, exc))
        return None

    progress.finish_target(name)
    Logger.debug('Downloading completed')

    return data_item
def upload(self, from_infos, to_infos, names=None, no_progress_bar=False):
    """Upload local files to GCS."""
    names = self._verify_path_args(to_infos, from_infos, names)
    client = self.gs

    for src, dst, target in zip(from_infos, to_infos, names):
        if dst.scheme != "gs":
            raise NotImplementedError
        if src.scheme != "local":
            raise NotImplementedError

        logger.debug("Uploading '{}' to '{}'".format(src, dst))

        target = target or src.name

        if not no_progress_bar:
            progress.update_target(target, 0, None)

        try:
            blob = client.bucket(dst.bucket).blob(dst.path)
            blob.upload_from_filename(src.fspath)
        except Exception:
            logger.exception(
                "failed to upload '{}' to '{}'".format(src, dst)
            )
            continue

        progress.finish_target(target)
def download(self, path_infos, fnames, no_progress_bar=False, names=None):
    """Download blobs from the Azure remote into local files."""
    names = self._verify_path_args(path_infos, fnames, names)

    for fname, info, target in zip(fnames, path_infos, names):
        if info['scheme'] != self.scheme:
            raise NotImplementedError

        container, blob_path = info['bucket'], info['key']
        Logger.debug("Downloading '{}/{}' to '{}'".format(
            container, blob_path, fname))

        # Write to a temp file and rename into place on success.
        tmp_file = self.tmp_file(fname)
        target = target or os.path.basename(fname)
        callback = None if no_progress_bar else Callback(target)

        self._makedirs(fname)

        try:
            self.blob_service.get_blob_to_path(
                container, blob_path, tmp_file,
                progress_callback=callback)
        except Exception as exc:
            Logger.error("Failed to download '{}/{}'".format(
                container, blob_path), exc)
        else:
            os.rename(tmp_file, fname)

            if not no_progress_bar:
                progress.finish_target(target)
def upload(self, paths, path_infos, names=None):
    """Upload local files to the Azure remote."""
    names = self._verify_path_args(path_infos, paths, names)

    for local_path, info, target in zip(paths, path_infos, names):
        if info['scheme'] != self.scheme:
            raise NotImplementedError

        container, blob_path = info['bucket'], info['key']
        Logger.debug("Uploading '{}' to '{}/{}'".format(
            local_path, container, blob_path))

        target = target or os.path.basename(local_path)

        try:
            self.blob_service.create_blob_from_path(
                container, blob_path, local_path,
                progress_callback=Callback(target))
        except Exception as ex:
            Logger.error("Failed to upload '{}'".format(local_path), ex)
        else:
            progress.finish_target(target)
def status(self, checksum_infos, remote, jobs=None, show_checksums=False):
    """Group checksums, check presence locally and on `remote`, and
    fill in each entry's status."""
    logger.info("Preparing to collect status from {}".format(remote.url))

    title = "Collecting information"
    progress.set_n_total(1)
    progress.update_target(title, 0, 100)
    progress.update_target(title, 10, 100)

    grouped = self._group(checksum_infos, show_checksums=show_checksums)
    checksums = list(grouped.keys())

    progress.update_target(title, 30, 100)
    on_remote = list(remote.cache_exists(checksums))

    progress.update_target(title, 90, 100)
    on_local = self.cache_exists(checksums)

    progress.finish_target(title)

    self._fill_statuses(grouped, on_local, on_remote)
    self._log_missing_caches(grouped)

    return grouped
def push(self, data_item):
    """Upload data_item's cache file to GCS, skipping the upload when
    the existing blob's checksum already matches."""
    bucket = self._get_bucket_gc(self.storage_bucket)
    blob_name = self.cache_file_key(data_item.resolved_cache.dvc)
    target = os.path.basename(data_item.resolved_cache.dvc)

    existing = bucket.get_blob(blob_name)
    if existing is not None and existing.exists():
        if self._cmp_checksum(existing, data_item.resolved_cache.dvc):
            Logger.debug('checksum %s matches. Skipping upload'
                         % data_item.cache.relative)
            return data_item
        Logger.debug('checksum %s mismatch. re-uploading'
                     % data_item.cache.relative)

    # same as in _import
    progress.update_target(target, 0, None)

    blob = bucket.blob(blob_name)
    blob.upload_from_filename(data_item.resolved_cache.relative)

    progress.finish_target(target)
    Logger.debug('uploading %s completed'
                 % data_item.resolved_cache.relative)

    return data_item
def upload(self, from_infos, to_infos, names=None, no_progress_bar=False):
    """Upload local files to Azure blob storage."""
    names = self._verify_path_args(to_infos, from_infos, names)

    for src, dst, target in zip(from_infos, to_infos, names):
        if dst.scheme != self.scheme:
            raise NotImplementedError
        if src.scheme != "local":
            raise NotImplementedError

        logger.debug("Uploading '{}' to '{}'".format(src, dst))

        target = target or src.name
        callback = None if no_progress_bar else Callback(target)

        try:
            self.blob_service.create_blob_from_path(
                dst.bucket,
                dst.path,
                src.fspath,
                progress_callback=callback,
            )
        except Exception:
            logger.warning("failed to upload '{}'".format(src))
        else:
            progress.finish_target(target)
def _save_dir(self, path_info):
    """Move a directory's files into the cache and link them back.

    Returns the directory's checksum entry ({PARAM_MD5: md5}).
    """
    path = path_info['path']
    md5, dir_info = self.state.update_info(path)
    dir_relpath = os.path.relpath(path)
    dir_size = len(dir_info)
    # Only show a progress bar for directories large enough to be slow.
    bar = dir_size > LARGE_DIR_SIZE

    logger.info("Linking directory '{}'.".format(dir_relpath))

    for processed, entry in enumerate(dir_info):
        relpath = entry[self.PARAM_RELPATH]
        m = entry[self.PARAM_MD5]
        p = os.path.join(path, relpath)
        c = self.get(m)

        if self.changed_cache(m):
            # Cache copy is missing/stale: move the workspace file in.
            self._move(p, c)
        else:
            # Cache already holds a good copy; drop the workspace file.
            remove(p)

        # Re-create the workspace file as a link to the cached copy.
        self.link(c, p)

        if bar:
            progress.update_target(dir_relpath, processed, dir_size)

    self.state.update_link(path)

    if bar:
        progress.finish_target(dir_relpath)

    return {self.PARAM_MD5: md5}
def copyfile(src, dest, no_progress_bar=False, name=None):
    """Copy file with progress bar"""
    from dvc.exceptions import DvcException
    from dvc.progress import progress
    from dvc.system import System

    name = name or os.path.basename(dest)
    total = os.stat(src).st_size
    copied = 0

    if os.path.isdir(dest):
        dest = os.path.join(dest, os.path.basename(src))

    try:
        # Prefer a filesystem reflink when the platform supports it.
        System.reflink(src, dest)
    except DvcException:
        # Fall back to a plain chunked copy.
        with open(src, "rb") as source, open(dest, "wb+") as sink:
            for buf in iter(lambda: source.read(LOCAL_CHUNK_SIZE), b""):
                sink.write(buf)
                copied += len(buf)
                if not no_progress_bar:
                    progress.update_target(name, copied, total)

    if not no_progress_bar:
        progress.finish_target(name)
def upload(self, from_info, to_info, name=None, no_progress_bar=False):
    """Upload one file via the concrete remote's ``_upload`` hook.

    Returns 0 on success, 1 on failure.
    """
    if not hasattr(self, "_upload"):
        raise RemoteActionNotImplemented("upload", self.scheme)

    if to_info.scheme != self.scheme:
        raise NotImplementedError

    if from_info.scheme != "local":
        raise NotImplementedError

    logger.debug("Uploading '{}' to '{}'".format(from_info, to_info))

    target = name or from_info.name

    if not no_progress_bar:
        progress.update_target(target, 0, None)

    try:
        self._upload(
            from_info.fspath,
            to_info,
            name=target,
            no_progress_bar=no_progress_bar,
        )
    except Exception:
        logger.exception(
            "failed to upload '{}' to '{}'".format(from_info, to_info)
        )
        return 1  # 1 fail

    if not no_progress_bar:
        progress.finish_target(target)

    return 0
def _save_dir(self, path, md5):
    """Move a directory's files into the cache, link them back, and
    record their checksums in the state database.

    NOTE(review): the state.update calls appear order-sensitive
    (per-entry updates first, then the link, then the dir-level
    entries) — preserve this ordering.
    """
    dir_info = self.load_dir_cache(md5)
    dir_relpath = os.path.relpath(path)
    dir_size = len(dir_info)
    # Only show a progress bar for directories large enough to be slow.
    bar = dir_size > LARGE_DIR_SIZE

    logger.info("Linking directory '{}'.".format(dir_relpath))

    for processed, entry in enumerate(dir_info):
        relpath = entry[self.PARAM_RELPATH]
        m = entry[self.PARAM_CHECKSUM]
        p = os.path.join(path, relpath)
        c = self.get(m)

        if self.changed_cache(m):
            # Cache copy is missing/stale: move the workspace file in.
            self._move(p, c)
        else:
            # Cache already holds a good copy; drop the workspace file.
            remove(p)

        # Re-create the workspace file as a link to the cached copy.
        self.link(c, p)

        # Record checksums for both the workspace and cache paths.
        self.state.update(p, m)
        self.state.update(c, m)

        if bar:
            progress.update_target(dir_relpath, processed, dir_size)

    self.state.update_link(path)

    # Finally record the directory-level cache file and the directory
    # path itself under the directory checksum.
    cache = self.get(md5)
    self.state.update(cache)
    self.state.update(path, md5)

    if bar:
        progress.finish_target(dir_relpath)
def upload(self, from_infos, to_infos, names=None):
    """Upload local files to GCS."""
    names = self._verify_path_args(to_infos, from_infos, names)
    client = self.gs

    for src, dst, target in zip(from_infos, to_infos, names):
        if dst["scheme"] != "gs":
            raise NotImplementedError
        if src["scheme"] != "local":
            raise NotImplementedError

        logger.debug("Uploading '{}' to '{}/{}'".format(
            src["path"], dst["bucket"], dst["path"]))

        target = target or os.path.basename(src["path"])

        # No byte-level callback with this API; just mark the start.
        progress.update_target(target, 0, None)

        try:
            blob = client.bucket(dst["bucket"]).blob(dst["path"])
            blob.upload_from_filename(src["path"])
        except Exception:
            msg = "failed to upload '{}' to '{}/{}'"
            logger.error(
                msg.format(src["path"],
                           dst["bucket"],
                           dst["path"]))
            continue

        progress.finish_target(target)
def _get_chunks(self, download, remote, status_info, status, jobs):
    """Collect entries matching `status` and split them into per-job
    chunks of (from_infos, to_infos, names)."""
    title = "Analysing status."
    progress.set_n_total(1)
    total = len(status_info)

    cache = []
    path_infos = []
    names = []

    for idx, (md5, info) in enumerate(status_info.items(), start=1):
        if info["status"] == status:
            cache.append(self.checksum_to_path_info(md5))
            path_infos.append(remote.checksum_to_path_info(md5))
            names.append(info["name"])
        progress.update_target(title, idx, total)

    progress.finish_target(title)
    progress.set_n_total(len(names))

    # Direction decides which side is the source and which the sink.
    if download:
        from_infos, to_infos = path_infos, cache
    else:
        from_infos, to_infos = cache, path_infos

    return list(
        zip(
            to_chunks(from_infos, jobs),
            to_chunks(to_infos, jobs),
            to_chunks(names, jobs),
        )
    )
def upload(self, from_infos, to_infos, names=None, no_progress_bar=False):
    """Upload local files to the OSS remote."""
    names = self._verify_path_args(to_infos, from_infos, names)

    for src, dst, target in zip(from_infos, to_infos, names):
        if dst.scheme != self.scheme:
            raise NotImplementedError
        if src.scheme != "local":
            raise NotImplementedError

        logger.debug(
            "Uploading '{}' to 'oss://{}/{}'".format(
                src.path, dst.bucket, dst.path
            )
        )

        target = target or os.path.basename(src.path)
        callback = None if no_progress_bar else Callback(target)

        try:
            self.oss_service.put_object_from_file(
                dst.path, src.path, progress_callback=callback
            )
        except Exception:
            logger.warning("failed to upload '{}'".format(src.path))
        else:
            progress.finish_target(target)
def _pull_key(self, key, path, no_progress_bar=False):
    """Download a single GCS key into `path`.

    Returns `path` on success (or when the local copy already matches),
    None on failure.
    """
    self._makedirs(path)

    target = os.path.relpath(path, self._cloud_settings.cache.cache_dir)
    tmp_file = self.tmp_file(path)

    if self._cmp_checksum(key, path):
        Logger.debug('File "{}" matches with "{}".'.format(path, key.name))
        return path

    Logger.debug('Downloading cache file from gc "{}/{}"'.format(
        key.bucket.name, key.name))

    if not no_progress_bar:
        # percent_cb is not available for download_to_filename, so
        # lets at least update progress at keypoints(start, finish)
        progress.update_target(target, 0, None)

    try:
        key.download_to_filename(tmp_file)
    except Exception as exc:
        Logger.error('Failed to download "{}": {}'.format(key.name, exc))
        return None

    os.rename(tmp_file, path)

    if not no_progress_bar:
        progress.finish_target(target)

    Logger.debug('Downloading completed')
    return path
def status(self, checksum_infos, remote, jobs=1, show_checksums=False):
    """Return (name, status) pairs comparing local cache with `remote`."""
    Logger.info("Preparing to pull data from {}".format(remote.url))

    title = "Collecting information"
    progress.set_n_total(0)
    progress.update_target(title, 0, 100)

    checksum_infos, missing = self._collect(checksum_infos)
    checksum_infos += missing
    progress.update_target(title, 10, 100)

    md5s, names = self._group(checksum_infos,
                              show_checksums=show_checksums)
    progress.update_target(title, 20, 100)

    path_infos = remote.md5s_to_path_infos(md5s)
    progress.update_target(title, 30, 100)

    remote_exists = remote.exists(path_infos)
    progress.update_target(title, 90, 100)

    local_exists = [not self.changed(md5) for md5 in md5s]
    progress.finish_target(title)

    statuses = []
    for name, has_local, has_remote in zip(names, local_exists,
                                           remote_exists):
        statuses.append((name, STATUS_MAP[has_local, has_remote]))
    return statuses
def download(
    self,
    from_infos,
    to_infos,
    no_progress_bar=False,
    names=None,
    resume=False,
):
    """Download files from S3 to local paths (or copy S3 -> S3).

    Args:
        from_infos: source S3 path-info dicts.
        to_infos: destination path-info dicts; 's3' destinations are
            handled with a server-side copy, 'local' ones are
            downloaded.
        no_progress_bar: when True, no progress callback is created.
        names: optional progress-target names.
        resume: accepted for interface compatibility; not used here.
    """
    names = self._verify_path_args(from_infos, to_infos, names)

    s3 = self.s3

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info["scheme"] != "s3":
            raise NotImplementedError

        if to_info["scheme"] == "s3":
            # Server-side copy between S3 locations; no local transfer.
            self.copy(from_info, to_info, s3=s3)
            continue

        if to_info["scheme"] != "local":
            raise NotImplementedError

        msg = "Downloading '{}/{}' to '{}'".format(
            from_info["bucket"], from_info["path"], to_info["path"]
        )
        logger.debug(msg)

        # Download into a temp file and move it into place afterwards
        # so a failed transfer never leaves a partial file behind.
        tmp_file = tmp_fname(to_info["path"])
        if not name:
            name = os.path.basename(to_info["path"])

        makedirs(os.path.dirname(to_info["path"]), exist_ok=True)

        try:
            if no_progress_bar:
                cb = None
            else:
                # HEAD the object first so the progress bar knows the
                # total size.
                total = s3.head_object(
                    Bucket=from_info["bucket"], Key=from_info["path"]
                )["ContentLength"]
                cb = Callback(name, total)

            s3.download_file(
                from_info["bucket"],
                from_info["path"],
                tmp_file,
                Callback=cb,
            )
        except Exception:
            msg = "failed to download '{}/{}'".format(
                from_info["bucket"], from_info["path"]
            )
            logger.error(msg)
            continue

        move(tmp_file, to_info["path"])

        if not no_progress_bar:
            progress.finish_target(name)
def upload(self, from_infos, to_infos, names=None):
    """Upload local files to the Azure remote."""
    names = self._verify_path_args(to_infos, from_infos, names)

    for src, dst, target in zip(from_infos, to_infos, names):
        if dst['scheme'] != self.scheme:
            raise NotImplementedError
        if src['scheme'] != 'local':
            raise NotImplementedError

        container, blob_path = dst['bucket'], dst['key']
        Logger.debug("Uploading '{}' to '{}/{}'".format(
            src['path'], container, blob_path))

        target = target or os.path.basename(src['path'])

        try:
            self.blob_service.create_blob_from_path(
                container, blob_path, src['path'],
                progress_callback=Callback(target))
        except Exception as ex:
            msg = "Failed to upload '{}'".format(src['path'])
            Logger.warn(msg, ex)
        else:
            progress.finish_target(target)
def copyfile(src, dest, no_progress_bar=False, name=None):
    '''Copy file with progress bar.

    Args:
        src: path of the file to copy.
        dest: destination file path, or a directory (the file is then
            copied into it under the source's basename).
        no_progress_bar: when True, progress is not reported.
        name: progress-target name; defaults to dest's basename.
    '''
    copied = 0
    name = name if name else os.path.basename(dest)
    total = os.stat(src).st_size

    if os.path.isdir(dest):
        dest = os.path.join(dest, os.path.basename(src))

    # Context managers guarantee both handles are closed even if a
    # read/write raises; the original open()/close() pair leaked the
    # descriptors on any exception in the copy loop.
    with open(src, 'rb') as fsrc, open(dest, 'wb+') as fdest:
        while True:
            buf = fsrc.read(LOCAL_CHUNK_SIZE)
            if not buf:
                break
            fdest.write(buf)
            copied += len(buf)
            if not no_progress_bar:
                progress.update_target(name, copied, total)

    if not no_progress_bar:
        progress.finish_target(name)
def download_file(self, from_url, to_file):
    """ Download single file from url. """
    r = requests.get(from_url, stream=True)

    name = os.path.basename(from_url)
    chunk_size = 1024 * 100
    downloaded = 0
    last_reported = 0
    report_bucket = 100 * 1024 * 10

    # Header value is a string (or None); convert so the progress bar
    # gets a numeric total when the server reports one.
    total_length = r.headers.get('content-length')
    if total_length is not None:
        total_length = int(total_length)

    with open(to_file, 'wb') as f:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if not chunk:  # filter out keep-alive new chunks
                continue

            # Count the bytes actually received: iter_content may yield
            # a final chunk shorter than chunk_size, so adding
            # chunk_size overstated the downloaded total.
            downloaded += len(chunk)
            last_reported += len(chunk)
            if last_reported >= report_bucket:
                last_reported = 0
                Logger.debug('Downloaded {}'.format(
                    sizeof_fmt(downloaded)))

            # update progress bar
            progress.update_target(name, downloaded, total_length)

            f.write(chunk)

    # tell progress bar that this target is finished downloading
    progress.finish_target(name)