def file_md5(fname): """ get the (md5 hexdigest, md5 digest) of a file """ from dvc.progress import Tqdm from dvc.istextfile import istextfile if os.path.exists(fname): hash_md5 = hashlib.md5() binary = not istextfile(fname) size = os.path.getsize(fname) bar = False if size >= LARGE_FILE_SIZE: bar = True msg = "Computing md5 for a large file {}. This is only done once." logger.info(msg.format(relpath(fname))) name = relpath(fname) with Tqdm( desc_truncate=name, disable=not bar, total=size, bytes=True, leave=False, ) as pbar: with open(fname, "rb") as fobj: while True: data = fobj.read(LOCAL_CHUNK_SIZE) if not data: break if binary: chunk = data else: chunk = dos2unix(data) hash_md5.update(chunk) pbar.update(len(data)) return (hash_md5.hexdigest(), hash_md5.digest()) return (None, None)
def _download_dir(
    self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode
):
    from_infos = list(self.walk_files(from_info))
    to_infos = (
        to_info / info.relative_to(from_info) for info in from_infos
    )
    with Tqdm(
        total=len(from_infos),
        desc="Downloading directory",
        unit="Files",
        disable=no_progress_bar,
    ) as pbar:
        download_files = pbar.wrap_fn(
            partial(
                self._download_file,
                name=name,
                no_progress_bar=True,
                file_mode=file_mode,
                dir_mode=dir_mode,
            )
        )
        with ThreadPoolExecutor(max_workers=self.JOBS) as executor:
            futures = [
                executor.submit(download_files, from_info, to_info)
                for from_info, to_info in zip(from_infos, to_infos)
            ]

            # NOTE: unlike pulling/fetching cache, where we need to
            # download everything we can, not raising an error here might
            # turn very ugly, as the user might think that he has
            # downloaded a complete directory, while having a partial one,
            # which might cause unexpected results in his pipeline.
            for future in as_completed(futures):
                # NOTE: executor won't let us raise until all futures that
                # it has are finished, so we need to cancel them ourselves
                # before re-raising.
                exc = future.exception()
                if exc:
                    for entry in futures:
                        entry.cancel()
                    raise exc

def file_md5(fname, tree=None):
    """ get the (md5 hexdigest, md5 digest) of a file """
    from dvc.istextfile import istextfile
    from dvc.progress import Tqdm

    if tree:
        exists_func = tree.exists
        stat_func = tree.stat
        open_func = tree.open
    else:
        exists_func = os.path.exists
        stat_func = os.stat
        open_func = open

    if exists_func(fname):
        hash_md5 = hashlib.md5()
        binary = not istextfile(fname, tree=tree)
        size = stat_func(fname).st_size
        no_progress_bar = True
        if size >= LARGE_FILE_SIZE:
            no_progress_bar = False
            msg = (
                "Computing md5 for a large file '{}'. This is only done once."
            )
            logger.info(msg.format(relpath(fname)))
        name = relpath(fname)

        with Tqdm(
            desc=name,
            disable=no_progress_bar,
            total=size,
            bytes=True,
            leave=False,
        ) as pbar:
            with open_func(fname, "rb") as fobj:
                _fobj_md5(fobj, hash_md5, binary, pbar.update)

        return (hash_md5.hexdigest(), hash_md5.digest())

    return (None, None)

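# A minimal sketch of the _fobj_md5 helper called above, assuming it mirrors
# the inline loop from the first file_md5 variant (chunked reads, optional
# dos2unix normalization, a progress callback). DVC's actual helper may differ.
def _fobj_md5(fobj, hash_md5, binary, progress_func):
    while True:
        data = fobj.read(LOCAL_CHUNK_SIZE)
        if not data:
            break
        chunk = data if binary else dos2unix(data)
        hash_md5.update(chunk)
        progress_func(len(data))
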
def _get_plans(self, download, remote, status_info, status):
    cache = []
    path_infos = []
    names = []
    checksums = []
    for md5, info in Tqdm(
        status_info.items(), desc="Analysing status", unit="file"
    ):
        if info["status"] == status:
            cache.append(self.checksum_to_path_info(md5))
            path_infos.append(remote.checksum_to_path_info(md5))
            names.append(info["name"])
            checksums.append(md5)

    if download:
        to_infos = cache
        from_infos = path_infos
    else:
        to_infos = path_infos
        from_infos = cache

    return from_infos, to_infos, names, checksums

def cache_exists(self, checksums, jobs=None):
    """This is an older implementation used in remote/base.py.

    We are reusing it in RemoteSSH, because SSH's batch_exists proved to be
    faster than the current approach (relying on exists(path_info)) applied
    in remote/base.
    """
    if not self.no_traverse:
        return list(set(checksums) & set(self.all()))

    with Tqdm(total=len(checksums), unit="md5") as pbar:

        def exists_with_progress(chunks):
            return self.batch_exists(chunks, callback=pbar.update_desc)

        with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor:
            path_infos = [self.checksum_to_path_info(x) for x in checksums]
            chunks = to_chunks(path_infos, num_chunks=self.JOBS)
            results = executor.map(exists_with_progress, chunks)
            in_remote = itertools.chain.from_iterable(results)
            ret = list(itertools.compress(checksums, in_remote))
            pbar.update_desc("", 0)  # clear path name description
            return ret

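# A hedged sketch of the to_chunks() helper used above: split the path_infos
# into num_chunks roughly equal batches, one batch per worker. The real
# dvc.utils.to_chunks may differ (it also accepts a chunk_size argument).
import math


def to_chunks(seq, num_chunks):
    chunk_size = math.ceil(len(seq) / num_chunks) or 1
    return [seq[i : i + chunk_size] for i in range(0, len(seq), chunk_size)]
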
def _upload(
    self, from_file, to_info, name=None, no_progress_bar=False, **pbar_args
):
    total = os.path.getsize(from_file)
    with Tqdm(
        disable=no_progress_bar,
        total=total,
        bytes=True,
        desc=name,
        **pbar_args,
    ) as pbar:
        obj = self._get_obj(to_info)
        obj.upload_file(
            from_file,
            Callback=pbar.update,
            ExtraArgs=self.fs_args.get("s3_additional_kwargs"),
            Config=self._transfer_config,
        )
    self.fs.invalidate_cache(self._with_bucket(to_info.parent))

def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs):
    if not hash_info.dir_info:
        hash_info.dir_info = tree.cache.get_dir_cache(hash_info)
    hi = self.save_dir_info(hash_info.dir_info, hash_info)
    for entry_info, entry_hash in Tqdm(
        hi.dir_info.items(path_info),
        desc="Saving " + path_info.name,
        unit="file",
    ):
        self._save_file(
            entry_info, tree, entry_hash, save_link=False, **kwargs
        )

    if save_link:
        self.tree.state.save_link(path_info)
    if self.tree.exists(path_info):
        self.tree.state.save(path_info, hi)

    cache_info = self.tree.hash_to_path_info(hi.value)
    self.tree.state.save(cache_info, hi)

def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs):
    if not hash_info.dir_info:
        hash_info.dir_info = tree.cache.get_dir_cache(hash_info)
    hi = self.save_dir_info(hash_info.dir_info, hash_info)
    for entry in Tqdm(
        hi.dir_info, desc="Saving " + path_info.name, unit="file"
    ):
        entry_info = path_info / entry[self.tree.PARAM_RELPATH]
        entry_hash = HashInfo(
            self.tree.PARAM_CHECKSUM, entry[self.tree.PARAM_CHECKSUM]
        )
        self._save_file(
            entry_info, tree, entry_hash, save_link=False, **kwargs
        )

    if save_link:
        self.tree.state.save_link(path_info)
    if self.tree.exists(path_info):
        self.tree.state.save(path_info, hi.value)

    cache_info = self.tree.hash_to_path_info(hi.value)
    self.tree.state.save(cache_info, hi.value)

def gdrive_upload_file(
    self, args, no_progress_bar=True, from_file="", progress_name=""
):
    item = self.drive.CreateFile(
        {"title": args["title"], "parents": [{"id": args["parent_id"]}]}
    )

    with open(from_file, "rb") as fobj:
        total = os.path.getsize(from_file)
        with Tqdm.wrapattr(
            fobj,
            "read",
            desc=progress_name,
            total=total,
            disable=no_progress_bar,
        ) as wrapped:
            # PyDrive doesn't like content property setting for empty files
            # https://github.com/gsuitedevs/PyDrive/issues/121
            if total:
                item.content = wrapped
            item.Upload()
    return item

def _checkout(
    self, targets=None, with_deps=False, force=False, recursive=False
):
    from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError

    stages = set()
    targets = targets or [None]

    for target in targets:
        try:
            new = self.collect(
                target, with_deps=with_deps, recursive=recursive
            )
            stages.update(new)
        except (StageFileDoesNotExistError, StageFileBadNameError) as exc:
            if not target:
                raise
            raise CheckoutErrorSuggestGit(target, exc)

    _cleanup_unused_links(self, self.stages)
    total = get_all_files_numbers(stages)
    if total == 0:
        logger.info("Nothing to do")
    failed = []
    with Tqdm(
        total=total, unit="file", desc="Checkout", disable=total == 0
    ) as pbar:
        for stage in stages:
            if stage.locked:
                logger.warning(
                    "DVC-file '{path}' is locked. Its dependencies are"
                    " not going to be checked out.".format(path=stage.relpath)
                )

            failed.extend(
                stage.checkout(force=force, progress_callback=pbar.update_desc)
            )
    if failed:
        raise CheckoutError(failed)

def _cache_exists_traverse(
    self, checksums, remote_checksums, jobs=None, name=None
):
    logger.debug(
        "Querying {} checksums via threaded traverse".format(len(checksums))
    )

    traverse_prefixes = ["{:02x}".format(i) for i in range(1, 256)]
    if self.TRAVERSE_PREFIX_LEN > 2:
        traverse_prefixes += [
            "{0:0{1}x}".format(i, self.TRAVERSE_PREFIX_LEN)
            for i in range(1, pow(16, self.TRAVERSE_PREFIX_LEN - 2))
        ]
    with Tqdm(
        desc="Querying "
        + ("cache in " + name if name else "remote cache"),
        total=len(traverse_prefixes),
        unit="dir",
    ) as pbar:

        def list_with_update(prefix):
            ret = map(
                self.path_to_checksum,
                list(self.list_cache_paths(prefix=prefix)),
            )
            pbar.update_desc(
                "Querying cache in '{}'".format(self.path_info / prefix)
            )
            return ret

        with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor:
            in_remote = executor.map(list_with_update, traverse_prefixes)
            remote_checksums.update(
                itertools.chain.from_iterable(in_remote)
            )
    return list(checksums & remote_checksums)

def _gdrive_download_file(
    self, item_id, to_file, progress_desc, no_progress_bar
):
    param = {"id": item_id}
    # it does not create a file on the remote
    gdrive_file = self._drive.CreateFile(param)
    gdrive_file.FetchMetadata(fields="fileSize")
    size = gdrive_file["fileSize"]

    with Tqdm(
        desc=progress_desc,
        disable=no_progress_bar,
        bytes=True,
        # explicit `bar_format` as `total` will be set by `update_to`
        bar_format=Tqdm.BAR_FMT_DEFAULT,
    ) as pbar:
        if size:
            gdrive_file.GetContentFile(to_file, callback=pbar.update_to)
        else:
            # PyDrive doesn't like downloading empty files
            # https://github.com/iterative/dvc/issues/4286
            with open(to_file, "w"):
                pass

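# A minimal sketch of the `update_to` callback used above, assuming it follows
# the common tqdm recipe of turning an absolute byte position into an
# increment and late-binding the total; DVC's own Tqdm.update_to may differ.
from tqdm import tqdm


class _TqdmSketch(tqdm):
    def update_to(self, current, total=None):
        if total is not None:
            self.total = total  # total arrives with the callback, hence the explicit bar_format
        return self.update(current - self.n)  # advance bar to absolute position `current`
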
def list_hashes_exists(self, hashes, jobs=None, name=None):
    """Return list of the specified hashes which exist in this tree.

    Hashes will be queried individually.
    """
    logger.debug(
        "Querying {} hashes via object_exists".format(len(hashes))
    )
    with Tqdm(
        desc="Querying "
        + ("cache in " + name if name else "remote cache"),
        total=len(hashes),
        unit="file",
    ) as pbar:

        def exists_with_progress(path_info):
            ret = self.exists(path_info)
            pbar.update_msg(str(path_info))
            return ret

        with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor:
            path_infos = map(self.hash_to_path_info, hashes)
            in_remote = executor.map(exists_with_progress, path_infos)
            ret = list(itertools.compress(hashes, in_remote))
            return ret

def _upload(
    self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs
):
    # First try to create parent directories
    self.makedirs(to_info.parent)
    file_size = os.path.getsize(from_file)
    with open(from_file, "rb") as fd:
        progress_context = (
            nullcontext(fd)
            if file_size == 0
            else Tqdm.wrapattr(
                fd,
                "read",
                total=None if no_progress_bar else file_size,
                leave=False,
                desc=to_info.url if name is None else name,
                disable=no_progress_bar,
            )
        )
        with progress_context as fd_wrapped:
            self._client.upload_to(
                buff=fd_wrapped, remote_path=to_info.path
            )

def _download_dir(
    self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode
):
    from_infos = list(self.walk_files(from_info))
    to_infos = (
        to_info / info.relative_to(from_info) for info in from_infos
    )
    with ThreadPoolExecutor(max_workers=self.JOBS) as executor:
        download_files = partial(
            self._download_file,
            name=name,
            no_progress_bar=True,
            file_mode=file_mode,
            dir_mode=dir_mode,
        )
        futures = executor.map(download_files, from_infos, to_infos)
        with Tqdm(
            futures,
            total=len(from_infos),
            desc="Downloading directory",
            unit="Files",
            disable=no_progress_bar,
        ) as futures:
            return sum(futures)

def copyfile(src, dest, no_progress_bar=False, name=None):
    """Copy file with progress bar"""
    from dvc.exceptions import DvcException
    from dvc.progress import Tqdm
    from dvc.system import System

    name = name if name else os.path.basename(dest)
    total = os.stat(src).st_size

    if os.path.isdir(dest):
        dest = os.path.join(dest, os.path.basename(src))

    try:
        System.reflink(src, dest)
    except DvcException:
        with Tqdm(
            desc=name, disable=no_progress_bar, total=total, bytes=True
        ) as pbar:
            with open(src, "rb") as fsrc, open(dest, "wb+") as fdest:
                while True:
                    buf = fsrc.read(LOCAL_CHUNK_SIZE)
                    if not buf:
                        break
                    fdest.write(buf)
                    pbar.update(len(buf))

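# Illustrative call of the copyfile() helper above; the paths are made up.
copyfile("data/raw.csv", "backup/raw.csv", name="raw.csv")
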
def _save_dir(
    self, path_info, tree, hash_info, filter_info=None, **kwargs
):
    if not hash_info.dir_info:
        hash_info.dir_info = tree.cache.get_dir_cache(hash_info)
    hi = self.save_dir_info(hash_info.dir_info, hash_info)
    for entry_info, entry_hash in Tqdm(
        hi.dir_info.items(path_info),
        desc="Saving " + path_info.name,
        unit="file",
    ):
        if filter_info and not entry_info.isin_or_eq(filter_info):
            continue
        self._save_file(entry_info, tree, entry_hash, **kwargs)

    cache_info = self.tree.hash_to_path_info(hi.value)
    self.tree.state.save(cache_info, hi)
    tree.state.save(path_info, hi)

def _checkout(
    self, targets=None, with_deps=False, force=False, recursive=False
):
    from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError

    stages = set()

    if not targets:
        targets = [None]
        _cleanup_unused_links(self)

    for target in targets:
        try:
            new = self.collect(
                target, with_deps=with_deps, recursive=recursive
            )
            stages.update(new)
        except (StageFileDoesNotExistError, StageFileBadNameError) as exc:
            if not target:
                raise
            raise CheckoutErrorSuggestGit(target, exc)

    total = get_all_files_numbers(stages)
    if total == 0:
        logger.info("Nothing to do")
    failed = []
    with Tqdm(
        total=total, unit="file", desc="Checkout", disable=total == 0
    ) as pbar:
        for stage in stages:
            failed.extend(
                stage.checkout(force=force, progress_callback=pbar.update_desc)
            )
    if failed:
        raise CheckoutError(failed)

def save(
    odb: "ObjectDB",
    obj: "HashFile",
    jobs: Optional[int] = None,
    **kwargs,
):
    if isinstance(obj, Tree):
        with ThreadPoolExecutor(max_workers=jobs) as executor:
            for future in Tqdm(
                as_completed(
                    executor.submit(
                        odb.add,
                        entry.path_info,
                        entry.fs,
                        entry.hash_info,
                        **kwargs,
                    )
                    for _, entry in obj
                ),
                desc="Saving files",
                total=len(obj),
                unit="file",
            ):
                future.result()

    odb.add(obj.path_info, obj.fs, obj.hash_info, **kwargs)

def _gdrive_upload_file(
    self,
    parent_id,
    title,
    no_progress_bar=False,
    from_file="",
    progress_name="",
):
    item = self._drive.CreateFile(
        {"title": title, "parents": [{"id": parent_id}]}
    )

    with open(from_file, "rb") as fobj:
        total = os.path.getsize(from_file)
        with Tqdm.wrapattr(
            fobj,
            "read",
            desc=progress_name,
            total=total,
            disable=no_progress_bar,
        ) as wrapped:
            item.content = wrapped
            item.Upload()
    return item

def progress(*args, **kwargs) -> "Tqdm":
    from dvc.progress import Tqdm

    return Tqdm(*args, **kwargs)

def fetch_refspecs(
    self,
    url: str,
    refspecs: Iterable[str],
    force: Optional[bool] = False,
    on_diverged: Optional[Callable[[str, str], bool]] = None,
):
    from dulwich.client import get_transport_and_path
    from dulwich.objectspec import parse_reftuples
    from dulwich.porcelain import (
        DivergedBranches,
        check_diverged,
        get_remote_repo,
    )

    fetch_refs = []

    def determine_wants(remote_refs):
        fetch_refs.extend(
            parse_reftuples(
                remote_refs,
                self.repo.refs,
                [os.fsencode(refspec) for refspec in refspecs],
                force=force,
            )
        )
        return [
            remote_refs[lh]
            for (lh, _, _) in fetch_refs
            if remote_refs[lh] not in self.repo.object_store
        ]

    try:
        _remote, location = get_remote_repo(self.repo, url)
        client, path = get_transport_and_path(location)
    except Exception as exc:
        raise SCMError(
            f"'{url}' is not a valid Git remote or URL"
        ) from exc

    with Tqdm(
        desc="Fetching git refs", bar_format=self.BAR_FMT_NOTOTAL
    ) as pbar:

        def progress(msg_b):
            msg = msg_b.decode("ascii").strip()
            pbar.update_msg(msg)
            pbar.refresh()
            logger.trace(msg)

        fetch_result = client.fetch(
            path,
            self.repo,
            progress=progress,
            determine_wants=determine_wants,
        )
    for (lh, rh, _) in fetch_refs:
        try:
            if rh in self.repo.refs:
                check_diverged(
                    self.repo, self.repo.refs[rh], fetch_result.refs[lh]
                )
        except DivergedBranches:
            if not force:
                overwrite = False
                if on_diverged:
                    overwrite = on_diverged(
                        os.fsdecode(rh),
                        os.fsdecode(fetch_result.refs[lh]),
                    )
                if not overwrite:
                    continue
        self.repo.refs[rh] = fetch_result.refs[lh]

def push_refspec(
    self,
    url: str,
    src: Optional[str],
    dest: str,
    force: bool = False,
    on_diverged: Optional[Callable[[str, str], bool]] = None,
):
    from dulwich.client import get_transport_and_path
    from dulwich.errors import NotGitRepository, SendPackError
    from dulwich.porcelain import (
        DivergedBranches,
        check_diverged,
        get_remote_repo,
    )

    dest_refs, values = self._push_dest_refs(src, dest)

    try:
        _remote, location = get_remote_repo(self.repo, url)
        client, path = get_transport_and_path(location)
    except Exception as exc:
        raise SCMError(
            f"'{url}' is not a valid Git remote or URL"
        ) from exc

    def update_refs(refs):
        new_refs = {}
        for ref, value in zip(dest_refs, values):
            if ref in refs:
                local_sha = self.repo.refs[ref]
                remote_sha = refs[ref]
                try:
                    check_diverged(self.repo, remote_sha, local_sha)
                except DivergedBranches:
                    if not force:
                        overwrite = False
                        if on_diverged:
                            overwrite = on_diverged(
                                os.fsdecode(ref),
                                os.fsdecode(remote_sha),
                            )
                        if not overwrite:
                            continue
            new_refs[ref] = value
        return new_refs

    try:
        with Tqdm(
            desc="Pushing git refs", bar_format=self.BAR_FMT_NOTOTAL
        ) as pbar:

            def progress(msg_b):
                msg = msg_b.decode("ascii").strip()
                pbar.update_msg(msg)
                pbar.refresh()
                logger.trace(msg)

            client.send_pack(
                path,
                update_refs,
                self.repo.object_store.generate_pack_data,
                progress=progress,
            )
    except (NotGitRepository, SendPackError) as exc:
        raise SCMError(f"Git failed to push '{src}' to '{url}'") from exc

def _process(
    self,
    named_cache,
    remote,
    jobs=None,
    show_checksums=False,
    download=False,
):
    logger.debug(
        "Preparing to {} '{}'".format(
            "download data from" if download else "upload data to",
            remote.tree.path_info,
        )
    )

    if download:
        func = partial(
            _log_exceptions(remote.tree.download, "download"),
            dir_mode=self.tree.dir_mode,
            file_mode=self.tree.file_mode,
        )
        status = STATUS_DELETED
        desc = "Downloading"
    else:
        func = _log_exceptions(remote.tree.upload, "upload")
        status = STATUS_NEW
        desc = "Uploading"

    if jobs is None:
        jobs = remote.tree.JOBS

    dir_status, file_status, dir_contents = self._status(
        named_cache,
        remote,
        jobs=jobs,
        show_checksums=show_checksums,
        download=download,
    )

    dir_plans, _ = self._get_plans(download, remote, dir_status, status)
    file_plans, missing_files = self._get_plans(
        download, remote, file_status, status
    )

    total = len(dir_plans[0]) + len(file_plans[0])
    if total == 0:
        return 0

    with Tqdm(total=total, unit="file", desc=desc) as pbar:
        func = pbar.wrap_fn(func)
        with ThreadPoolExecutor(max_workers=jobs) as executor:
            if download:
                from_infos, to_infos, names, _ = (
                    d + f for d, f in zip(dir_plans, file_plans)
                )
                fails = sum(executor.map(func, from_infos, to_infos, names))
            else:
                # for uploads, push files first, and any .dir files last
                file_futures = {}
                for from_info, to_info, name, hash_ in zip(*file_plans):
                    file_futures[hash_] = executor.submit(
                        func, from_info, to_info, name
                    )
                dir_futures = {}
                for from_info, to_info, name, dir_hash in zip(*dir_plans):
                    # if for some reason a file contained in this dir is
                    # missing both locally and in the remote, we want to
                    # push whatever file content we have, but should not
                    # push .dir file
                    for file_hash in missing_files:
                        if file_hash in dir_contents[dir_hash]:
                            logger.debug(
                                "directory '%s' contains missing files, "
                                "skipping .dir file upload",
                                name,
                            )
                            break
                    else:
                        wait_futures = {
                            future
                            for file_hash, future in file_futures.items()
                            if file_hash in dir_contents[dir_hash]
                        }
                        dir_futures[dir_hash] = executor.submit(
                            self._dir_upload,
                            func,
                            wait_futures,
                            from_info,
                            to_info,
                            name,
                        )
                fails = sum(
                    future.result()
                    for future in concat(
                        file_futures.values(), dir_futures.values()
                    )
                )

    if fails:
        if download:
            remote.index.clear()
            raise DownloadError(fails)
        raise UploadError(fails)

    if not download:
        # index successfully pushed dirs
        for dir_hash, future in dir_futures.items():
            if future.result() == 0:
                file_hashes = dir_contents[dir_hash]
                logger.debug(
                    "Indexing pushed dir '{}' with "
                    "'{}' nested files".format(dir_hash, len(file_hashes))
                )
                remote.index.update([dir_hash], file_hashes)

    return len(dir_plans[0]) + len(file_plans[0])

def progress(*args, **kwargs) -> Tqdm:
    return Tqdm(*args, **kwargs)

def progress(self, *args, **kwargs) -> Tqdm:
    kwargs.setdefault("file", self.error_output)
    return Tqdm(*args, **kwargs)

def _process(
    self,
    named_cache,
    remote,
    jobs=None,
    show_checksums=False,
    download=False,
):
    logger.debug(
        "Preparing to {} '{}'".format(
            "download data from" if download else "upload data to",
            remote.path_info,
        )
    )

    if download:
        func = partial(
            remote.download,
            dir_mode=self._dir_mode,
            file_mode=self._file_mode,
        )
        status = STATUS_DELETED
        desc = "Downloading"
    else:
        func = remote.upload
        status = STATUS_NEW
        desc = "Uploading"

    if jobs is None:
        jobs = remote.JOBS

    dir_status, file_status, dir_contents = self._status(
        named_cache,
        remote,
        jobs=jobs,
        show_checksums=show_checksums,
        download=download,
    )

    dir_plans = self._get_plans(download, remote, dir_status, status)
    file_plans = self._get_plans(download, remote, file_status, status)

    total = len(dir_plans[0]) + len(file_plans[0])
    if total == 0:
        return 0

    with Tqdm(total=total, unit="file", desc=desc) as pbar:
        func = pbar.wrap_fn(func)
        with ThreadPoolExecutor(max_workers=jobs) as executor:
            if download:
                fails = sum(executor.map(func, *dir_plans))
                fails += sum(executor.map(func, *file_plans))
            else:
                # for uploads, push files first, and any .dir files last
                file_futures = {}
                for from_info, to_info, name, checksum in zip(*file_plans):
                    file_futures[checksum] = executor.submit(
                        func, from_info, to_info, name
                    )
                dir_futures = {}
                for from_info, to_info, name, dir_checksum in zip(*dir_plans):
                    wait_futures = {
                        future
                        for file_checksum, future in file_futures.items()
                        if file_checksum in dir_contents[dir_checksum]
                    }
                    dir_futures[dir_checksum] = executor.submit(
                        self._dir_upload,
                        func,
                        wait_futures,
                        from_info,
                        to_info,
                        name,
                    )
                fails = sum(
                    future.result()
                    for future in concat(
                        file_futures.values(), dir_futures.values()
                    )
                )

    if fails:
        if download:
            remote.index.clear()
            raise DownloadError(fails)
        raise UploadError(fails)

    if not download:
        # index successfully pushed dirs
        for dir_checksum, future in dir_futures.items():
            if future.result() == 0:
                file_checksums = dir_contents[dir_checksum]
                logger.debug(
                    "Indexing pushed dir '{}' with "
                    "'{}' nested files".format(
                        dir_checksum, len(file_checksums)
                    )
                )
                remote.index.update([dir_checksum], file_checksums)

    return len(dir_plans[0]) + len(file_plans[0])

def checkout(
    self,
    targets=None,
    with_deps=False,
    force=False,
    relink=False,
    recursive=False,
    allow_persist_missing=False,
):
    from dvc.stage.exceptions import (
        StageFileBadNameError,
        StageFileDoesNotExistError,
    )

    unused = []
    stats = {
        "added": [],
        "deleted": [],
        "modified": [],
        "failed": [],
    }
    if not targets:
        targets = [None]
        unused = _get_unused_links(self)

    stats["deleted"] = [_fspath_dir(u) for u in unused]
    self.state.remove_links(unused)

    if isinstance(targets, str):
        targets = [targets]

    pairs = set()
    for target in targets:
        try:
            pairs.update(
                self.collect_granular(
                    target, with_deps=with_deps, recursive=recursive
                )
            )
        except (
            StageFileDoesNotExistError,
            StageFileBadNameError,
            NoOutputOrStageError,
        ) as exc:
            if not target:
                raise
            raise CheckoutErrorSuggestGit(target) from exc

    total = get_all_files_numbers(pairs)
    with Tqdm(
        total=total, unit="file", desc="Checkout", disable=total == 0
    ) as pbar:
        for stage, filter_info in pairs:
            result = stage.checkout(
                force=force,
                progress_callback=pbar.update_msg,
                relink=relink,
                filter_info=filter_info,
                allow_persist_missing=allow_persist_missing,
            )
            for key, items in result.items():
                stats[key].extend(_fspath_dir(path) for path in items)

    if stats.get("failed"):
        raise CheckoutError(stats["failed"], stats)

    del stats["failed"]
    return stats

def cache_exists(self, checksums, jobs=None):
    return [
        checksum
        for checksum in Tqdm(checksums, unit="md5")
        if not self.changed_cache_file(checksum)
    ]

def upload_fobj(self, fobj, to_info, no_progress_bar=False, **pbar_args):
    from dvc.progress import Tqdm

    with Tqdm(bytes=True, disable=no_progress_bar, **pbar_args) as pbar:
        with pbar.wrapattr(fobj, "read") as fobj:
            self.copy_fobj(fobj, to_info, chunk_size=self.CHUNK_SIZE)