def tqdm(self): """Add a progression bar for the current extraction.""" with utils.async_tqdm(total=0, desc='Extraction completed...', unit=' file') as pbar_path: self._pbar_path = pbar_path yield
def download_gcs_dataset(dataset_name, local_dataset_dir, max_simultaneous_downloads=50): """Downloads prepared GCS dataset to local dataset directory.""" prefix = posixpath.join(GCS_DATASETS_DIR, dataset_name) gcs_paths_to_dl = gcs_files(prefix) # Filter out the diffs folder if present filter_prefix = posixpath.join(prefix, "diffs") gcs_paths_to_dl = [ p for p in gcs_paths_to_dl if not p.startswith(filter_prefix) ] with utils.async_tqdm(total=len(gcs_paths_to_dl), desc="Dl Completed...", unit=" file") as pbar: def _copy_from_gcs(gcs_path): local_path = posixpath.join(local_dataset_dir, posixpath.basename(gcs_path)) download_gcs_file(gcs_path, local_path) pbar.update(1) with concurrent.futures.ThreadPoolExecutor( max_workers=max_simultaneous_downloads) as executor: futures = [ executor.submit(_copy_from_gcs, path) for path in gcs_paths_to_dl ] for future in concurrent.futures.as_completed(futures): future.result()