def save_url_infos(
    path: epath.Path,
    url_infos: Dict[str, UrlInfo],
) -> None:
  """Stores the given checksums and sizes for a specific dataset.

  The content of the file is never discarded, only updated. This ensures that
  if the process is killed right after the first download finishes, checksums
  registered during previous runs aren't lost.

  It is the responsibility of the caller not to call this function multiple
  times in parallel for a given dataset.

  Only the original file content is updated. This means the entire set of new
  sizes and checksums must be given at every call.

  Args:
    path: Path to the resources.
    url_infos: dict, {url: (size_in_bytes, checksum)}.
  """
  original_data = load_url_infos(path) if path.exists() else {}
  new_data = original_data.copy()
  new_data.update(url_infos)
  # Compare filenames separately, as the filename field is `eq=False`
  if original_data == new_data and _filenames_equal(original_data, new_data):
    return
  lines = [
      f'{url}\t{int(url_info.size)}\t{url_info.checksum}\t'
      f'{url_info.filename or ""}\n'
      for url, url_info in sorted(new_data.items())
  ]
  path.write_text(''.join(lines), encoding='UTF-8')


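# Illustrative sketch (not part of the library): how a caller might record
# checksums after a download. The checksum file path, URL and checksum values
# below are hypothetical; `UrlInfo` is assumed to accept `size`, `checksum`
# and `filename` keyword arguments, matching the fields read above.
def _example_save_url_infos() -> None:
  url_infos = {
      'https://example.com/data.zip': UrlInfo(  # Hypothetical URL
          size=1024,
          checksum='deadbeef' * 8,  # Placeholder sha256 hex digest
          filename='data.zip',
      ),
  }
  # Existing entries are merged with the new ones and the file is rewritten.
  save_url_infos(epath.Path('/tmp/checksums.tsv'), url_infos)

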
def _get_cached_copy(file_path: epath.Path, max_age_days: int) -> Optional[str]:
  """Returns the content of `file_path` if it is younger than `max_age_days`."""
  if file_path.exists():
    stats = os.stat(file_path)
    modified_time = datetime.datetime.fromtimestamp(stats.st_mtime)
    if modified_time > datetime.datetime.now() - datetime.timedelta(
        days=max_age_days):
      return file_path.read_text()
  return None


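# Illustrative sketch (not part of the library): reuse a previously fetched
# page if it is recent enough, otherwise refresh and re-cache it. The
# `fetch_page` callback and the cache path are hypothetical.
def _example_get_cached_copy(fetch_page) -> str:
  cache_path = epath.Path('/tmp/catalog_cache.html')  # Hypothetical cache path
  content = _get_cached_copy(cache_path, max_age_days=7)
  if content is None:
    content = fetch_page()  # Hypothetical refresh callback
    cache_path.write_text(content)
  return content

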
def build_and_save_community_catalog(catalog_dir: epath.Path) -> None:
  """Builds and saves the catalog of community datasets."""
  templates = DocumentationTemplates.load()
  formatter_per_namespace = _get_formatter_per_namespace(templates)

  overview = build_overview(formatter_per_namespace, templates)
  catalog_dir.joinpath('overview.md').write_text(overview)

  # Write the `_toc.yaml` TF documentation navigation bar
  toc_yaml = build_toc_yaml(formatter_per_namespace)
  with catalog_dir.joinpath('_toc.yaml').open('w') as f:
    yaml.dump(toc_yaml, f, default_flow_style=False)

  for namespace, details in build_namespace_details(formatter_per_namespace):
    namespace_file = catalog_dir / f'{namespace}.md'
    namespace_file.write_text(details)


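# Illustrative sketch (not part of the library): regenerate the community
# catalog into a hypothetical documentation checkout directory.
def _example_build_catalog() -> None:
  build_and_save_community_catalog(
      catalog_dir=epath.Path('/tmp/docs/community_catalog'))  # Hypothetical dir

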
def download_gcs_folder(
    gcs_folder: epath.Path,
    local_folder: epath.PathLike,
    max_simultaneous_downloads: int = 25,
) -> None:
  """Downloads prepared GCS folder to local folder."""
  if _is_gcs_disabled:
    raise AssertionError('Cannot download from GCS when _is_gcs_disabled')

  # Filter out the diffs folder if present
  paths_to_dl = [p for p in gcs_folder.iterdir() if p.name != 'diffs']

  with tqdm_utils.async_tqdm(
      total=len(paths_to_dl), desc='Dl Completed...', unit=' file') as pbar:

    def _copy(gcs_path_: epath.Path):
      # Copy 'gs://tfds-data/datasets/ds/1.0.0/file' -> `local_dir/file`
      tf.io.gfile.copy(
          os.fspath(gcs_path_),
          os.path.join(local_folder, gcs_path_.name),
      )
      pbar.update(1)

    with concurrent.futures.ThreadPoolExecutor(
        max_workers=max_simultaneous_downloads) as executor:
      futures = [executor.submit(_copy, path) for path in paths_to_dl]
      for future in concurrent.futures.as_completed(futures):
        future.result()


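# Illustrative sketch (not part of the library): mirror a prepared dataset
# folder from GCS into a local cache directory. The dataset/version path and
# the local directory are hypothetical.
def _example_download_gcs_folder() -> None:
  gcs_folder = epath.Path('gs://tfds-data/datasets/mnist/3.0.1')  # Hypothetical
  download_gcs_folder(
      gcs_folder=gcs_folder,
      local_folder='/tmp/tfds/mnist/3.0.1',  # Hypothetical local destination
      max_simultaneous_downloads=10,
  )

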
def list_ds_packages_for_namespace(
    namespace: str,
    path: epath.Path,
) -> List[DatasetPackage]:
  """Returns the dataset names found in a specific directory.

  Directories that contain code should have the following structure:

  ```
  <path>/
      <dataset0>/
          <dataset0>.py
      <dataset1>/
          <dataset1>.py
      ...
  ```

  Additional files or folders which are not detected as datasets will be
  ignored (e.g. `__init__.py`).

  Args:
    namespace: Namespace of the datasets.
    path: The directory path containing the datasets.

  Returns:
    ds_packages: The dataset packages found in the directory (sorted for
      determinism).

  Raises:
    FileNotFoundError: If the path cannot be reached.
  """
  if not path.exists():
    # Should be fault-tolerant in the future
    raise FileNotFoundError(f'Could not find datasets at {path}')

  all_packages = []
  # Sort directory entries so the returned packages are deterministic, as
  # promised in the docstring (`iterdir()` order is not guaranteed).
  for ds_path in sorted(path.iterdir()):
    source = get_dataset_source(ds_path)
    if source:
      pkg = DatasetPackage(
          name=naming.DatasetName(namespace=namespace, name=ds_path.name),
          source=source,
      )
      all_packages.append(pkg)
  return all_packages


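# Illustrative sketch (not part of the library): list the packages of a
# hypothetical `my_namespace` namespace checked out under `/tmp/datasets` and
# print their qualified names.
def _example_list_ds_packages() -> None:
  packages = list_ds_packages_for_namespace(
      namespace='my_namespace',  # Hypothetical namespace
      path=epath.Path('/tmp/datasets'),  # Hypothetical checkout location
  )
  for pkg in packages:
    print(pkg.name)  # e.g. a `DatasetName` like `my_namespace:dataset0`

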
def _rename_and_get_final_dl_path(
    self,
    url: str,
    path: epath.Path,
    expected_url_info: Optional[checksums.UrlInfo],
    computed_url_info: Optional[checksums.UrlInfo],
    checksum_path: Optional[epath.Path],
    url_path: epath.Path,
) -> epath.Path:
  """Renames the downloaded file to its final path if checksums were recorded."""
  # `path` can be:
  # * Manually downloaded
  # * (cached) checksum_path
  # * (cached) url_path
  # * `tmp_dir/file` (downloaded path)
  if self._manual_dir and path.is_relative_to(self._manual_dir):
    return path  # Manually downloaded data
  elif path == checksum_path:  # Path already at final destination
    assert computed_url_info == expected_url_info  # Sanity check
    return checksum_path  # pytype: disable=bad-return-type
  elif path == url_path:
    if checksum_path:
      # Checksums were registered: Rename -> checksums_path
      resource_lib.rename_info_file(path, checksum_path, overwrite=True)
      return path.replace(checksum_path)
    else:
      # Checksums not registered: -> do nothing
      return path
  else:  # Path was downloaded in tmp dir
    dst_path = checksum_path or url_path
    resource_lib.write_info_file(
        url=url,
        path=dst_path,
        dataset_name=self._dataset_name,
        original_fname=path.name,
        url_info=computed_url_info,
    )
    path.replace(dst_path)
    path.parent.rmdir()  # Cleanup tmp dir (will fail if dir not empty)
    return dst_path


def _compute_dir_hash(path: epath.Path) -> str:
  """Computes the checksum of the given directory deterministically."""
  all_files = sorted(path.iterdir())

  if any(f.is_dir() for f in all_files):
    raise ValueError('Installed package should only contain files.')

  # Concatenate the filenames and file contents to create the directory hash
  all_checksums = [f.name for f in all_files]
  all_checksums += [checksums.compute_url_info(f).checksum for f in all_files]
  return hashlib.sha256(''.join(all_checksums).encode()).hexdigest()


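# Illustrative sketch (not part of the library): the hash depends only on the
# file names and contents, so recomputing it over an unchanged directory is a
# cheap way to detect tampering or partial installs. The package path is
# hypothetical.
def _example_verify_installed_package(expected_hash: str) -> bool:
  package_dir = epath.Path('/tmp/installed_package')  # Hypothetical path
  return _compute_dir_hash(package_dir) == expected_hash

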
def _maybe_iterdir(path: epath.Path) -> Iterator[epath.Path]:
  """Same as `path.iterdir()`, but doesn't fail if the path does not exist."""
  # Use try/except rather than `.exists()` to avoid an extra RPC call
  # per namespace
  try:
    for f in path.iterdir():
      yield f
  except (
      FileNotFoundError,
      tf.errors.NotFoundError,
      tf.errors.PermissionDeniedError,
  ):
    pass


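# Illustrative sketch (not part of the library): collect dataset directories
# across several namespace roots, silently skipping roots that do not exist or
# cannot be accessed. The root paths are hypothetical.
def _example_iter_namespace_dirs() -> List[epath.Path]:
  roots = [
      epath.Path('gs://my-bucket/datasets'),  # Hypothetical GCS root
      epath.Path('/tmp/local_datasets'),  # Hypothetical local root
  ]
  return [d for root in roots for d in _maybe_iterdir(root) if d.is_dir()]

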
def get_dataset_source(
    ds_path: epath.Path,
) -> Optional[dataset_sources_lib.DatasetSource]:
  """Returns a `DatasetSource` instance if the given path corresponds to a dataset.

  To determine whether the given path contains a dataset, a simple heuristic
  is used that checks whether the path has the following structure:

  ```
  <ds_name>/
      <ds_name>.py
  ```

  If so, all `.py`, `.txt`, `.tsv`, `.json` files will be added to the package.

  Args:
    ds_path: Path of the dataset module.

  Returns:
    A `DatasetSource` instance if the path matches the expected file structure.
  """
  filter_list = {'__init__.py'}
  suffixes_list = ('.txt', '.tsv', '.py', '.json')

  def is_interesting_file(fname: str) -> bool:
    return fname.endswith(suffixes_list) and fname not in filter_list

  if not ds_path.is_dir():
    return None
  all_filenames = set(f.name for f in ds_path.iterdir())
  if f'{ds_path.name}.py' not in all_filenames:
    return None
  return dataset_sources_lib.DatasetSource(
      root_path=ds_path,
      filenames=sorted(
          fname for fname in all_filenames if is_interesting_file(fname)),
  )


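# Illustrative sketch (not part of the library): probe a single candidate
# directory and report which files would be packaged. The path is hypothetical.
def _example_probe_dataset_dir() -> None:
  source = get_dataset_source(epath.Path('/tmp/datasets/dataset0'))
  if source is None:
    print('Not a dataset directory')
  else:
    print(f'Packaging files: {source.filenames}')

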
def _extract_split_files(data_dir: epath.Path) -> _SplitFilesDict:
  """Extracts the example files from `data_dir`, grouped by split."""
  files = sorted(data_dir.iterdir())
  file_infos = [
      naming.FilenameInfo.from_str(f.name)
      for f in files
      if naming.FilenameInfo.is_valid(f.name)
  ]
  if not file_infos:
    raise ValueError(
        f'No example files detected in {data_dir}. Make sure to follow the '
        'pattern: '
        '`<dataset_name>-<split_name>.<file-extension>-xxxxxx-of-yyyyyy`')

  split_files = collections.defaultdict(list)
  for file_info in file_infos:
    split_files[file_info.split].append(file_info)

  return split_files


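# Illustrative sketch (not part of the library): group the shards of a
# prepared dataset directory by split and count them. The data directory is
# hypothetical.
def _example_count_shards_per_split() -> Dict[str, int]:
  split_files = _extract_split_files(epath.Path('/tmp/tfds/mnist/3.0.1'))
  return {split: len(infos) for split, infos in split_files.items()}

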
def _generate_examples(self,
                       annotations: epath.Path,
                       all_frames: epath.Path,
                       video_range_to_use: Optional[Tuple[int, int]] = None):
  """Yields (key, example) tuples for the videos in the given range."""
  beam = tfds.core.lazy_imports.apache_beam
  annotations = json.loads(annotations.read_text())
  video_id_to_tracks, videos = _build_annotations_index(annotations)
  height = self._builder_config.height
  width = self._builder_config.width
  only_frames_with_labels = self._builder_config.only_frames_with_labels

  def _frame_index(frame_filename):
    """Converts a video frame filename into a numerical index."""
    basename = os.path.basename(os.fspath(frame_filename))
    return int(basename.split('.')[0])

  def _process_example(video_id):
    """Processes a single video into a data example."""
    # Build a fresh dict per video so examples don't share mutable state.
    data_example = {}
    video = videos[video_id]
    if only_frames_with_labels:
      frames_list = [all_frames / file for file in video['file_names']]
    else:
      video_dir = os.path.dirname(video['file_names'][0])
      video_directory = all_frames / video_dir
      frames_list = list(video_directory.glob('*'))
    frames_list = sorted(frames_list, key=_frame_index)
    data_example['metadata'] = _create_metadata(video, height, width,
                                                len(frames_list))
    data_example['tracks'] = []
    track_annotations = video_id_to_tracks[video_id]
    for track in track_annotations:
      data_example['tracks'].append(
          _create_per_track_annotation(video, frames_list, track, height,
                                       width))
    data_example['video'] = self._maybe_resize_video(frames_list)
    return data_example['metadata']['video_name'], data_example

  video_keys = list(videos.keys())
  if video_range_to_use is not None:
    video_keys = video_keys[video_range_to_use[0]:video_range_to_use[1]]

  return beam.Create(video_keys) | beam.Map(_process_example)


def exists(path: epath.Path) -> bool:
  """Checks if path exists. Returns False if issues occur connecting to GCS."""
  try:
    return path.exists()
  except GCS_UNAVAILABLE_EXCEPTIONS:  # pylint: disable=catching-non-exception
    return False


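# Illustrative sketch (not part of the library): fall back to a local copy
# when the GCS bucket cannot be reached. Both paths are hypothetical.
def _example_pick_reachable_path() -> epath.Path:
  gcs_path = epath.Path('gs://my-bucket/dataset_info')  # Hypothetical GCS path
  local_path = epath.Path('/tmp/dataset_info')  # Hypothetical local fallback
  return gcs_path if exists(gcs_path) else local_path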