def test_read_only_builder(code_builder: dataset_builder.DatasetBuilder):
  """Builder can be created from the files only."""
  # Reconstruct the dataset
  builder = read_only_builder.builder_from_directory(code_builder.data_dir)
  assert builder.name == code_builder.name
  assert builder.data_dir == code_builder.data_dir
  assert builder.info.version == code_builder.info.version
  assert builder.info.full_name == code_builder.info.full_name
  assert repr(builder.info) == repr(code_builder.info)
  assert builder.VERSION == code_builder.info.version
  assert builder.__module__ == type(code_builder).__module__
  assert read_only_builder.ReadOnlyBuilder.VERSION is None

  if code_builder.builder_config:
    assert builder.builder_config
    code_config = code_builder.builder_config
    file_config = builder.builder_config
    # Config attributes should be restored too
    assert code_config.name == file_config.name
    assert code_config.description == file_config.description
    assert code_config.version == file_config.version

  # Test that the dataset can be read: the restored builder should yield the
  # same examples as the original code-defined builder.
  ds = dataset_utils.as_numpy(builder.as_dataset(split='train').take(5))
  origin_ds = dataset_utils.as_numpy(
      code_builder.as_dataset(split='train').take(5))
  assert [ex['id'] for ex in ds] == [ex['id'] for ex in origin_ds]

  builder.download_and_prepare()  # Should be a no-op
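# The `test_read_only_builder` variants in this section assume a
# `code_builder` pytest fixture that provides an already-generated builder.
# A minimal sketch of such a fixture, assuming `testing.DummyDataset` (used
# in `test_write_metadata` below) is a suitable stand-in; the real fixture
# may be parametrized over several builders:
@pytest.fixture
def code_builder(tmp_path: pathlib.Path) -> dataset_builder.DatasetBuilder:
  builder = testing.DummyDataset(data_dir=tmp_path)
  builder.download_and_prepare()  # Generate the data the tests read back
  return builder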
def test_write_metadata(
    tmp_path: pathlib.Path,
    file_format,
):
  tmp_path = utils.as_path(tmp_path)
  src_builder = testing.DummyDataset(
      data_dir=tmp_path / 'origin',
      file_format=file_format,
  )
  src_builder.download_and_prepare()

  dst_dir = tmp_path / 'copy'
  dst_dir.mkdir()
  # Copy all the tfrecord files, but not the dataset info
  for f in src_builder.data_path.iterdir():
    if naming.FilenameInfo.is_valid(f.name):
      f.copy(dst_dir / f.name)

  metadata_path = dst_dir / 'dataset_info.json'

  if file_format is None:
    split_infos = list(src_builder.info.splits.values())
  else:
    split_infos = None  # Auto-compute split infos

  assert not metadata_path.exists()
  write_metadata_utils.write_metadata(
      data_dir=dst_dir,
      features=src_builder.info.features,
      split_infos=split_infos,
      description='my test description.')
  assert metadata_path.exists()

  # After metadata are written, the builder can be restored from the directory
  builder = read_only_builder.builder_from_directory(dst_dir)
  assert builder.name == 'dummy_dataset'
  assert builder.version == '1.0.0'
  assert set(builder.info.splits) == {'train'}
  assert builder.info.splits['train'].num_examples == 3
  assert builder.info.description == 'my test description.'

  # Values are the same
  src_ds = src_builder.as_dataset(split='train')
  ds = builder.as_dataset(split='train')
  assert list(src_ds.as_numpy_iterator()) == list(ds.as_numpy_iterator())
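# For large datasets, split infos can be pre-computed once and passed to
# `write_metadata` (see its docstring below) instead of being re-computed
# from the records each time. A minimal sketch, assuming the
# `data_dir`/`out_dir` kwargs of `tfds.folder_dataset.compute_split_info`
# and a hypothetical dataset path:
import tensorflow_datasets as tfds

split_infos = tfds.folder_dataset.compute_split_info(
    data_dir='/path/to/my_dataset/1.0.0',  # Hypothetical folder of tf-records
    out_dir='/path/to/split_cache',  # Pre-computed infos are saved here
)
# `split_infos` (or the `out_dir` path) can then be forwarded to
# `write_metadata`.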
def test_read_only_builder(code_builder: dataset_builder.DatasetBuilder):
  """Builder can be created from the files only."""
  # Reconstruct the dataset
  builder = read_only_builder.builder_from_directory(code_builder.data_dir)
  assert builder.name == code_builder.name
  assert builder.data_dir == code_builder.data_dir
  assert builder.info.version == code_builder.info.version
  assert builder.info.full_name == code_builder.info.full_name
  assert repr(builder.info) == repr(code_builder.info)
  assert builder.VERSION is None

  # Test that the dataset can be read: compare against the original builder,
  # not the restored one.
  ds = dataset_utils.as_numpy(builder.as_dataset(split='train').take(5))
  origin_ds = dataset_utils.as_numpy(
      code_builder.as_dataset(split='train').take(5))
  assert [ex['label'] for ex in ds] == [ex['label'] for ex in origin_ds]

  builder.download_and_prepare()  # Should be a no-op
def test_read_only_builder(
    builder_cls: Type[dataset_builder.DatasetBuilder],
    tmp_path: pathlib.Path,
):
  # Generate the dataset
  origin_builder = builder_cls(data_dir=tmp_path)
  origin_builder.download_and_prepare()

  # Reconstruct the dataset
  builder = read_only_builder.builder_from_directory(origin_builder.data_dir)
  assert builder.name == origin_builder.name
  assert builder.data_dir == origin_builder.data_dir
  assert builder.info.version == origin_builder.info.version
  assert builder.info.full_name == origin_builder.info.full_name
  assert repr(builder.info) == repr(origin_builder.info)
  assert builder.VERSION is None

  # Test that the dataset can be read: compare against the original builder,
  # not the restored one.
  ds = dataset_utils.as_numpy(builder.as_dataset(split='train').take(5))
  origin_ds = dataset_utils.as_numpy(
      origin_builder.as_dataset(split='train').take(5))
  assert [ex['label'] for ex in ds] == [ex['label'] for ex in origin_ds]

  builder.download_and_prepare()  # Should be a no-op
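# This variant takes the builder class directly rather than a fixture. A
# hedged sketch of the parametrization it assumes (`testing.DummyMnist` is a
# TFDS testing builder with a 'label' feature; the exact list of classes is
# an assumption):
@pytest.mark.parametrize('builder_cls', [testing.DummyMnist])
def test_read_only_builder(builder_cls, tmp_path):
  ...  # Body as above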
def builder(
    name: str, *, data_dir: Optional[str] = None, **builder_init_kwargs: Any
) -> dataset_builder.DatasetBuilder:
  """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`). This can be
      either `'dataset_name'` or `'dataset_name/config_name'` for datasets
      with `BuilderConfig`s. As a convenience, this string may contain
      comma-separated keyword arguments for the builder. For example
      `'foo_bar/a=True,b=3'` would use the `FooBar` dataset passing the
      keyword arguments `a=True` and `b=3` (for builders with configs, it
      would be `'foo_bar/zoo/a=True,b=3'` to use the `'zoo'` config and pass
      to the builder keyword arguments `a=True` and `b=3`).
    data_dir: Path to the dataset(s). See `tfds.load` for more information.
    **builder_init_kwargs: `dict` of keyword arguments passed to the
      `DatasetBuilder`. These will override keyword arguments passed in
      `name`, if any.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  builder_name, builder_kwargs = _dataset_name_and_kwargs_from_name_str(name)
  # Try loading the code (if it exists)
  try:
    cls = builder_cls(builder_name)
  except DatasetNotFoundError as e:
    if e.is_abstract:
      raise  # An abstract class can't be instantiated from code or files.
    cls = None  # Class not found
    not_found_error = e  # Save the exception to eventually reraise

  version_explicitly_given = "version" in builder_kwargs

  # Try loading from files first:
  # * If code not present.
  # * If version explicitly given (backward/forward compatibility).
  # Note: If `builder_init_kwargs` are set (e.g. version='experimental_latest',
  # custom config,...), read from the generation code.
  if (not cls or version_explicitly_given) and not builder_init_kwargs:
    builder_dir = find_builder_dir(name, data_dir=data_dir)
    if builder_dir is not None:  # A generated dataset was found on disk
      return read_only_builder.builder_from_directory(builder_dir)

  # If loading from files was skipped (e.g. files not found), load from the
  # source code.
  if cls:
    with py_utils.try_reraise(prefix=f"Failed to construct dataset {name}: "):
      return cls(data_dir=data_dir, **builder_kwargs, **builder_init_kwargs)  # pytype: disable=not-instantiable

  # If neither the code nor the files are found, raise DatasetNotFoundError
  raise not_found_error
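# Hedged usage examples of the name-string syntax documented above (`mnist`
# and `wikipedia` are real TFDS datasets; `foo_bar` is the docstring's
# hypothetical example):
import tensorflow_datasets as tfds

mnist_builder = tfds.builder('mnist')  # Plain dataset name
wiki_builder = tfds.builder('wikipedia/20200301.en')  # With a config name
foo_builder = tfds.builder('foo_bar/a=True,b=3')  # Kwargs embedded in `name`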
def test_not_exists(tmp_path: pathlib.Path):
  with pytest.raises(
      FileNotFoundError, match='Could not load `ReadOnlyBuilder`'
  ):
    read_only_builder.builder_from_directory(tmp_path)
def write_metadata(
    *,
    data_dir: epath.PathLike,
    features: features_lib.feature.FeatureConnectorArg,
    split_infos: Union[None, epath.PathLike, List[split_lib.SplitInfo]] = None,
    version: Union[None, str, utils.Version] = None,
    check_data: bool = True,
    **ds_info_kwargs,
) -> None:
  """Adds the metadata required to load the data with TFDS.

  See documentation for usage:
  https://www.tensorflow.org/datasets/external_tfrecord

  Args:
    data_dir: Dataset path on which to save the metadata.
    features: dict of `tfds.features.FeatureConnector` matching the proto
      specs.
    split_infos: Can be either:
      * A path to the pre-computed split info values (the `out_dir` kwarg of
        `tfds.folder_dataset.compute_split_info`)
      * A list of `tfds.core.SplitInfo` (returned value of
        `tfds.folder_dataset.compute_split_info`)
      * `None` to auto-compute the split info.
    version: Optional dataset version (auto-inferred by default, with a
      fallback to 1.0.0).
    check_data: If True, performs an additional check to validate that the
      data in `data_dir` is valid.
    **ds_info_kwargs: Additional metadata forwarded to `tfds.core.DatasetInfo`
      (description, homepage,...). Will appear in the doc.
  """
  features = features_lib.features_dict.to_feature(features)
  data_dir = epath.Path(data_dir)

  # Extract the tf-record filenames
  tfrecord_files = [
      f for f in data_dir.iterdir() if naming.FilenameInfo.is_valid(f.name)
  ]
  if not tfrecord_files:
    raise ValueError(
        f'Could not find tf-record (or compatible format) in {data_dir}. '
        'Make sure to follow the pattern: '
        '`<dataset_name>-<split_name>.<file-extension>-xxxxxx-of-yyyyyy`')
  file_infos = [naming.FilenameInfo.from_str(f.name) for f in tfrecord_files]

  # Use a set with tuple unpacking to ensure all names are consistent
  # (unpacking raises if the set contains more than one element).
  snake_name, = {f.dataset_name for f in file_infos}
  camel_name = naming.snake_to_camelcase(snake_name)
  filetype_suffix, = {f.filetype_suffix for f in file_infos}
  file_format = file_adapters.file_format_from_suffix(filetype_suffix)

  cls = types.new_class(
      camel_name,
      bases=(_WriteBuilder,),
      kwds=dict(skip_registration=True),
      exec_body=None,
  )

  if version is None:  # Automatically detect the version
    if utils.Version.is_valid(data_dir.name):
      version = data_dir.name
    else:
      version = '1.0.0'
  cls.VERSION = utils.Version(version)

  # Create a dummy builder (use a non-existent folder to make sure
  # dataset_info.json is not restored)
  builder = cls(file_format=file_format, data_dir='/tmp/non-existent-dir/')

  # Create the metadata
  ds_info = dataset_info.DatasetInfo(
      builder=builder,
      features=features,
      **ds_info_kwargs,
  )
  ds_info.set_file_format(file_format)

  # Add the split infos
  split_dict = _load_splits(
      data_dir=data_dir,
      split_infos=split_infos,
      file_infos=file_infos,
      filetype_suffix=filetype_suffix,
      builder=builder,
  )
  ds_info.set_splits(split_dict)

  # Save all metadata (dataset_info.json, features.json,...)
  ds_info.write_to_directory(data_dir)

  # Make sure that the data can be loaded (feature connectors match the
  # actual specs)
  if check_data:
    utils.print_notebook(
        'Metadata written. Testing by reading first example. '
        'Set check_data=False to skip.')
    builder = read_only_builder.builder_from_directory(data_dir)
    split_name = next(iter(builder.info.splits))
    _, = builder.as_dataset(split=f'{split_name}[:1]')  # Load the first example
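# A minimal end-to-end sketch of making externally generated tf-record files
# loadable with TFDS (paths and features are illustrative; see
# https://www.tensorflow.org/datasets/external_tfrecord):
import tensorflow_datasets as tfds

data_dir = '/path/to/my_dataset/1.0.0'  # Hypothetical: contains files named
                                        # `my_dataset-train.tfrecord-00000-of-00001`, ...
tfds.folder_dataset.write_metadata(
    data_dir=data_dir,
    features=tfds.features.FeaturesDict({
        'image': tfds.features.Image(shape=(None, None, 3)),
        'label': tfds.features.ClassLabel(names=['no', 'yes']),
    }),
    split_infos=None,  # Auto-compute split sizes by reading the files
    description='My externally generated dataset.',
)
# The folder can now be restored like any generated TFDS dataset:
builder = tfds.builder_from_directory(data_dir)
ds = builder.as_dataset(split='train')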
def test_builder_from_directory_dir_not_exists(tmp_path: pathlib.Path):
  with pytest.raises(FileNotFoundError, match='Could not load dataset info'):
    read_only_builder.builder_from_directory(tmp_path)