def builder(
    self,
    ns_name: Optional[str],
    builder_name: str,
    **builder_kwargs: Any,
) -> dataset_builder.DatasetBuilder:
  """Returns the dataset builder for an already-generated namespaced dataset.

  Args:
    ns_name: Namespace the dataset belongs to (e.g. `kaggle`).
    builder_name: Name of the dataset inside the namespace.
    **builder_kwargs: Keyword arguments forwarded to the builder. Must not
      contain `data_dir`: the data dir is resolved from the namespace.

  Returns:
    The `DatasetBuilder`, restored from the generated files.

  Raises:
    ValueError: If `data_dir` is explicitly passed in `builder_kwargs`.
    AssertionError: If `ns_name` is `None`.
    KeyError: If the namespace is not registered in `self._ns2data_dir`.
  """
  if 'data_dir' in builder_kwargs:
    raise ValueError(
        '`data_dir` cannot be set for data_dir-based community datasets. '
        'Dataset should already be generated.')
  if ns_name is None:
    raise AssertionError(f'No namespace found: {builder_name}')
  if ns_name not in self._ns2data_dir:
    # Suggest the closest registered namespace to help with typos.
    close_matches = difflib.get_close_matches(ns_name, self._ns2data_dir, n=1)
    hint = f'\nDid you mean: {close_matches[0]}' if close_matches else ''
    raise KeyError(
        f'Namespace `{ns_name}` for `{builder_name}` not found. '
        f'Should be one of {sorted(self._ns2data_dir)}{hint}')
  return read_only_builder.builder_from_files(
      builder_name,
      data_dir=self._ns2data_dir[ns_name],
      **builder_kwargs,
  )
def builder(
    self,
    name: utils.DatasetName,
    **builder_kwargs: Any,
) -> dataset_builder.DatasetBuilder:
  """Returns the dataset builder for an already-generated namespaced dataset.

  Args:
    name: Dataset name, including its namespace (e.g. `kaggle:mnist`).
    **builder_kwargs: Keyword arguments forwarded to the builder. Must not
      contain `data_dir`: the data dir is resolved from the namespace.

  Returns:
    The `DatasetBuilder`, restored from the generated files.

  Raises:
    ValueError: If `data_dir` is explicitly passed in `builder_kwargs`.
    AssertionError: If `name` has no namespace.
    KeyError: If the namespace is not registered in `self._ns2data_dir`.
  """
  data_dir = builder_kwargs.pop('data_dir', None)
  if data_dir:
    raise ValueError(
        '`data_dir` cannot be set for data_dir-based community datasets. '
        f'Dataset should already be generated. Got: {data_dir}')
  if name.namespace is None:
    raise AssertionError(f'No namespace found: {name}')
  if name.namespace not in self._ns2data_dir:  # pylint: disable=unsupported-membership-test
    # Suggest the closest registered namespace to help with typos.
    close_matches = difflib.get_close_matches(
        name.namespace, self._ns2data_dir, n=1)
    hint = f'\nDid you mean: {close_matches[0]}' if close_matches else ''
    raise KeyError(
        f'Namespace `{name.namespace}` for `{name}` not found. '
        f'Should be one of {sorted(self._ns2data_dir)}{hint}')
  return read_only_builder.builder_from_files(
      name.name,
      data_dir=self._ns2data_dir[name.namespace],
      **builder_kwargs,
  )
def test_builder_from_files_multi_dir(
    code_builder: dataset_builder.DatasetBuilder,
    tmp_path: pathlib.Path,
):
  """`builder_from_files` accepts a list of data dirs to search."""
  extra_dir = tmp_path / 'other'
  extra_dir.mkdir()
  # The list mixes the real root, an empty dir, and a non-existing path.
  search_dirs = [
      code_builder._data_dir_root,
      extra_dir,
      '/tmp/non-existing-dir/',
  ]
  restored = read_only_builder.builder_from_files(
      code_builder.name,
      data_dir=search_dirs,
  )
  assert restored.name == code_builder.name
  assert restored.data_dir == code_builder.data_dir
def test_metadata(self):
  """Metadata is written on prepare and restored on later loads."""
  expected = {"some_key": 123}
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    builder = RandomShapedImageGenerator(data_dir=tmp_dir)
    builder.download_and_prepare()
    # Metadata was created during download_and_prepare.
    self.assertEqual(builder.info.metadata, expected)

    # A fresh builder instance restores the metadata from disk.
    reloaded = RandomShapedImageGenerator(data_dir=tmp_dir)
    self.assertEqual(reloaded.info.metadata, expected)

    # Restoring purely from files (builder code unavailable) also
    # recovers the metadata.
    file_builder = read_only_builder.builder_from_files(
        builder.name,
        data_dir=tmp_dir,
    )
    self.assertEqual(file_builder.info.metadata, expected)
def builder(name: str,
            *,
            try_gcs: bool = False,
            **builder_kwargs: Any) -> dataset_builder.DatasetBuilder:
  """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`).
      This can be either `'dataset_name'` or
      `'dataset_name/config_name'` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `'foo_bar/a=True,b=3'` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `'foo_bar/zoo/a=True,b=3'` to
      use the `'zoo'` config and pass to the builder keyword arguments
      `a=True` and `b=3`).
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on
      the public GCS bucket before building it locally.
    **builder_kwargs: `dict` of keyword arguments passed to the
      `tfds.core.DatasetBuilder`.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  # 'kaggle:my_dataset:1.0.0' -> ('kaggle', 'my_dataset', {'version': '1.0.0'})
  ns_name, builder_name, builder_kwargs = naming.parse_builder_name_kwargs(
      name, **builder_kwargs)

  # `try_gcs` currently only supports non-community datasets.
  if try_gcs and not ns_name and gcs_utils.is_dataset_on_gcs(builder_name):
    data_dir = builder_kwargs.get('data_dir')
    if data_dir:
      raise ValueError(
          f'Cannot have both `try_gcs=True` and `data_dir={data_dir}` '
          'explicitly set')
    builder_kwargs['data_dir'] = gcs_utils.gcs_path('datasets')

  # Community datasets are not supported here yet.
  if ns_name:
    raise NotImplementedError

  # First check whether code exists or not (imported datasets).
  try:
    cls = builder_cls(builder_name)
  except registered.DatasetNotFoundError as e:
    cls = None  # Class not found
    not_found_error = e  # Save the exception to eventually reraise

  # Eventually try loading from files first.
  if _try_load_from_files_first(cls, **builder_kwargs):
    try:
      return read_only_builder.builder_from_files(builder_name,
                                                  **builder_kwargs)
    except registered.DatasetNotFoundError:
      pass  # Files not found: fall through to the source-code path below.

  # If code exists and loading from files was skipped (e.g. files not found),
  # load from the source code.
  if cls:
    with py_utils.try_reraise(prefix=f'Failed to construct dataset {name}: '):
      return cls(**builder_kwargs)  # pytype: disable=not-instantiable

  # If neither the code nor the files are found, raise DatasetNotFoundError.
  raise not_found_error