def test_mnist(self):
  """Smoke-tests reading MNIST straight from the public GCS bucket."""
  with self.gcs_access():
    mnist_builder = tfds.image_classification.MNIST(
        data_dir=gcs_utils.gcs_path('datasets'))
    train_one = tfds.as_numpy(mnist_builder.as_dataset(split='train').take(1))
    first = next(train_one)
    # Touch both features so decoding is actually exercised.
    _ = first['image'], first['label']
def initialize_from_bucket(self):
  """Initialize DatasetInfo from GCS bucket info files.

  In order to support Colab, the metadata files are fetched through the
  HTTP GCS API, copied into a local temporary directory, and then loaded
  with `read_from_directory`. No-op when the bucket has no info files for
  this dataset.
  """
  data_files = gcs_utils.gcs_dataset_info_files(self.full_name)
  if not data_files:
    return
  logging.info("Load pre-computed DatasetInfo (eg: splits, num examples,...) "
               "from GCS: %s", self.full_name)
  # Only create the temp dir once we know there is something to copy
  # (previously an empty directory was leaked on the early-return path).
  tmp_dir = tempfile.mkdtemp("tfds")
  for fname in data_files:
    out_fname = os.path.join(tmp_dir, os.path.basename(fname))
    tf.io.gfile.copy(gcs_utils.gcs_path(fname), out_fname)
  self.read_from_directory(tmp_dir)
def load(
    name: str,
    *,
    split: Optional[Tree[splits_lib.Split]] = None,
    data_dir: Optional[str] = None,
    batch_size: Optional[int] = None,
    shuffle_files: bool = False,
    download: bool = True,
    as_supervised: bool = False,
    decoders: Optional[TreeDict[decode.Decoder]] = None,
    read_config: Optional[read_config_lib.ReadConfig] = None,
    with_info: bool = False,
    builder_kwargs: Optional[Dict[str, Any]] = None,
    download_and_prepare_kwargs: Optional[Dict[str, Any]] = None,
    as_dataset_kwargs: Optional[Dict[str, Any]] = None,
    try_gcs: bool = False,
):
  # pylint: disable=line-too-long
  """Loads the named dataset into a `tf.data.Dataset`.

  `tfds.load` is a convenience wrapper that chains three steps:

  1. Fetch the `tfds.core.DatasetBuilder` by name:
     `builder = tfds.builder(name, data_dir=data_dir, **builder_kwargs)`
  2. Generate the data (when `download=True`):
     `builder.download_and_prepare(**download_and_prepare_kwargs)`
  3. Load the `tf.data.Dataset` object:
     `ds = builder.as_dataset(split=split, as_supervised=as_supervised,
     shuffle_files=shuffle_files, read_config=read_config, decoders=decoders,
     **as_dataset_kwargs)`

  See https://www.tensorflow.org/datasets/overview#load_a_dataset for more
  examples. Pass the return value to `tfds.as_numpy` if you want NumPy arrays
  instead of `tf.data.Dataset`s / `tf.Tensor`s.

  **Warning**: calling this function might potentially trigger the download
  of hundreds of GiB to disk. Refer to the `download` argument.

  Args:
    name: `str`, registered name of the `DatasetBuilder` (snake case class
      name), either `"dataset_name"` or `"dataset_name/config_name"`. May
      embed comma-separated builder kwargs, e.g. `"foo_bar/a=True,b=3"`
      (or `"foo_bar/zoo/a=True,b=3"` with a config).
    split: Which split(s) to load (e.g. `'train'`, `['train', 'test']`,
      `'train[80%:]'`; see the
      [split API guide](https://www.tensorflow.org/datasets/splits)).
      If `None`, returns all splits as a `Dict[Split, tf.data.Dataset]`.
    data_dir: `str`, directory to read/write data. Defaults to the value of
      the environment variable TFDS_DATA_DIR, if set, otherwise falls back
      to "~/tensorflow_datasets".
    batch_size: `int`, if set, adds a batch dimension to examples (variable
      length features are 0-padded). `batch_size=-1` returns the full
      dataset as `tf.Tensor`s.
    shuffle_files: `bool`, whether to shuffle the input files. Defaults to
      `False`.
    download: `bool`, whether to call `download_and_prepare` before
      `as_dataset`. If `False`, data is expected to be in `data_dir`; if
      `True` and the data is already there, `download_and_prepare` is a
      no-op.
    as_supervised: `bool`, if `True`, datasets have a 2-tuple structure
      `(input, label)` per `builder.info.supervised_keys`; if `False` (the
      default), a dictionary with all the features.
    decoders: Nested dict of `Decoder` objects customizing decoding,
      matching the feature structure (only customized keys needed). See
      [the guide](https://github.com/tensorflow/datasets/tree/master/docs/decode.md).
    read_config: `tfds.ReadConfig`, additional input-pipeline options
      (e.g. seed, num parallel reads,...).
    with_info: `bool`, if True, return the tuple
      `(tf.data.Dataset, tfds.core.DatasetInfo)`.
    builder_kwargs: `dict` (optional), keyword arguments for the
      `tfds.core.DatasetBuilder` constructor; `data_dir` is passed through
      by default.
    download_and_prepare_kwargs: `dict` (optional), keyword arguments for
      `download_and_prepare` when `download=True` (e.g. cache_dir,
      manual_dir; deduced from data_dir when unset).
    as_dataset_kwargs: `dict` (optional), keyword arguments passed to
      `tfds.core.DatasetBuilder.as_dataset`.
    try_gcs: `bool`, if True, check whether the dataset exists on the
      public GCS bucket before building it locally.

  Returns:
    ds: `tf.data.Dataset`, the dataset requested, or if `split` is None, a
      `dict<key: tfds.Split, value: tf.data.Dataset>`. If `batch_size=-1`,
      these will be full datasets as `tf.Tensor`s.
    ds_info: `tfds.core.DatasetInfo`, only when `with_info` is True. Note
      that `ds_info` documents the entire dataset regardless of the `split`
      requested; split-specific information is in `ds_info.splits`.
  """
  # pylint: enable=line-too-long

  # Prefer the public GCS mirror when requested and available.
  if try_gcs and gcs_utils.is_dataset_on_gcs(name):
    data_dir = gcs_utils.gcs_path("datasets")

  dbuilder = builder(name, data_dir=data_dir, **(builder_kwargs or {}))
  if download:
    dbuilder.download_and_prepare(**(download_and_prepare_kwargs or {}))

  # Explicit arguments take effect only when not already present in
  # `as_dataset_kwargs` (copied so the caller's dict is never mutated).
  all_ds_kwargs = dict(as_dataset_kwargs or {})
  for key, value in (
      ("split", split),
      ("as_supervised", as_supervised),
      ("batch_size", batch_size),
      ("decoders", decoders),
      ("shuffle_files", shuffle_files),
      ("read_config", read_config),
  ):
    all_ds_kwargs.setdefault(key, value)

  ds = dbuilder.as_dataset(**all_ds_kwargs)
  if with_info:
    return ds, dbuilder.info
  return ds
def builder(name: str, *, try_gcs: bool = False,
            **builder_kwargs: Any) -> dataset_builder.DatasetBuilder:
  """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`). This can be
      either `'dataset_name'` or `'dataset_name/config_name'` for datasets
      with `BuilderConfig`s. As a convenience, this string may contain
      comma-separated keyword arguments for the builder. For example
      `'foo_bar/a=True,b=3'` would use the `FooBar` dataset passing the
      keyword arguments `a=True` and `b=3` (for builders with configs, it
      would be `'foo_bar/zoo/a=True,b=3'` to use the `'zoo'` config and pass
      to the builder keyword arguments `a=True` and `b=3`).
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on
      the public GCS bucket before building it locally.
    **builder_kwargs: `dict` of keyword arguments passed to the
      `tfds.core.DatasetBuilder`.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  # 'kaggle:my_dataset:1.0.0' -> ('kaggle', 'my_dataset', {'version': '1.0.0'})
  ns_name, builder_name, builder_kwargs = naming.parse_builder_name_kwargs(
      name, **builder_kwargs)

  # `try_gcs` currently only support non-community datasets
  if try_gcs and not ns_name and gcs_utils.is_dataset_on_gcs(builder_name):
    data_dir = builder_kwargs.get('data_dir')
    if data_dir:
      raise ValueError(
          f'Cannot have both `try_gcs=True` and `data_dir={data_dir}` '
          'explicitly set')
    builder_kwargs['data_dir'] = gcs_utils.gcs_path('datasets')

  # Community datasets
  if ns_name:
    raise NotImplementedError

  # Check whether the generation code is importable (registered datasets).
  not_found_error = None
  try:
    cls = builder_cls(builder_name)
  except registered.DatasetNotFoundError as lookup_error:
    cls = None  # Class not found
    not_found_error = lookup_error  # Saved so it can be reraised at the end

  # Prefer reconstructing the builder from pre-generated files when possible.
  if _try_load_from_files_first(cls, **builder_kwargs):
    try:
      return read_only_builder.builder_from_files(builder_name,
                                                  **builder_kwargs)
    except registered.DatasetNotFoundError:
      pass  # Files missing: fall through to the source-code path.

  # Code exists and loading from files was skipped (or failed): instantiate
  # the builder from its generation code.
  if cls:
    with py_utils.try_reraise(
        prefix=f'Failed to construct dataset {name}: '):
      return cls(**builder_kwargs)  # pytype: disable=not-instantiable

  # Neither the code nor the files were found.
  raise not_found_error
def builder(
    name: str,
    *,
    data_dir: Optional[str] = None,
    try_gcs: bool = False,
    **builder_init_kwargs: Any
) -> dataset_builder.DatasetBuilder:
  """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`). This can be
      either `'dataset_name'` or `'dataset_name/config_name'` for datasets
      with `BuilderConfig`s. As a convenience, this string may contain
      comma-separated keyword arguments for the builder. For example
      `'foo_bar/a=True,b=3'` would use the `FooBar` dataset passing the
      keyword arguments `a=True` and `b=3` (for builders with configs, it
      would be `'foo_bar/zoo/a=True,b=3'` to use the `'zoo'` config and pass
      to the builder keyword arguments `a=True` and `b=3`).
    data_dir: Path to the dataset(s). See `tfds.load` for more information.
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on
      the public GCS bucket before building it locally.
    **builder_init_kwargs: `dict` of keyword arguments passed to the
      `DatasetBuilder`. These will override keyword arguments passed in
      `name`, if any.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  builder_name, builder_kwargs = dataset_name_and_kwargs_from_name_str(name)

  # Set data_dir: the public GCS mirror takes precedence when available.
  if try_gcs and gcs_utils.is_dataset_on_gcs(builder_name):
    data_dir = gcs_utils.gcs_path("datasets")

  # Try loading the code (if it exists)
  try:
    cls = builder_cls(builder_name)
  except DatasetNotFoundError as e:
    if e.is_abstract:
      raise  # Abstract can't be instantiated, neither from code nor files.
    cls = None  # Class not found
    not_found_error = e  # Save the exception to eventually reraise

  version_explicitly_given = "version" in builder_kwargs

  # Try loading from files first:
  # * If code not present.
  # * If version explicitly given (backward/forward compatibility).
  # Note: If `builder_init_kwargs` are set (e.g. version='experimental_latest',
  # custom config,...), read from generation code.
  if (not cls or version_explicitly_given) and not builder_init_kwargs:
    builder_dir = find_builder_dir(name, data_dir=data_dir)
    if builder_dir is not None:  # A generated dataset was found on disk
      return read_only_builder.builder_from_directory(builder_dir)

  # If loading from files was skipped (e.g. files not found), load from the
  # source code.
  if cls:
    with py_utils.try_reraise(prefix=f"Failed to construct dataset {name}: "):
      # `builder_init_kwargs` are documented to override kwargs parsed from
      # `name`. Merge explicitly (later keys win): passing both dicts as
      # `cls(**builder_kwargs, **builder_init_kwargs)` would raise
      # `TypeError: got multiple values for keyword argument` on any
      # duplicated key instead of overriding it.
      merged_kwargs = {**builder_kwargs, **builder_init_kwargs}
      return cls(data_dir=data_dir, **merged_kwargs)  # pytype: disable=not-instantiable

  # If neither the code nor the files are found, raise DatasetNotFoundError
  raise not_found_error