def _maybe_log_gcs_data_dir(self):
  """If the dataset is hosted on GCS, log how to reuse the GCS copy."""
  if not gcs_utils.is_dataset_on_gcs(self.info.full_name):
    return

  gcs_path = os.path.join(constants.GCS_DATA_DIR, self.info.full_name)
  msg = GCS_HOSTED_MSG.format(
      name=self.name,
      gcs_path=gcs_path,
      local_data_dir_no_version=os.path.split(self._data_dir)[0])
  logging.info(msg)
def download_and_prepare(self, download_dir=None, download_config=None):
  """Downloads and prepares dataset for reading.

  Args:
    download_dir: `str`, directory where downloaded files are stored.
      Defaults to "~/tensorflow-datasets/downloads".
    download_config: `tfds.download.DownloadConfig`, further configuration for
      downloading and preparing the dataset.

  Raises:
    IOError: if there is not enough disk space available.
  """
  download_config = download_config or download.DownloadConfig()
  data_exists = tf.io.gfile.exists(self._data_dir)
  if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
    logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
    return

  dl_manager = self._make_download_manager(
      download_dir=download_dir,
      download_config=download_config)

  # Currently it's not possible to overwrite the data because it would
  # conflict with versioning: If the last version has already been generated,
  # it will always be reloaded and data_dir will be set at construction.
  if data_exists:
    raise ValueError(
        "Trying to overwrite an existing dataset {} at {}. A dataset with "
        "the same version {} already exists. If the dataset has changed, "
        "please update the version number.".format(
            self.name, self._data_dir, self.version))

  logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
  if not utils.has_sufficient_disk_space(self.info.size_in_bytes,
                                         directory=self._data_dir_root):
    raise IOError("Not enough disk space. Needed: %s" %
                  units.size_str(self.info.size_in_bytes))
  self._log_download_bytes()

  # Create a tmp dir and rename to self._data_dir on successful exit.
  with file_format_adapter.incomplete_dir(self._data_dir) as tmp_data_dir:
    # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
    # it to every sub function.
    with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
      if (download_config.try_download_gcs and
          gcs_utils.is_dataset_on_gcs(self.info.full_name)):
        logging.warning(GCS_HOSTED_MSG, self.name)
        gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
        self.info.read_from_directory(self._data_dir)
      else:
        self._download_and_prepare(dl_manager=dl_manager,
                                   download_config=download_config)

        # NOTE: If modifying the lines below to put additional information in
        # DatasetInfo, you'll likely also want to update
        # DatasetInfo.read_from_directory to possibly restore these attributes
        # when reading from package data.

        # Update DatasetInfo metadata by computing statistics from the data.
        if (download_config.compute_stats == download.ComputeStatsMode.SKIP or
            download_config.compute_stats == download.ComputeStatsMode.AUTO and
            bool(self.info.splits.total_num_examples)):
          logging.info("Skipping computing stats for mode %s.",
                       download_config.compute_stats)
        else:  # Mode is forced or stats do not exist yet
          logging.info("Computing statistics.")
          self.info.compute_dynamic_properties()
        self.info.size_in_bytes = dl_manager.downloaded_size
        # Write DatasetInfo to disk, even if we haven't computed statistics.
        self.info.write_to_directory(self._data_dir)
  self._log_download_done()
def is_dataset_accessible(self):
  # Re-enable GCS access. TestCase disables it.
  with self.gcs_access():
    self.assertTrue(gcs_utils.is_dataset_on_gcs("mnist/1.0.0"))
def download_and_prepare(self, download_dir=None, download_config=None):
  """Downloads and prepares dataset for reading.

  Args:
    download_dir: `str`, directory where downloaded files are stored.
      Defaults to "~/tensorflow-datasets/downloads".
    download_config: `tfds.download.DownloadConfig`, further configuration for
      downloading and preparing the dataset.

  Raises:
    IOError: if there is not enough disk space available.
  """
  download_config = download_config or download.DownloadConfig()
  data_exists = tf.io.gfile.exists(self._data_dir)
  if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
    logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
    return

  # Disable `download_and_prepare` under Python 2 (internally, we still allow
  # Py2 for `dataset_builder_tests.py` and co.).
  if _is_py2_download_and_prepare_disabled and six.PY2:
    raise NotImplementedError(
        "TFDS has dropped `builder.download_and_prepare` support for "
        "Python 2. Please update your code to Python 3.")

  if self.version.tfds_version_to_prepare:
    available_to_prepare = ", ".join(
        str(v) for v in self.versions if not v.tfds_version_to_prepare)
    raise AssertionError(
        "The version of the dataset you are trying to use ({}:{}) can only "
        "be generated using TFDS code synced @ {} or earlier. Either sync to "
        "that version of TFDS to first prepare the data or use another "
        "version of the dataset (available for `download_and_prepare`: "
        "{}).".format(self.name, self.version,
                      self.version.tfds_version_to_prepare,
                      available_to_prepare))

  # Only `cls.VERSION` or `experimental_latest` versions can be generated.
  # Otherwise, users may accidentally generate an old version using the
  # code from newer versions.
  installable_versions = {
      str(v) for v in (self.canonical_version, max(self.versions))
  }
  if str(self.version) not in installable_versions:
    msg = (
        "The version of the dataset you are trying to use ({}) is too "
        "old for this version of TFDS so cannot be generated.").format(
            self.info.full_name)
    if self.version.tfds_version_to_prepare:
      msg += (
          "{} can only be generated using TFDS code synced @ {} or earlier. "
          "Either sync to that version of TFDS to first prepare the data or "
          "use another version of the dataset. ").format(
              self.version, self.version.tfds_version_to_prepare)
    else:
      msg += (
          "Either sync to a previous version of TFDS to first prepare the "
          "data or use another version of the dataset. ")
    msg += "Available for `download_and_prepare`: {}".format(
        list(sorted(installable_versions)))
    raise ValueError(msg)

  # Currently it's not possible to overwrite the data because it would
  # conflict with versioning: If the last version has already been generated,
  # it will always be reloaded and data_dir will be set at construction.
  if data_exists:
    raise ValueError(
        "Trying to overwrite an existing dataset {} at {}. A dataset with "
        "the same version {} already exists. If the dataset has changed, "
        "please update the version number.".format(
            self.name, self._data_dir, self.version))

  logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
  if not utils.has_sufficient_disk_space(
      self.info.dataset_size + self.info.download_size,
      directory=self._data_dir_root):
    raise IOError(
        "Not enough disk space. Needed: {} (download: {}, generated: {})"
        .format(
            units.size_str(self.info.dataset_size + self.info.download_size),
            units.size_str(self.info.download_size),
            units.size_str(self.info.dataset_size),
        ))
  self._log_download_bytes()

  dl_manager = self._make_download_manager(
      download_dir=download_dir,
      download_config=download_config)

  # Create a tmp dir and rename to self._data_dir on successful exit.
  with utils.incomplete_dir(self._data_dir) as tmp_data_dir:
    # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
    # it to every sub function.
    with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
      if (download_config.try_download_gcs and
          gcs_utils.is_dataset_on_gcs(self.info.full_name)):
        logging.warning(GCS_HOSTED_MSG, self.name)
        gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
        self.info.read_from_directory(self._data_dir)
      else:
        self._download_and_prepare(dl_manager=dl_manager,
                                   download_config=download_config)

        # NOTE: If modifying the lines below to put additional information in
        # DatasetInfo, you'll likely also want to update
        # DatasetInfo.read_from_directory to possibly restore these attributes
        # when reading from package data.

        # Skip statistics computation if tfdv isn't present.
        try:
          import tensorflow_data_validation  # pylint: disable=g-import-not-at-top,import-outside-toplevel,unused-import  # pytype: disable=import-error
          skip_stats_computation = False
        except ImportError:
          skip_stats_computation = True

        splits = list(self.info.splits.values())
        statistics_already_computed = bool(
            splits and splits[0].statistics.num_examples)
        # Update DatasetInfo metadata by computing statistics from the data.
        if (skip_stats_computation or
            download_config.compute_stats == download.ComputeStatsMode.SKIP or
            download_config.compute_stats == download.ComputeStatsMode.AUTO and
            statistics_already_computed):
          logging.info("Skipping computing stats for mode %s.",
                       download_config.compute_stats)
        else:  # Mode is forced or stats do not exist yet
          logging.info("Computing statistics.")
          self.info.compute_dynamic_properties()
        self.info.download_size = dl_manager.downloaded_size
        # Write DatasetInfo to disk, even if we haven't computed statistics.
        self.info.write_to_directory(self._data_dir)
  self._log_download_done()
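# --- Usage sketch (added for illustration; not part of the original source) ---
# A minimal, hedged example of driving `download_and_prepare` through an
# explicit `tfds.download.DownloadConfig`, assuming the public
# `tensorflow_datasets` API. The dataset name and download directory are
# arbitrary placeholders.
def _example_download_and_prepare():
  import tensorflow_datasets as tfds

  mnist_builder = tfds.builder("mnist")
  download_config = tfds.download.DownloadConfig(
      # Reuse already-generated data when present (this is the mode checked at
      # the top of `download_and_prepare` above).
      download_mode=tfds.download.GenerateMode.REUSE_DATASET_IF_EXISTS,
      # Prefer the prebuilt copy on the public GCS bucket when available.
      try_download_gcs=True,
  )
  mnist_builder.download_and_prepare(
      download_dir="~/tensorflow_datasets/downloads",
      download_config=download_config,
  )
  return mnist_builder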
def test_is_dataset_accessible(self):
  # Re-enable GCS access. TestCase disables it.
  with self.gcs_access():
    is_ds_on_gcs = gcs_utils.is_dataset_on_gcs('mnist/1.0.0')
    self.assertTrue(is_ds_on_gcs)
def load(name,
         split=None,
         data_dir=None,
         batch_size=None,
         shuffle_files=False,
         download=True,
         as_supervised=False,
         decoders=None,
         read_config=None,
         with_info=False,
         builder_kwargs=None,
         download_and_prepare_kwargs=None,
         as_dataset_kwargs=None,
         try_gcs=False):
  # pylint: disable=line-too-long
  """Loads the named dataset into a `tf.data.Dataset`.

  If `split=None` (the default), returns all splits for the dataset. Otherwise,
  returns the specified split.

  `load` is a convenience method that fetches the `tfds.core.DatasetBuilder` by
  string name, optionally calls `DatasetBuilder.download_and_prepare`
  (if `download=True`), and then calls `DatasetBuilder.as_dataset`.
  This is roughly equivalent to:

  ```
  builder = tfds.builder(name, data_dir=data_dir, **builder_kwargs)
  if download:
    builder.download_and_prepare(**download_and_prepare_kwargs)
  ds = builder.as_dataset(
      split=split, as_supervised=as_supervised, **as_dataset_kwargs)
  if with_info:
    return ds, builder.info
  return ds
  ```

  If you'd like NumPy arrays instead of `tf.data.Dataset`s or `tf.Tensor`s,
  you can pass the return value to `tfds.as_numpy`.

  Callers must pass arguments as keyword arguments.

  **Warning**: calling this function might potentially trigger the download
  of hundreds of GiB to disk. Refer to the `download` argument.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the snake case
      version of the class name). This can be either `"dataset_name"` or
      `"dataset_name/config_name"` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `"foo_bar/a=True,b=3"` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `"foo_bar/zoo/a=True,b=3"` to
      use the `"zoo"` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    split: `tfds.Split` or `str`, which split of the data to load. If None,
      will return a `dict` with all splits (typically `tfds.Split.TRAIN` and
      `tfds.Split.TEST`).
    data_dir: `str` (optional), directory to read/write data.
      Defaults to "~/tensorflow_datasets".
    batch_size: `int`, if set, add a batch dimension to examples. Note that
      variable length features will be 0-padded. If `batch_size=-1`, will
      return the full dataset as `tf.Tensor`s.
    shuffle_files: `bool`, whether to shuffle the input files.
      Defaults to `False`.
    download: `bool` (optional), whether to call
      `tfds.core.DatasetBuilder.download_and_prepare` before calling
      `tf.DatasetBuilder.as_dataset`. If `False`, data is expected to be in
      `data_dir`. If `True` and the data is already in `data_dir`,
      `download_and_prepare` is a no-op.
    as_supervised: `bool`, if `True`, the returned `tf.data.Dataset` will have
      a 2-tuple structure `(input, label)` according to
      `builder.info.supervised_keys`. If `False`, the default, the returned
      `tf.data.Dataset` will have a dictionary with all the features.
    decoders: Nested dict of `Decoder` objects which allow to customize the
      decoding. The structure should match the feature structure, but only
      customized feature keys need to be present. See
      [the guide](https://github.com/tensorflow/datasets/tree/master/docs/decode.md)
      for more info.
    read_config: `tfds.ReadConfig`, Additional options to configure the input
      pipeline (e.g. seed, num parallel reads,...).
    with_info: `bool`, if True, tfds.load will return the tuple
      (tf.data.Dataset, tfds.core.DatasetInfo) containing the info associated
      with the builder.
    builder_kwargs: `dict` (optional), keyword arguments to be passed to the
      `tfds.core.DatasetBuilder` constructor. `data_dir` will be passed through
      by default.
    download_and_prepare_kwargs: `dict` (optional) keyword arguments passed to
      `tfds.core.DatasetBuilder.download_and_prepare` if `download=True`. Allow
      to control where to download and extract the cached data. If not set,
      cache_dir and manual_dir will automatically be deduced from data_dir.
    as_dataset_kwargs: `dict` (optional), keyword arguments passed to
      `tfds.core.DatasetBuilder.as_dataset`.
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on the
      public GCS bucket before building it locally.

  Returns:
    ds: `tf.data.Dataset`, the dataset requested, or if `split` is None, a
      `dict<key: tfds.Split, value: tf.data.Dataset>`. If `batch_size=-1`,
      these will be full datasets as `tf.Tensor`s.
    ds_info: `tfds.core.DatasetInfo`, if `with_info` is True, then `tfds.load`
      will return a tuple `(ds, ds_info)` containing dataset information
      (version, features, splits, num_examples,...). Note that the `ds_info`
      object documents the entire dataset, regardless of the `split` requested.
      Split-specific information is available in `ds_info.splits`.
  """
  # pylint: enable=line-too-long
  name, name_builder_kwargs = _dataset_name_and_kwargs_from_name_str(name)
  name_builder_kwargs.update(builder_kwargs or {})
  builder_kwargs = name_builder_kwargs

  # Set data_dir.
  if try_gcs and gcs_utils.is_dataset_on_gcs(name):
    data_dir = constants.GCS_DATA_DIR
  elif data_dir is None:
    data_dir = constants.DATA_DIR

  dbuilder = builder(name, data_dir=data_dir, **builder_kwargs)
  if download:
    download_and_prepare_kwargs = download_and_prepare_kwargs or {}
    dbuilder.download_and_prepare(**download_and_prepare_kwargs)

  if as_dataset_kwargs is None:
    as_dataset_kwargs = {}
  as_dataset_kwargs = dict(as_dataset_kwargs)
  as_dataset_kwargs.setdefault("split", split)
  as_dataset_kwargs.setdefault("as_supervised", as_supervised)
  as_dataset_kwargs.setdefault("batch_size", batch_size)
  as_dataset_kwargs.setdefault("decoders", decoders)
  as_dataset_kwargs.setdefault("shuffle_files", shuffle_files)
  as_dataset_kwargs.setdefault("read_config", read_config)

  ds = dbuilder.as_dataset(**as_dataset_kwargs)
  if with_info:
    return ds, dbuilder.info
  return ds
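# --- Usage sketch (added for illustration; not part of the original source) ---
# A hedged example of the `decoders` argument documented above, assuming the
# public `tfds.decode` API: `tfds.decode.SkipDecoding()` keeps a feature as its
# raw encoded bytes so the caller can decode it manually in the input pipeline.
# The dataset name and feature key are placeholders.
def _example_load_with_decoders():
  import tensorflow as tf
  import tensorflow_datasets as tfds

  ds = tfds.load(
      "mnist",
      split="train",
      # Skip the built-in image decoding; "image" stays a tf.string of
      # encoded image bytes.
      decoders={"image": tfds.decode.SkipDecoding()},
  )
  # Decode manually inside the tf.data pipeline instead.
  ds = ds.map(lambda ex: dict(ex, image=tf.io.decode_image(ex["image"])))
  return ds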
def load(name,
         split=None,
         data_dir=None,
         batch_size=1,
         in_memory=None,
         download=True,
         as_supervised=False,
         with_info=False,
         builder_kwargs=None,
         download_and_prepare_kwargs=None,
         as_dataset_kwargs=None,
         try_gcs=False):
  """Loads the named dataset into a `tf.data.Dataset`.

  If `split=None` (the default), returns all splits for the dataset. Otherwise,
  returns the specified split.

  `load` is a convenience method that fetches the `tfds.core.DatasetBuilder` by
  string name, optionally calls `DatasetBuilder.download_and_prepare`
  (if `download=True`), and then calls `DatasetBuilder.as_dataset`.
  This is roughly equivalent to:

  ```
  builder = tfds.builder(name, data_dir=data_dir, **builder_kwargs)
  if download:
    builder.download_and_prepare(**download_and_prepare_kwargs)
  ds = builder.as_dataset(
      split=split, as_supervised=as_supervised, **as_dataset_kwargs)
  if with_info:
    return ds, builder.info
  return ds
  ```

  If you'd like NumPy arrays instead of `tf.data.Dataset`s or `tf.Tensor`s,
  you can pass the return value to `tfds.as_numpy`.

  Callers must pass arguments as keyword arguments.

  **Warning**: calling this function might potentially trigger the download
  of hundreds of GiB to disk. Refer to the `download` argument.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the snake case
      version of the class name). This can be either `"dataset_name"` or
      `"dataset_name/config_name"` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `"foo_bar/a=True,b=3"` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `"foo_bar/zoo/a=True,b=3"` to
      use the `"zoo"` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    split: `tfds.Split` or `str`, which split of the data to load. If None,
      will return a `dict` with all splits (typically `tfds.Split.TRAIN` and
      `tfds.Split.TEST`).
    data_dir: `str` (optional), directory to read/write data.
      Defaults to "~/tensorflow_datasets".
    batch_size: `int`, set to > 1 to get batches of examples. Note that
      variable length features will be 0-padded. If `batch_size=-1`, will
      return the full dataset as `tf.Tensor`s.
    in_memory: `bool`, if `True`, loads the dataset in memory which increases
      iteration speeds. Note that if `True` and the dataset has unknown
      dimensions, the features will be padded to the maximum size across the
      dataset. By default (when `None`), will load the dataset in memory if
      the size is <1GB and all feature dimensions are statically known.
    download: `bool` (optional), whether to call
      `tfds.core.DatasetBuilder.download_and_prepare` before calling
      `tf.DatasetBuilder.as_dataset`. If `False`, data is expected to be in
      `data_dir`. If `True` and the data is already in `data_dir`,
      `download_and_prepare` is a no-op.
    as_supervised: `bool`, if `True`, the returned `tf.data.Dataset` will have
      a 2-tuple structure `(input, label)` according to
      `builder.info.supervised_keys`. If `False`, the default, the returned
      `tf.data.Dataset` will have a dictionary with all the features.
    with_info: `bool`, if True, tfds.load will return the tuple
      (tf.data.Dataset, tfds.core.DatasetInfo) containing the info associated
      with the builder.
    builder_kwargs: `dict` (optional), keyword arguments to be passed to the
      `tfds.core.DatasetBuilder` constructor. `data_dir` will be passed through
      by default.
    download_and_prepare_kwargs: `dict` (optional) keyword arguments passed to
      `tfds.core.DatasetBuilder.download_and_prepare` if `download=True`. Allow
      to control where to download and extract the cached data. If not set,
      cache_dir and manual_dir will automatically be deduced from data_dir.
    as_dataset_kwargs: `dict` (optional), keyword arguments passed to
      `tfds.core.DatasetBuilder.as_dataset`. `split` will be passed through by
      default. Example: `{'shuffle_files': True}`. Note that shuffle_files is
      False by default unless `split == tfds.Split.TRAIN`.
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on the
      public GCS bucket before building it locally.

  Returns:
    ds: `tf.data.Dataset`, the dataset requested, or if `split` is None, a
      `dict<key: tfds.Split, value: tf.data.Dataset>`. If `batch_size=-1`,
      these will be full datasets as `tf.Tensor`s.
    ds_info: `tfds.core.DatasetInfo`, if `with_info` is True, then `tfds.load`
      will return a tuple `(ds, ds_info)` containing dataset information
      (version, features, splits, num_examples,...). Note that the `ds_info`
      object documents the entire dataset, regardless of the `split` requested.
      Split-specific information is available in `ds_info.splits`.
  """
  name, name_builder_kwargs = _dataset_name_and_kwargs_from_name_str(name)
  name_builder_kwargs.update(builder_kwargs or {})
  builder_kwargs = name_builder_kwargs

  # Set data_dir.
  if try_gcs and gcs_utils.is_dataset_on_gcs(name):
    data_dir = constants.GCS_DATA_DIR
  elif data_dir is None:
    data_dir = constants.DATA_DIR

  dbuilder = builder(name, data_dir=data_dir, **builder_kwargs)
  if download:
    download_and_prepare_kwargs = download_and_prepare_kwargs or {}
    dbuilder.download_and_prepare(**download_and_prepare_kwargs)

  if as_dataset_kwargs is None:
    as_dataset_kwargs = {}
  as_dataset_kwargs = dict(as_dataset_kwargs)
  as_dataset_kwargs["split"] = split
  as_dataset_kwargs["as_supervised"] = as_supervised
  as_dataset_kwargs["batch_size"] = batch_size
  as_dataset_kwargs["in_memory"] = in_memory

  ds = dbuilder.as_dataset(**as_dataset_kwargs)
  if with_info:
    return ds, dbuilder.info
  return ds
def load(
    name: str,
    *,
    split: Optional[Tree[splits_lib.Split]] = None,
    data_dir: Optional[str] = None,
    batch_size: Optional[int] = None,
    shuffle_files: bool = False,
    download: bool = True,
    as_supervised: bool = False,
    decoders: Optional[TreeDict[decode.Decoder]] = None,
    read_config: Optional[read_config_lib.ReadConfig] = None,
    with_info: bool = False,
    builder_kwargs: Optional[Dict[str, Any]] = None,
    download_and_prepare_kwargs: Optional[Dict[str, Any]] = None,
    as_dataset_kwargs: Optional[Dict[str, Any]] = None,
    try_gcs: bool = False,
):
  # pylint: disable=line-too-long
  """Loads the named dataset into a `tf.data.Dataset`.

  `tfds.load` is a convenience method that:

  1. Fetch the `tfds.core.DatasetBuilder` by name:

     ```python
     builder = tfds.builder(name, data_dir=data_dir, **builder_kwargs)
     ```

  2. Generate the data (when `download=True`):

     ```python
     builder.download_and_prepare(**download_and_prepare_kwargs)
     ```

  3. Load the `tf.data.Dataset` object:

     ```python
     ds = builder.as_dataset(
         split=split,
         as_supervised=as_supervised,
         shuffle_files=shuffle_files,
         read_config=read_config,
         decoders=decoders,
         **as_dataset_kwargs,
     )
     ```

  See: https://www.tensorflow.org/datasets/overview#load_a_dataset for more
  examples.

  If you'd like NumPy arrays instead of `tf.data.Dataset`s or `tf.Tensor`s,
  you can pass the return value to `tfds.as_numpy`.

  **Warning**: calling this function might potentially trigger the download
  of hundreds of GiB to disk. Refer to the `download` argument.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the snake case
      version of the class name). This can be either `"dataset_name"` or
      `"dataset_name/config_name"` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `"foo_bar/a=True,b=3"` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `"foo_bar/zoo/a=True,b=3"` to
      use the `"zoo"` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    split: Which split of the data to load (e.g. `'train'`, `'test'`,
      `['train', 'test']`, `'train[80%:]'`,...). See our
      [split API guide](https://www.tensorflow.org/datasets/splits). If `None`,
      will return all splits in a `Dict[Split, tf.data.Dataset]`.
    data_dir: `str`, directory to read/write data. Defaults to the value of
      the environment variable TFDS_DATA_DIR, if set, otherwise falls back to
      "~/tensorflow_datasets".
    batch_size: `int`, if set, add a batch dimension to examples. Note that
      variable length features will be 0-padded. If `batch_size=-1`, will
      return the full dataset as `tf.Tensor`s.
    shuffle_files: `bool`, whether to shuffle the input files.
      Defaults to `False`.
    download: `bool` (optional), whether to call
      `tfds.core.DatasetBuilder.download_and_prepare` before calling
      `tf.DatasetBuilder.as_dataset`. If `False`, data is expected to be in
      `data_dir`. If `True` and the data is already in `data_dir`,
      `download_and_prepare` is a no-op.
    as_supervised: `bool`, if `True`, the returned `tf.data.Dataset` will have
      a 2-tuple structure `(input, label)` according to
      `builder.info.supervised_keys`. If `False`, the default, the returned
      `tf.data.Dataset` will have a dictionary with all the features.
    decoders: Nested dict of `Decoder` objects which allow to customize the
      decoding. The structure should match the feature structure, but only
      customized feature keys need to be present. See
      [the guide](https://github.com/tensorflow/datasets/tree/master/docs/decode.md)
      for more info.
    read_config: `tfds.ReadConfig`, Additional options to configure the input
      pipeline (e.g. seed, num parallel reads,...).
    with_info: `bool`, if True, tfds.load will return the tuple
      (tf.data.Dataset, tfds.core.DatasetInfo) containing the info associated
      with the builder.
    builder_kwargs: `dict` (optional), keyword arguments to be passed to the
      `tfds.core.DatasetBuilder` constructor. `data_dir` will be passed through
      by default.
    download_and_prepare_kwargs: `dict` (optional) keyword arguments passed to
      `tfds.core.DatasetBuilder.download_and_prepare` if `download=True`. Allow
      to control where to download and extract the cached data. If not set,
      cache_dir and manual_dir will automatically be deduced from data_dir.
    as_dataset_kwargs: `dict` (optional), keyword arguments passed to
      `tfds.core.DatasetBuilder.as_dataset`.
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on the
      public GCS bucket before building it locally.

  Returns:
    ds: `tf.data.Dataset`, the dataset requested, or if `split` is None, a
      `dict<key: tfds.Split, value: tf.data.Dataset>`. If `batch_size=-1`,
      these will be full datasets as `tf.Tensor`s.
    ds_info: `tfds.core.DatasetInfo`, if `with_info` is True, then `tfds.load`
      will return a tuple `(ds, ds_info)` containing dataset information
      (version, features, splits, num_examples,...). Note that the `ds_info`
      object documents the entire dataset, regardless of the `split` requested.
      Split-specific information is available in `ds_info.splits`.
  """
  # pylint: enable=line-too-long
  if builder_kwargs is None:
    builder_kwargs = {}

  # Set data_dir.
  if try_gcs and gcs_utils.is_dataset_on_gcs(name):
    data_dir = gcs_utils.gcs_path("datasets")

  dbuilder = builder(name, data_dir=data_dir, **builder_kwargs)
  if download:
    download_and_prepare_kwargs = download_and_prepare_kwargs or {}
    dbuilder.download_and_prepare(**download_and_prepare_kwargs)

  if as_dataset_kwargs is None:
    as_dataset_kwargs = {}
  as_dataset_kwargs = dict(as_dataset_kwargs)
  as_dataset_kwargs.setdefault("split", split)
  as_dataset_kwargs.setdefault("as_supervised", as_supervised)
  as_dataset_kwargs.setdefault("batch_size", batch_size)
  as_dataset_kwargs.setdefault("decoders", decoders)
  as_dataset_kwargs.setdefault("shuffle_files", shuffle_files)
  as_dataset_kwargs.setdefault("read_config", read_config)

  ds = dbuilder.as_dataset(**as_dataset_kwargs)
  if with_info:
    return ds, dbuilder.info
  return ds
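# --- Usage sketch (added for illustration; not part of the original source) ---
# A minimal example of calling `tfds.load` as documented above, assuming the
# public `tensorflow_datasets` API. The dataset name and split are placeholders.
def _example_load():
  import tensorflow_datasets as tfds

  # Returns the train split as (image, label) tuples, plus the DatasetInfo.
  ds, ds_info = tfds.load(
      "mnist",
      split="train",
      shuffle_files=True,
      as_supervised=True,
      with_info=True,
      try_gcs=True,  # Use the prebuilt copy on the public GCS bucket if hosted there.
  )
  assert ds_info.splits["train"].num_examples > 0
  return ds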
def builder(name: str, *, try_gcs: bool = False,
            **builder_kwargs: Any) -> dataset_builder.DatasetBuilder:
  """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`). This can be either
      `'dataset_name'` or `'dataset_name/config_name'` for datasets with
      `BuilderConfig`s. As a convenience, this string may contain
      comma-separated keyword arguments for the builder. For example
      `'foo_bar/a=True,b=3'` would use the `FooBar` dataset passing the keyword
      arguments `a=True` and `b=3` (for builders with configs, it would be
      `'foo_bar/zoo/a=True,b=3'` to use the `'zoo'` config and pass to the
      builder keyword arguments `a=True` and `b=3`).
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on the
      public GCS bucket before building it locally.
    **builder_kwargs: `dict` of keyword arguments passed to the
      `tfds.core.DatasetBuilder`.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  # 'kaggle:my_dataset:1.0.0' -> ('kaggle', 'my_dataset', {'version': '1.0.0'})
  ns_name, builder_name, builder_kwargs = naming.parse_builder_name_kwargs(
      name, **builder_kwargs)

  # `try_gcs` currently only supports non-community datasets.
  if try_gcs and not ns_name and gcs_utils.is_dataset_on_gcs(builder_name):
    data_dir = builder_kwargs.get('data_dir')
    if data_dir:
      raise ValueError(
          f'Cannot have both `try_gcs=True` and `data_dir={data_dir}` '
          'explicitly set')
    builder_kwargs['data_dir'] = gcs_utils.gcs_path('datasets')

  # Community datasets
  if ns_name:
    raise NotImplementedError

  # First check whether the code exists or not (imported datasets).
  try:
    cls = builder_cls(builder_name)
  except registered.DatasetNotFoundError as e:
    cls = None  # Class not found
    not_found_error = e  # Save the exception to eventually reraise

  # Eventually try loading from files first.
  if _try_load_from_files_first(cls, **builder_kwargs):
    try:
      b = read_only_builder.builder_from_files(builder_name, **builder_kwargs)
      return b
    except registered.DatasetNotFoundError as e:
      pass

  # If the code exists and loading from files was skipped (e.g. files not
  # found), load from the source code.
  if cls:
    with py_utils.try_reraise(prefix=f'Failed to construct dataset {name}: '):
      return cls(**builder_kwargs)  # pytype: disable=not-instantiable

  # If neither the code nor the files are found, raise DatasetNotFoundError.
  raise not_found_error
def builder(
    name: str,
    *,
    data_dir: Optional[str] = None,
    try_gcs: bool = False,
    **builder_init_kwargs: Any
) -> dataset_builder.DatasetBuilder:
  """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`). This can be either
      `'dataset_name'` or `'dataset_name/config_name'` for datasets with
      `BuilderConfig`s. As a convenience, this string may contain
      comma-separated keyword arguments for the builder. For example
      `'foo_bar/a=True,b=3'` would use the `FooBar` dataset passing the keyword
      arguments `a=True` and `b=3` (for builders with configs, it would be
      `'foo_bar/zoo/a=True,b=3'` to use the `'zoo'` config and pass to the
      builder keyword arguments `a=True` and `b=3`).
    data_dir: Path to the dataset(s). See `tfds.load` for more information.
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on the
      public GCS bucket before building it locally.
    **builder_init_kwargs: `dict` of keyword arguments passed to the
      `DatasetBuilder`. These will override keyword arguments passed in `name`,
      if any.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  builder_name, builder_kwargs = dataset_name_and_kwargs_from_name_str(name)

  # Set data_dir.
  if try_gcs and gcs_utils.is_dataset_on_gcs(builder_name):
    data_dir = gcs_utils.gcs_path("datasets")

  # Try loading the code (if it exists).
  try:
    cls = builder_cls(builder_name)
  except DatasetNotFoundError as e:
    if e.is_abstract:
      raise  # Abstract builders can't be instantiated from code or files.
    cls = None  # Class not found
    not_found_error = e  # Save the exception to eventually reraise

  version_explicitly_given = "version" in builder_kwargs

  # Try loading from files first:
  # * If code not present.
  # * If version explicitly given (backward/forward compatibility).
  # Note: If `builder_init_kwargs` are set (e.g. version='experimental_latest',
  # custom config,...), read from the generation code.
  if (not cls or version_explicitly_given) and not builder_init_kwargs:
    builder_dir = find_builder_dir(name, data_dir=data_dir)
    if builder_dir is not None:  # A generated dataset was found on disk
      return read_only_builder.builder_from_directory(builder_dir)

  # If loading from files was skipped (e.g. files not found), load from the
  # source code.
  if cls:
    with py_utils.try_reraise(prefix=f"Failed to construct dataset {name}: "):
      return cls(data_dir=data_dir, **builder_kwargs, **builder_init_kwargs)  # pytype: disable=not-instantiable

  # If neither the code nor the files are found, raise DatasetNotFoundError.
  raise not_found_error
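# --- Usage sketch (added for illustration; not part of the original source) ---
# One plausible way to use `builder()` directly instead of `load()`, assuming
# the public `tensorflow_datasets` API; the dataset name is a placeholder.
def _example_builder():
  import tensorflow_datasets as tfds

  # Resolve the builder (reading from the public GCS bucket when try_gcs=True
  # and the dataset is hosted there), generate the data if needed, then create
  # the tf.data pipeline.
  mnist_builder = tfds.builder("mnist", try_gcs=True)
  mnist_builder.download_and_prepare()
  ds = mnist_builder.as_dataset(split="test", as_supervised=True)
  return ds, mnist_builder.info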