Example #1
    def _build_single_dataset(self, split, shuffle_files, batch_size,
                              as_supervised):
        """as_dataset for a single split."""
        if isinstance(split, six.string_types):
            split = splits_lib.Split(split)

        if shuffle_files is None:
            # Shuffle files if training
            shuffle_files = split == splits_lib.Split.TRAIN

        wants_full_dataset = batch_size == -1
        if wants_full_dataset:
            batch_size = self.info.splits.total_num_examples or sys.maxsize

        dataset = self._as_dataset(split=split, shuffle_files=shuffle_files)
        if batch_size > 1:
            # Use padded_batch so that features with unknown shape are supported.
            padded_shapes = self.info.features.shape
            dataset = dataset.padded_batch(batch_size, padded_shapes)

        if as_supervised:
            if not self.info.supervised_keys:
                raise ValueError(
                    "as_supervised=True but %s does not support a supervised "
                    "(input, label) structure." % self.name)
            input_f, target_f = self.info.supervised_keys
            dataset = dataset.map(
                lambda fs: (fs[input_f], fs[target_f]),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        # If shuffling, allow pipeline to be non-deterministic
        options = tf.data.Options()
        options.experimental_deterministic = not shuffle_files
        dataset = dataset.with_options(options)

        if wants_full_dataset:
            return tf.data.experimental.get_single_element(dataset)
        else:
            return dataset
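
A usage note for this version: the helper above is normally reached through the public `as_dataset` entry point rather than called directly. A minimal sketch under that assumption (the dataset name "mnist" is only a placeholder for any registered builder):

    import tensorflow_datasets as tfds

    # Hypothetical builder; any registered dataset follows the same path.
    builder = tfds.builder("mnist")
    builder.download_and_prepare()

    # batch_size > 1 takes the padded_batch branch above;
    # shuffle_files defaults to True only for the TRAIN split.
    train_ds = builder.as_dataset(split="train", batch_size=32)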
Example #2
    def _build_single_dataset(self, split, shuffle_files, batch_size, decoders,
                              as_supervised, in_memory):
        """as_dataset for a single split."""
        if isinstance(split, six.string_types):
            split = splits_lib.Split(split)

        wants_full_dataset = batch_size == -1
        if wants_full_dataset:
            batch_size = self.info.splits.total_num_examples or sys.maxsize

        # If the dataset is small, load it in memory
        dataset_shape_is_fully_defined = (
            dataset_utils.features_shape_is_fully_defined(self.info.features))
        in_memory_default = False
        # TODO(tfds): Consider default in_memory=True for small datasets with
        # fully-defined shape.
        # Expose and use the actual data size on disk and rm the manual
        # name guards. size_in_bytes is the download size, which is misleading,
        # particularly for datasets that use manual_dir as well as some downloads
        # (wmt and diabetic_retinopathy_detection).
        # in_memory_default = (
        #     self.info.size_in_bytes and
        #     self.info.size_in_bytes <= 1e9 and
        #     not self.name.startswith("wmt") and
        #     not self.name.startswith("diabetic") and
        #     dataset_shape_is_fully_defined)
        in_memory = in_memory_default if in_memory is None else in_memory

        # Build base dataset
        if in_memory and not wants_full_dataset:
            # TODO(tfds): Enable in_memory without padding features. This may be
            # possible with a requested version of tf.data.Dataset.cache that can
            # persist a cache beyond iterator instances.
            if not dataset_shape_is_fully_defined:
                logging.warning(
                    "Called in_memory=True on a dataset that does not "
                    "have fully defined shapes. Note that features with "
                    "variable length dimensions will be 0-padded to "
                    "the maximum length across the dataset.")
            full_bs = self.info.splits.total_num_examples or sys.maxsize
            # If using in_memory, escape all device contexts so we can load the data
            # with a local Session.
            with tf.device(None):
                dataset = self._as_dataset(split=split,
                                           shuffle_files=shuffle_files,
                                           decoders=decoders)
                # Use padded_batch so that features with unknown shape are supported.
                dataset = dataset.padded_batch(
                    full_bs, tf.compat.v1.data.get_output_shapes(dataset))
                dataset = tf.data.Dataset.from_tensor_slices(
                    next(dataset_utils.as_numpy(dataset)))
        else:
            dataset = self._as_dataset(split=split,
                                       shuffle_files=shuffle_files,
                                       decoders=decoders)

        if batch_size:
            # Use padded_batch so that features with unknown shape are supported.
            dataset = dataset.padded_batch(
                batch_size, tf.compat.v1.data.get_output_shapes(dataset))

        if as_supervised:
            if not self.info.supervised_keys:
                raise ValueError(
                    "as_supervised=True but %s does not support a supervised "
                    "(input, label) structure." % self.name)
            input_f, target_f = self.info.supervised_keys
            dataset = dataset.map(
                lambda fs: (fs[input_f], fs[target_f]),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        # If shuffling, allow pipeline to be non-deterministic
        options = tf.data.Options()
        options.experimental_deterministic = not shuffle_files
        dataset = dataset.with_options(options)

        if wants_full_dataset:
            return tf.data.experimental.get_single_element(dataset)
        return dataset
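
The `in_memory` branch above amounts to a small standalone recipe: pad-batch the whole split into a single element, pull it out as numpy, then rebuild a `tf.data.Dataset` from the in-memory arrays. A sketch of that idea, where `load_in_memory` is a hypothetical name and `tfds.as_numpy` stands in for the internal `dataset_utils.as_numpy`:

    import tensorflow as tf
    import tensorflow_datasets as tfds

    def load_in_memory(ds, num_examples):
        """Materializes a tf.data.Dataset into host memory, as the branch above does."""
        # Features with variable-length dimensions are 0-padded to the
        # maximum length across the split.
        ds = ds.padded_batch(num_examples,
                             tf.compat.v1.data.get_output_shapes(ds))
        batch = next(iter(tfds.as_numpy(ds)))  # a single dict of numpy arrays
        return tf.data.Dataset.from_tensor_slices(batch)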
Example #3
    def as_dataset(self,
                   split=None,
                   batch_size=1,
                   shuffle_files=None,
                   as_supervised=False):
        """Constructs a `tf.data.Dataset`.

    Callers must pass arguments as keyword arguments.

    Args:
      split: `tfds.core.SplitBase`, which subset of the data to read. If None
        (default), returns all splits in a dict
        `<key: tfds.Split, value: tf.data.Dataset>`.
      batch_size: `int`, batch size. Note that variable-length features will
        be 0-padded if `batch_size > 1`. Users that want more custom behavior
        should use `batch_size=1` and use the `tf.data` API to construct a
        custom pipeline. If `batch_size == -1`, will return feature
        dictionaries of the whole dataset with `tf.Tensor`s instead of a
        `tf.data.Dataset`.
      shuffle_files: `bool`, whether to shuffle the input files.
        Defaults to `True` if `split == tfds.Split.TRAIN` and `False` otherwise.
      as_supervised: `bool`, if `True`, the returned `tf.data.Dataset`
        will have a 2-tuple structure `(input, label)` according to
        `builder.info.supervised_keys`. If `False`, the default,
        the returned `tf.data.Dataset` will have a dictionary with all the
        features.

    Returns:
      `tf.data.Dataset`, or if `split=None`, `dict<key: tfds.Split, value:
      tf.data.Dataset>`.

      If `batch_size` is -1, will return feature dictionaries containing
      the entire dataset in `tf.Tensor`s instead of a `tf.data.Dataset`.
    """
        if not tf.io.gfile.exists(self._data_dir):
            raise AssertionError((
                "Dataset %s: could not find data in %s. Please make sure to call "
                "dataset_builder.download_and_prepare(), or pass download=True to "
                "tfds.load() before trying to access the tf.data.Dataset object."
            ) % (self.name, self._data_dir_root))

        if split is None:
            splits = list(self.info.splits)
            return_dict = True
        else:
            splits = [split]
            return_dict = False

        datasets = []
        for split in splits:
            if isinstance(split, six.string_types):
                split = splits_lib.Split(split)
            split_shuffle = shuffle_files
            if split_shuffle is None:
                # Shuffle files if training
                split_shuffle = split == splits_lib.Split.TRAIN
            dataset = self._build_single_dataset(split=split,
                                                 shuffle_files=split_shuffle,
                                                 batch_size=batch_size,
                                                 as_supervised=as_supervised)
            datasets.append(dataset)

        if return_dict:
            return dict(zip(splits, datasets))
        else:
            assert len(splits) == 1
            return datasets[0]
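
The docstring maps onto a few distinct call patterns. A minimal sketch, assuming a prepared builder (the dataset name is again a placeholder):

    import tensorflow_datasets as tfds

    builder = tfds.builder("mnist")  # hypothetical name
    builder.download_and_prepare()

    # split=None: one tf.data.Dataset per split, returned as a dict.
    all_splits = builder.as_dataset()
    train_ds = all_splits[tfds.Split.TRAIN]

    # batch_size=-1: the whole split as a dict of tf.Tensors, not a Dataset.
    full_train = builder.as_dataset(split=tfds.Split.TRAIN, batch_size=-1)

    # as_supervised=True: (input, label) tuples per builder.info.supervised_keys.
    test_pairs = builder.as_dataset(split="test", as_supervised=True)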
Example #4
  def _build_single_dataset(
      self,
      split,
      shuffle_files,
      batch_size,
      decoders,
      read_config,
      as_supervised,
  ):
    """as_dataset for a single split."""
    if isinstance(split, six.string_types):
      split = splits_lib.Split(split)

    wants_full_dataset = batch_size == -1
    if wants_full_dataset:
      batch_size = self.info.splits.total_num_examples or sys.maxsize

    # Build base dataset
    ds = self._as_dataset(
        split=split,
        shuffle_files=shuffle_files,
        decoders=decoders,
        read_config=read_config,
    )
    # Auto-cache datasets which are small enough to fit in memory.
    if self._should_cache_ds(
        split=split,
        shuffle_files=shuffle_files,
        read_config=read_config
    ):
      ds = ds.cache()

    if batch_size:
      # Use padded_batch so that features with unknown shape are supported.
      ds = ds.padded_batch(
          batch_size, tf.compat.v1.data.get_output_shapes(ds))

    if as_supervised:
      if not self.info.supervised_keys:
        raise ValueError(
            "as_supervised=True but %s does not support a supervised "
            "(input, label) structure." % self.name)
      input_f, target_f = self.info.supervised_keys
      ds = ds.map(lambda fs: (fs[input_f], fs[target_f]),
                  num_parallel_calls=tf.data.experimental.AUTOTUNE)

    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

    # If shuffling is enabled and no seed is set, allow the pipeline to be
    # non-deterministic.
    # This code should probably be moved inside tfreader, so that all the
    # tf.data.Options are centralized in a single place.
    if (shuffle_files and
        read_config.options.experimental_deterministic is None and
        read_config.shuffle_seed is None):
      options = tf.data.Options()
      options.experimental_deterministic = False
      ds = ds.with_options(options)
    # If shuffling is disabled, keep the default value (deterministic), which
    # allows the user to overwrite it.

    if wants_full_dataset:
      return tf.data.experimental.get_single_element(ds)
    return ds
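
The determinism guard above only fires when neither a shuffle seed nor an explicit option is set. A hedged sketch of keeping a shuffled pipeline reproducible, assuming `read_config` here is a `tfds.ReadConfig` exposing the `shuffle_seed` and `options` fields the guard reads:

    import tensorflow as tf
    import tensorflow_datasets as tfds

    builder = tfds.builder("mnist")  # hypothetical, as in the earlier sketches

    # A fixed shuffle_seed keeps the branch above from disabling determinism.
    seeded = tfds.ReadConfig(shuffle_seed=42)
    ds = builder.as_dataset(
        split="train", shuffle_files=True, read_config=seeded)

    # Alternatively, pin determinism explicitly through tf.data options.
    opts = tf.data.Options()
    opts.experimental_deterministic = True
    ds = builder.as_dataset(
        split="train", shuffle_files=True,
        read_config=tfds.ReadConfig(options=opts))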
Example #5
  def _build_single_dataset(
      self,
      split,
      shuffle_files,
      batch_size,
      decoders,
      read_config,
      as_supervised,
      in_memory):
    """as_dataset for a single split."""
    if isinstance(split, six.string_types):
      split = splits_lib.Split(split)

    wants_full_dataset = batch_size == -1
    if wants_full_dataset:
      batch_size = self.info.splits.total_num_examples or sys.maxsize

    # Build base dataset
    if in_memory and not wants_full_dataset:
      # TODO(tfds): Remove once users have been migrated

      # If the dataset is small, load it in memory
      logging.warning(
          "`in_memory` is deprecated and will be removed in a future version. "
          "Please use `ds = ds.cache()` instead.")

      # TODO(tfds): Enable in_memory without padding features. This may be
      # possible with a requested version of tf.data.Dataset.cache that can
      # persist a cache beyond iterator instances.
      dataset_shape_is_fully_defined = (
          dataset_utils.features_shape_is_fully_defined(self.info.features))
      if not dataset_shape_is_fully_defined:
        logging.warning("Called in_memory=True on a dataset that does not "
                        "have fully defined shapes. Note that features with "
                        "variable length dimensions will be 0-padded to "
                        "the maximum length across the dataset.")
      full_bs = self.info.splits.total_num_examples or sys.maxsize
      # If using in_memory, escape all device contexts so we can load the data
      # with a local Session.
      with tf.device(None):
        ds = self._as_dataset(
            split=split,
            shuffle_files=shuffle_files,
            decoders=decoders,
            read_config=read_config,
        )
        # Use padded_batch so that features with unknown shape are supported.
        ds = ds.padded_batch(
            full_bs, tf.compat.v1.data.get_output_shapes(ds))
        ds = tf.compat.v1.data.Dataset.from_tensor_slices(
            next(dataset_utils.as_numpy(ds)))
    else:
      ds = self._as_dataset(
          split=split,
          shuffle_files=shuffle_files,
          decoders=decoders,
          read_config=read_config,
      )
      # Auto-cache datasets which are small enough to fit in memory.
      if self._should_cache_ds(
          split=split,
          shuffle_files=shuffle_files,
          read_config=read_config
      ):
        ds = ds.cache()

    if batch_size:
      # Use padded_batch so that features with unknown shape are supported.
      ds = ds.padded_batch(
          batch_size, tf.compat.v1.data.get_output_shapes(ds))

    if as_supervised:
      if not self.info.supervised_keys:
        raise ValueError(
            "as_supervised=True but %s does not support a supervised "
            "(input, label) structure." % self.name)
      input_f, target_f = self.info.supervised_keys
      ds = ds.map(lambda fs: (fs[input_f], fs[target_f]),
                  num_parallel_calls=tf.data.experimental.AUTOTUNE)

    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

    # If shuffling is enabled and no seed is set, allow the pipeline to be
    # non-deterministic.
    # This code should probably be moved inside tfreader, so that all the
    # tf.data.Options are centralized in a single place.
    if (shuffle_files and
        read_config.options.experimental_deterministic is None and
        read_config.shuffle_seed is None):
      options = tf.data.Options()
      options.experimental_deterministic = False
      ds = ds.with_options(options)
    # If shuffling is disabled, keep the default value (deterministic), which
    # allows the user to overwrite it.

    if wants_full_dataset:
      return tf.data.experimental.get_single_element(ds)
    return ds
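
As the deprecation warning in this version states, the replacement for `in_memory=True` is an explicit cache on the returned dataset. A minimal sketch (the dataset name is a placeholder):

    import tensorflow as tf
    import tensorflow_datasets as tfds

    builder = tfds.builder("mnist")  # hypothetical name
    builder.download_and_prepare()

    ds = builder.as_dataset(split="train")
    ds = ds.cache()  # keep decoded examples in memory after the first pass
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)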