Example #1
    def test_experiment_override(self):
        v = version.Version('1.2.3',
                            experiments={version.Experiment.DUMMY: True})
        self.assertTrue(v.implements(version.Experiment.DUMMY))
Example #2
    def test_experiment_default(self):
        v = version.Version('1.2.3')
        self.assertFalse(v.implements(version.Experiment.DUMMY))
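
Examples #1 and #2 exercise the same mechanism: each `Experiment` carries a default value, and a given `Version` can override that default at construction time. A minimal sketch of how that could work (an illustration for reading the tests, not the actual tfds implementation):

```
import enum


class Experiment(enum.Enum):
    DUMMY = 1


class Version:
    # Class-level defaults; DUMMY is off unless a version opts in.
    _DEFAULT_EXPERIMENTS = {Experiment.DUMMY: False}

    def __init__(self, version_str, experiments=None):
        self.version_str = version_str
        # Copy the defaults, then apply the per-version overrides.
        self._experiments = dict(self._DEFAULT_EXPERIMENTS)
        self._experiments.update(experiments or {})

    def implements(self, experiment):
        return self._experiments[experiment]
```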
Example #3
    def test_version(self):
        """Test the zip nested function."""

        self.assertEqual(version.Version(), version.Version(0, 0, 0))
        self.assertEqual(version.Version('1.3.534'),
                         version.Version(1, 3, 534))
        self.assertEqual(version.Version(major=1, minor=3, patch=5),
                         version.Version(1, 3, 5))

        self.assertEqual(version.Version('latest'), version.Version.LATEST)
        self.assertEqual(version.Version(version.Version('1.3.5')),
                         version.Version(1, 3, 5))

        self.assertEqual(str(version.Version(10, 2, 3)), '10.2.3')
        self.assertEqual(str(version.Version()), '0.0.0')

        with self.assertRaisesWithPredicateMatch(ValueError,
                                                 'Format should be '):
            version.Version('1.3.-534')
        with self.assertRaisesWithPredicateMatch(ValueError,
                                                 'Format should be '):
            version.Version('1.3')
        with self.assertRaisesWithPredicateMatch(ValueError,
                                                 'Format should be '):
            version.Version('1.3.')
        with self.assertRaisesWithPredicateMatch(ValueError,
                                                 'Format should be '):
            version.Version('1..5')
        with self.assertRaisesWithPredicateMatch(ValueError,
                                                 'Format should be '):
            version.Version('a.b.c')
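
All five `ValueError` cases above go through the same parsing path: the version string must match `x.y.z` with three non-negative integer components. A plausible sketch of that check (the regex and the message text beyond the 'Format should be ' prefix are assumptions, and the special 'latest' string is assumed to be handled before parsing):

```
import re

_VERSION_REGEX = re.compile(r'^(\d+)\.(\d+)\.(\d+)$')


def _str_to_version(version_str):
    """Return (major, minor, patch), raising ValueError on a bad format."""
    match = _VERSION_REGEX.match(version_str)
    if not match:
        raise ValueError(
            'Format should be x.y.z with x, y and z integers, '
            'got: {}'.format(version_str))
    return tuple(int(v) for v in match.groups())
```

Note that '1.3.-534' fails on the minus sign, '1.3' and '1.3.' on the missing component, '1..5' on the empty component, and 'a.b.c' on the non-digits, which is exactly what the assertions expect.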
Example #4
class ImageFolder(dataset_builder.DatasetBuilder):
    """Generic image classification dataset created from manual directory.

  `ImageFolder` creates a `tf.data.Dataset` reading the original image files.

  The data directory should have the following structure:

  ```
  path/to/image_dir/
    split_name/  # Ex: 'train'
      label1/  # Ex: 'airplane' or '0015'
        xxx.png
        xxy.png
        xxz.png
      label2/
        xxx.png
        xxy.png
        xxz.png
    split_name/  # Ex: 'test'
      ...
  ```

  To use it:

  ```
  builder = tfds.ImageFolder('path/to/image_dir/')
  print(builder.info)  # num examples, labels... are automatically calculated
  ds = builder.as_dataset(split='train', shuffle_files=True)
  tfds.show_examples(ds, builder.info)
  ```

  """

    VERSION = version.Version('1.0.0')

    def __init__(self, root_dir: str):
        super(ImageFolder, self).__init__()
        self._data_dir = root_dir  # Set data_dir to the existing dir.

        # Extract the splits, examples, labels
        root_dir = os.path.expanduser(root_dir)
        self._split_examples, labels = _get_split_label_images(root_dir)

        # Update DatasetInfo labels
        self.info.features['label'].names = sorted(labels)

        # Update DatasetInfo splits
        split_dict = split_lib.SplitDict(self.name)
        for split_name, examples in self._split_examples.items():
            split_dict.add(
                split_lib.SplitInfo(
                    name=split_name,
                    shard_lengths=[len(examples)],
                ))
        self.info.update_splits_if_different(split_dict)

    def _info(self) -> dataset_info.DatasetInfo:
        return dataset_info.DatasetInfo(
            builder=self,
            description='Generic image classification dataset.',
            features=features_lib.FeaturesDict({
                'image': features_lib.Image(),
                'label': features_lib.ClassLabel(),
                'image/filename': features_lib.Text(),
            }),
            supervised_keys=('image', 'label'),
        )

    # TODO(tfds): Should restore `-> NoReturn` annotation for Python 3.6.2+
    def _download_and_prepare(self, **kwargs):  # -> NoReturn:
        raise NotImplementedError(
            'No need to call download_and_prepare function for {}.'.format(
                type(self).__name__))

    def download_and_prepare(self, **kwargs):  # -> NoReturn:
        return self._download_and_prepare()

    def _as_dataset(self,
                    split,
                    shuffle_files=False,
                    decoders=None,
                    read_config=None) -> tf.data.Dataset:
        """Generate dataset for given split."""
        del read_config  # Unused (automatically created in `DatasetBuilder`)
        if decoders:
            raise NotImplementedError(
                '`decoders` is not supported with {}'.format(
                    type(self).__name__))
        if split not in self.info.splits.keys():
            raise ValueError(
                'Unrecognized split {}. Subsplit API not yet supported for {}. '
                'Split name should be one of {}.'.format(
                    split,
                    type(self).__name__, list(self.info.splits.keys())))

        # Extract all labels/images
        image_paths = []
        labels = []
        examples = self._split_examples[split]
        for example in examples:
            image_paths.append(example.image_path)
            labels.append(self.info.features['label'].str2int(example.label))

        # Build the tf.data.Dataset object
        ds = tf.data.Dataset.from_tensor_slices((image_paths, labels))
        if shuffle_files:
            ds = ds.shuffle(len(examples))
        ds = ds.map(_load_example,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
        return ds
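
`ImageFolder` depends on two module-level helpers that are not shown on this page: `_get_split_label_images` and `_load_example`. The sketch below is inferred from their call sites above (the `_Example` tuple and the exact directory walk are assumptions, not the tfds source):

```
import collections
import os

import tensorflow as tf

# Hypothetical container matching the `example.image_path` / `example.label`
# attribute accesses in `_as_dataset`.
_Example = collections.namedtuple('_Example', ['image_path', 'label'])


def _get_split_label_images(root_dir):
    """Walks `split_name/label/*` and collects per-split examples and labels."""
    split_examples = collections.defaultdict(list)
    labels = set()
    for split_name in tf.io.gfile.listdir(root_dir):
        split_dir = os.path.join(root_dir, split_name)
        for label in tf.io.gfile.listdir(split_dir):
            labels.add(label)
            for fname in tf.io.gfile.listdir(os.path.join(split_dir, label)):
                split_examples[split_name].append(_Example(
                    image_path=os.path.join(split_dir, label, fname),
                    label=label))
    return split_examples, labels


def _load_example(path, label):
    """Reads and decodes a single image inside the tf.data pipeline."""
    img = tf.io.read_file(path)
    img = tf.image.decode_image(img, channels=3, expand_animations=False)
    return {
        'image': img,
        'label': tf.cast(label, tf.int64),
        'image/filename': path,
    }
```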
Example #5
class TranslateFolder(dataset_builder.DatasetBuilder):
    """Generic text translation dataset created from manual directory.

  The directory content should be as followed:

  ```
  path/to/my_data/
    lang1.train.txt
    lang2.train.txt
    lang1.test.txt
    lang2.test.txt
    ...
  ```

  Each files should have one example per line. Line order should match between
  files.

  To use it:

  ```
  builder = tfds.TranslateFolder(root_dir='path/to/my_data/')
  print(builder.info)  # Splits, num examples,... are automatically calculated
  ds = builder.as_dataset(split='train', shuffle_files=True)
  ```

  Note: All examples from all splits are loaded in memory in `__init__`.

  """

    VERSION = version.Version('1.0.0')

    def __init__(self, root_dir: str):
        # Extract the splits, examples
        root_dir = os.path.expanduser(root_dir)
        self._split_examples, self._languages = _get_split_language_examples(
            root_dir)

        super(TranslateFolder, self).__init__()
        # Reset `_data_dir` as it should not change to DATA_DIR/Version
        self._data_dir = root_dir

        # Update DatasetInfo splits
        split_infos = [
            split_lib.SplitInfo(  # pylint: disable=g-complex-comprehension
                name=split_name,
                shard_lengths=[len(next(iter(examples.values())))],
                num_bytes=0,
            ) for split_name, examples in self._split_examples.items()
        ]
        split_dict = split_lib.SplitDict(split_infos, dataset_name=self.name)
        self.info.set_splits(split_dict)

    def _info(self) -> dataset_info.DatasetInfo:
        return dataset_info.DatasetInfo(
            builder=self,
            description='Generic text translation dataset.',
            features=features_lib.FeaturesDict(
                {lang: features_lib.Text()
                 for lang in self._languages}),
        )

    def _download_and_prepare(self, **kwargs):
        raise NotImplementedError(
            'No need to call download_and_prepare function for {}.'.format(
                type(self).__name__))

    def download_and_prepare(self, **kwargs):
        return self._download_and_prepare()

    def _as_dataset(self,
                    split,
                    shuffle_files=False,
                    decoders=None,
                    read_config=None) -> tf.data.Dataset:
        """Generate dataset for given split."""
        del read_config  # Unused (automatically created in `DatasetBuilder`)
        if decoders:
            raise NotImplementedError(
                '`decoders` is not supported with {}'.format(
                    type(self).__name__))
        if split not in self.info.splits.keys():
            raise ValueError(
                'Unrecognized split {}. Subsplit API not yet supported for {}. '
                'Split name should be one of {}.'.format(
                    split,
                    type(self).__name__, list(self.info.splits.keys())))

        # Build the tf.data.Dataset object
        lang_example_dict = self._split_examples[split]
        ds = tf.data.Dataset.from_tensor_slices(lang_example_dict)
        if shuffle_files:
            # `lang_example_dict` holds one entry per language, so shuffle
            # with the per-language example count as the buffer size instead.
            ds = ds.shuffle(len(next(iter(lang_example_dict.values()))))
        return ds
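
Similarly, `_get_split_language_examples` is not shown. From its call site it must return a `{split: {lang: [lines]}}` mapping plus the list of languages; here is a hedged sketch under the `<lang>.<split>.txt` naming scheme described in the docstring:

```
import collections
import os

import tensorflow as tf


def _get_split_language_examples(root_dir):
    """Parses `<lang>.<split>.txt` files into ({split: {lang: lines}}, langs)."""
    split_examples = collections.defaultdict(dict)
    languages = set()
    for fname in tf.io.gfile.listdir(root_dir):
        lang, split_name, _ = fname.split('.')  # E.g. 'lang1.train.txt'
        languages.add(lang)
        with tf.io.gfile.GFile(os.path.join(root_dir, fname)) as f:
            split_examples[split_name][lang] = f.read().splitlines()
    return split_examples, sorted(languages)
```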
Example #6
class ImageFolder(dataset_builder.DatasetBuilder):
  """Generic image classification dataset created from manual directory.

  `ImageFolder` creates a `tf.data.Dataset` reading the original image files.

  The data directory should have the following structure:

  ```
  path/to/image_dir/
    split_name/  # Ex: 'train'
      label1/  # Ex: 'airplane' or '0015'
        xxx.png
        xxy.png
        xxz.png
      label2/
        xxx.png
        xxy.png
        xxz.png
    split_name/  # Ex: 'test'
      ...
  ```

  To use it:

  ```
  builder = tfds.ImageFolder('path/to/image_dir/')
  print(builder.info)  # num examples, labels... are automatically calculated
  ds = builder.as_dataset(split='train', shuffle_files=True)
  tfds.show_examples(ds, builder.info)
  ```

  """

  VERSION = version.Version('1.0.0')

  def __init__(
      self,
      root_dir: str,
      *,
      shape: Optional[type_utils.Shape] = None,
      dtype: Optional[tf.DType] = None,
  ):
    """Construct the `DatasetBuilder`.

    Args:
      root_dir: Path to the directory containing the images.
      shape: Image shape forwarded to `tfds.features.Image`.
      dtype: Image dtype forwarded to `tfds.features.Image`.
    """
    self._image_shape = shape
    self._image_dtype = dtype
    super(ImageFolder, self).__init__()
    self._data_dir = root_dir  # Set data_dir to the existing dir.

    # Extract the splits, examples, labels
    root_dir = os.path.expanduser(root_dir)
    self._split_examples, labels = _get_split_label_images(root_dir)

    # Update DatasetInfo labels
    self.info.features['label'].names = sorted(labels)

    # Update DatasetInfo splits
    split_infos = [
        split_lib.SplitInfo(  # pylint: disable=g-complex-comprehension
            name=split_name,
            shard_lengths=[len(examples)],
            num_bytes=0,
        ) for split_name, examples in self._split_examples.items()
    ]
    split_dict = split_lib.SplitDict(split_infos, dataset_name=self.name)
    self.info.set_splits(split_dict)

  def _info(self) -> dataset_info.DatasetInfo:
    return dataset_info.DatasetInfo(
        builder=self,
        description='Generic image classification dataset.',
        features=features_lib.FeaturesDict({
            'image':
                features_lib.Image(
                    shape=self._image_shape,
                    dtype=self._image_dtype,
                ),
            'label':
                features_lib.ClassLabel(),
            'image/filename':
                features_lib.Text(),
        }),
        supervised_keys=('image', 'label'),
    )

  def _download_and_prepare(self, **kwargs) -> NoReturn:
    raise NotImplementedError(
        'No need to call download_and_prepare function for {}.'.format(
            type(self).__name__))

  def download_and_prepare(self, **kwargs):  # -> NoReturn:
    return self._download_and_prepare()

  def _as_dataset(self,
                  split: str,
                  shuffle_files: bool = False,
                  decoders: Optional[Dict[str, decode.Decoder]] = None,
                  read_config=None) -> tf.data.Dataset:
    """Generate dataset for given split."""
    del read_config  # Unused (automatically created in `DatasetBuilder`)

    if split not in self.info.splits.keys():
      raise ValueError(
          'Unrecognized split {}. Subsplit API not yet supported for {}. '
          'Split name should be one of {}.'.format(
              split,
              type(self).__name__, list(self.info.splits.keys())))

    # Extract all labels/images
    image_paths = []
    labels = []
    examples = self._split_examples[split]
    for example in examples:
      image_paths.append(example.image_path)
      labels.append(self.info.features['label'].str2int(example.label))

    # Build the tf.data.Dataset object
    ds = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    if shuffle_files:
      ds = ds.shuffle(len(examples))

    # Fuse load and decode into one function
    def _load_and_decode_fn(*args, **kwargs):
      ex = _load_example(*args, **kwargs)
      return self.info.features.decode_example(ex, decoders=decoders)

    ds = ds.map(
        _load_and_decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return ds
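
This later revision of `ImageFolder` (compare Example #4) forwards `shape` and `dtype` to `tfds.features.Image` and applies `decoders` in a fused load-and-decode `map`. A short usage sketch with a placeholder path:

```
import tensorflow as tf
import tensorflow_datasets as tfds

# 'path/to/image_dir/' is a placeholder laid out as in the class docstring.
builder = tfds.ImageFolder('path/to/image_dir/',
                           shape=(None, None, 3),  # Variable-size RGB images.
                           dtype=tf.uint8)
print(builder.info)  # Splits and labels are computed from the directory.
ds = builder.as_dataset(split='train', shuffle_files=True)
```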