    def __init__(self,
                 split: str,
                 validation_percent: float = 0.0,
                 shuffle_buffer_size: Optional[int] = None,
                 num_parallel_parser_calls: int = 64,
                 try_gcs: bool = False,
                 download_data: bool = False,
                 **unused_kwargs: Dict[str, Any]):
        """Create an MNIST tf.data.Dataset builder.

    Args:
      split: a dataset split, either a custom tfds.Split or one of the
        tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
        names.
      validation_percent: the percent of the training set to use as a validation
        set.
      shuffle_buffer_size: the number of examples to use in the shuffle buffer
        for tf.data.Dataset.shuffle().
      num_parallel_parser_calls: the number of parallel threads to use while
        preprocessing in tf.data.Dataset.map().
      try_gcs: Whether or not to try to use the GCS stored versions of dataset
        files.
      download_data: Whether or not to download data before loading.
    """
        name = 'mnist'
        dataset_builder = tfds.builder(name, try_gcs=try_gcs)
        split = base.get_validation_percent_split(dataset_builder,
                                                  validation_percent, split)
        super(MnistDataset, self).__init__(
            name=name,
            dataset_builder=dataset_builder,
            split=split,
            shuffle_buffer_size=shuffle_buffer_size,
            num_parallel_parser_calls=num_parallel_parser_calls,
            download_data=download_data)
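
A minimal usage sketch for this builder, assuming it is exported as ub.datasets.MnistDataset and inherits a load(batch_size=...) method from the uncertainty_baselines base dataset builder:

import uncertainty_baselines as ub

# Hold out 10% of the training set as validation data (illustrative settings).
train_builder = ub.datasets.MnistDataset(split='train', validation_percent=0.1)
train_dataset = train_builder.load(batch_size=128)
for example in train_dataset.take(1):
  images = example['features']  # 'features'/'labels' keys assumed from the base builder
  labels = example['labels']
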
  def __init__(self,
               split: str,
               validation_percent: float = 0.0,
               shuffle_buffer_size: Optional[int] = None,
               num_parallel_parser_calls: int = 64,
               drop_remainder: bool = True,
               try_gcs: bool = False,
               download_data: bool = False,
               data_dir: Optional[str] = None,
               normalize_by_cifar: bool = False,
               is_training: Optional[bool] = None):
    """Create an SVHN tf.data.Dataset builder.

    Args:
      split: a dataset split, either a custom tfds.Split or one of the
        tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
        names.
      validation_percent: the percent of the training set to use as a validation
        set.
      shuffle_buffer_size: the number of examples to use in the shuffle buffer
        for tf.data.Dataset.shuffle().
      num_parallel_parser_calls: the number of parallel threads to use while
        preprocessing in tf.data.Dataset.map().
      drop_remainder: whether or not to drop the last batch of data if the
        number of points is not exactly equal to the batch size. This option
        needs to be True for running on TPUs.
      try_gcs: Whether or not to try to use the GCS stored versions of dataset
        files.
      download_data: Whether or not to download data before loading.
      data_dir: Directory to read/write data, which is passed to the tfds
        dataset_builder as the data_dir parameter.
      normalize_by_cifar: whether or not to normalize each image by the CIFAR
        dataset mean and stddev.
      is_training: Whether or not the given `split` is the training split. Only
        required when the passed split is not one of ['train', 'validation',
        'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
    """
    self._normalize_by_cifar = normalize_by_cifar
    name = 'svhn_cropped'
    dataset_builder = tfds.builder(name, try_gcs=try_gcs, data_dir=data_dir)
    if is_training is None:
      is_training = split in ['train', tfds.Split.TRAIN]
    new_split = base.get_validation_percent_split(
        dataset_builder, validation_percent, split)
    super().__init__(
        name=name,
        dataset_builder=dataset_builder,
        split=new_split,
        is_training=is_training,
        shuffle_buffer_size=shuffle_buffer_size,
        num_parallel_parser_calls=num_parallel_parser_calls,
        drop_remainder=drop_remainder,
        download_data=download_data)
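
A usage sketch for this SVHN builder, under the same assumptions (exported as ub.datasets.SvhnDataset with a base-class load(batch_size=...) method); normalize_by_cifar is useful when SVHN serves as an out-of-distribution set for CIFAR-trained models:

import uncertainty_baselines as ub

# Load the SVHN test split, normalized with CIFAR statistics, as an OOD set.
ood_builder = ub.datasets.SvhnDataset(
    split='test',
    normalize_by_cifar=True,
    drop_remainder=True)  # fixed-size batches, required for TPU execution
ood_dataset = ood_builder.load(batch_size=256)
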
  def __init__(
      self,
      name: str,
      fingerprint_key: str,
      split: str,
      validation_percent: float = 0.0,
      shuffle_buffer_size: Optional[int] = None,
      num_parallel_parser_calls: int = 64,
      drop_remainder: bool = True,
      normalize: bool = True,
      try_gcs: bool = False,
      download_data: bool = False,
      **unused_kwargs: Dict[str, Any]):
    """Create a CIFAR10 or CIFAR100 tf.data.Dataset builder.

    Args:
      name: the name of this dataset, either 'cifar10' or 'cifar100'.
      fingerprint_key: The name of the feature holding a string that will be
        used to create an element id using a fingerprinting function. If None,
        then `ds.enumerate()` is added before the `ds.map(preprocessing_fn)` is
        called and an `id` field is added to the example Dict.
      split: a dataset split, either a custom tfds.Split or one of the
        tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
        names.
      validation_percent: the percent of the training set to use as a validation
        set.
      shuffle_buffer_size: the number of examples to use in the shuffle buffer
        for tf.data.Dataset.shuffle().
      num_parallel_parser_calls: the number of parallel threads to use while
        preprocessing in tf.data.Dataset.map().
      drop_remainder: whether or not to drop the last batch of data if the
        number of points is not exactly equal to the batch size. This option
        needs to be True for running on TPUs.
      normalize: whether or not to normalize each image by the CIFAR dataset
        mean and stddev.
      try_gcs: Whether or not to try to use the GCS stored versions of dataset
        files.
      download_data: Whether or not to download data before loading.
    """
    self._normalize = normalize
    dataset_builder = tfds.builder(name, try_gcs=try_gcs)
    split = base.get_validation_percent_split(
        dataset_builder, validation_percent, split)
    super(_CifarDataset, self).__init__(
        name=name,
        dataset_builder=dataset_builder,
        split=split,
        shuffle_buffer_size=shuffle_buffer_size,
        num_parallel_parser_calls=num_parallel_parser_calls,
        drop_remainder=drop_remainder,
        fingerprint_key=fingerprint_key,
        download_data=download_data)
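
A sketch of how the CIFAR builder above might be used, assuming public wrappers such as ub.datasets.Cifar10Dataset forward these constructor arguments:

import uncertainty_baselines as ub

# Carve a 10% validation split out of the CIFAR-10 training set.
val_builder = ub.datasets.Cifar10Dataset(
    split='validation',
    validation_percent=0.1,
    normalize=True)
val_dataset = val_builder.load(batch_size=128)
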
    def __init__(self,
                 split: str,
                 seed: Optional[Union[int, tf.Tensor]] = None,
                 validation_percent: float = 0.0,
                 shuffle_buffer_size: Optional[int] = None,
                 num_parallel_parser_calls: int = 64,
                 try_gcs: bool = False,
                 download_data: bool = False,
                 data_dir: Optional[str] = None,
                 is_training: Optional[bool] = None):
        """Create an Places-365 tf.data.Dataset builder.

    Args:
      split: a dataset split, either a custom tfds.Split or one of the
        tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
        names.
      seed: the seed used as a source of randomness.
      validation_percent: the percent of the training set to use as a validation
        set.
      shuffle_buffer_size: the number of examples to use in the shuffle buffer
        for tf.data.Dataset.shuffle().
      num_parallel_parser_calls: the number of parallel threads to use while
        preprocessing in tf.data.Dataset.map().
      try_gcs: Whether or not to try to use the GCS stored versions of dataset
        files.
      download_data: Whether or not to download data before loading.
      data_dir: Directory to read/write data, which is passed to the tfds
        dataset_builder as the data_dir parameter.
      is_training: Whether or not the given `split` is the training split. Only
        required when the passed split is not one of ['train', 'validation',
        'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
    """
        name = 'places365_small'
        dataset_builder = tfds.builder(name,
                                       try_gcs=try_gcs,
                                       data_dir=data_dir)
        if is_training is None:
            is_training = split in ['train', tfds.Split.TRAIN]
        new_split = base.get_validation_percent_split(
            dataset_builder,
            validation_percent,
            split,
            test_split=tfds.Split.VALIDATION)
        super().__init__(name=name,
                         dataset_builder=dataset_builder,
                         split=new_split,
                         seed=seed,
                         is_training=is_training,
                         shuffle_buffer_size=shuffle_buffer_size,
                         num_parallel_parser_calls=num_parallel_parser_calls,
                         download_data=download_data)
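
A usage sketch for the Places-365 builder, assuming it is exported as ub.datasets.Places365Dataset:

import uncertainty_baselines as ub

# Places-365 is commonly used as an OOD evaluation set for larger image models.
builder = ub.datasets.Places365Dataset(split='test', shuffle_buffer_size=None)
dataset = builder.load(batch_size=64)
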
  def __init__(self,
               split: Union[float, str],
               seed: Optional[Union[int, tf.Tensor]] = None,
               validation_percent: float = 0.0,
               shuffle_buffer_size: Optional[int] = None,
               num_parallel_parser_calls: int = 64,
               data_dir: Optional[str] = None,
               is_training: Optional[bool] = None):
    """Create a Criteo tf.data.Dataset builder.

    Args:
      split: a dataset split, either a custom tfds.Split or one of the
        tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
        names. For Criteo it can also be a float, in which case the test set is
        loaded and the float is used as the corruption level applied during
        preprocessing.
      seed: the seed used as a source of randomness.
      validation_percent: the percent of the training set to use as a validation
        set.
      shuffle_buffer_size: the number of examples to use in the shuffle buffer
        for tf.data.Dataset.shuffle().
      num_parallel_parser_calls: the number of parallel threads to use while
        preprocessing in tf.data.Dataset.map().
      data_dir: Path to a directory containing the Criteo datasets, with
        filenames 'train-*-of-*', 'validate.tfr', 'test.tfr'.
      is_training: Whether or not the given `split` is the training split. Only
        required when the passed split is not one of ['train', 'validation',
        'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
    """
    # If we receive a corruption level as the split, load the test set and save
    # the corruption level for use in preprocessing.
    if isinstance(split, float):
      self._corruption_level = split
      split = 'test'
    else:
      self._corruption_level = None
    dataset_builder = _CriteoDatasetBuilder(data_dir=data_dir)
    if is_training is None:
      is_training = split in ['train', tfds.Split.TRAIN]
    new_split = base.get_validation_percent_split(dataset_builder,
                                                  validation_percent, split)
    super().__init__(
        name='criteo',
        dataset_builder=dataset_builder,
        split=new_split,
        seed=seed,
        is_training=is_training,
        shuffle_buffer_size=shuffle_buffer_size,
        num_parallel_parser_calls=num_parallel_parser_calls,
        download_data=False)
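
A usage sketch for the Criteo builder, assuming it is exported as ub.datasets.CriteoDataset; the float-valued split below triggers the corruption-level branch in the constructor, and the data_dir path is a placeholder:

import uncertainty_baselines as ub

# A float split loads the test set and applies that corruption level in
# preprocessing (see the isinstance(split, float) branch above).
corrupted_builder = ub.datasets.CriteoDataset(
    split=0.15,
    data_dir='/path/to/criteo')  # placeholder; must contain train-*-of-*, etc.
corrupted_test = corrupted_builder.load(batch_size=512)
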
  def __init__(
      self,
      split: str,
      validation_percent: float = 0.0,
      shuffle_buffer_size: Optional[int] = None,
      num_parallel_parser_calls: int = 64,
      try_gcs: bool = False,
      download_data: bool = False,
      normalize_by_cifar: bool = False,
      is_training: Optional[bool] = None,
      **unused_kwargs: Dict[str, Any]):
    """Create an SVHN tf.data.Dataset builder.

    Args:
      split: a dataset split, either a custom tfds.Split or one of the
        tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
        names.
      validation_percent: the percent of the training set to use as a validation
        set.
      shuffle_buffer_size: the number of examples to use in the shuffle buffer
        for tf.data.Dataset.shuffle().
      num_parallel_parser_calls: the number of parallel threads to use while
        preprocessing in tf.data.Dataset.map().
      try_gcs: Whether or not to try to use the GCS stored versions of dataset
        files.
      download_data: Whether or not to download data before loading.
      normalize_by_cifar: whether or not to normalize each image by the CIFAR
        dataset mean and stddev.
      is_training: Whether or not the given `split` is the training split. Only
        required when the passed split is not one of ['train', 'validation',
        'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
    """
    self._normalize_by_cifar = normalize_by_cifar
    name = 'svhn_cropped'
    dataset_builder = tfds.builder(name, try_gcs=try_gcs)
    if is_training is None:
      is_training = split in ['train', tfds.Split.TRAIN]
    new_split = base.get_validation_percent_split(
        dataset_builder, validation_percent, split)
    super(SvhnDataset, self).__init__(
        name=name,
        dataset_builder=dataset_builder,
        split=new_split,
        is_training=is_training,
        shuffle_buffer_size=shuffle_buffer_size,
        num_parallel_parser_calls=num_parallel_parser_calls,
        download_data=download_data)
    def __init__(self,
                 name: str,
                 split: str,
                 validation_percent: float = 0.0,
                 shuffle_buffer_size: Optional[int] = None,
                 num_parallel_parser_calls: int = 64,
                 normalize: bool = True,
                 try_gcs: bool = False,
                 download_data: bool = False,
                 **unused_kwargs: Dict[str, Any]):
        """Create a CIFAR10 or CIFAR100 tf.data.Dataset builder.

    Args:
      name: the name of this dataset, either 'cifar10' or 'cifar100'.
      split: a dataset split, either a custom tfds.Split or one of the
        tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
        names.
      validation_percent: the percent of the training set to use as a validation
        set.
      shuffle_buffer_size: the number of examples to use in the shuffle buffer
        for tf.data.Dataset.shuffle().
      num_parallel_parser_calls: the number of parallel threads to use while
        preprocessing in tf.data.Dataset.map().
      normalize: whether or not to normalize each image by the CIFAR dataset
        mean and stddev.
      try_gcs: Whether or not to try to use the GCS stored versions of dataset
        files.
      download_data: Whether or not to download data before loading.
    """
        self._normalize = normalize
        dataset_builder = tfds.builder(name, try_gcs=try_gcs)
        split = base.get_validation_percent_split(dataset_builder,
                                                  validation_percent, split)
        super(_CifarDataset, self).__init__(
            name=name,
            dataset_builder=dataset_builder,
            split=split,
            shuffle_buffer_size=shuffle_buffer_size,
            num_parallel_parser_calls=num_parallel_parser_calls,
            fingerprint_key='id',
            download_data=download_data)
    def __init__(self,
                 name: str,
                 fingerprint_key: str,
                 split: str,
                 seed: Optional[Union[int, tf.Tensor]] = None,
                 validation_percent: float = 0.0,
                 shuffle_buffer_size: Optional[int] = None,
                 num_parallel_parser_calls: int = 64,
                 drop_remainder: bool = True,
                 normalize: bool = True,
                 try_gcs: bool = False,
                 download_data: bool = False,
                 use_bfloat16: bool = False,
                 aug_params: Optional[Dict[str, Any]] = None,
                 data_dir: Optional[str] = None,
                 is_training: Optional[bool] = None,
                 **unused_kwargs: Dict[str, Any]):
        """Create a CIFAR10 or CIFAR100 tf.data.Dataset builder.

    Args:
      name: the name of this dataset, either 'cifar10' or 'cifar100'.
      fingerprint_key: The name of the feature holding a string that will be
        used to create an element id using a fingerprinting function. If None,
        then `ds.enumerate()` is added before the `ds.map(preprocessing_fn)` is
        called and an `id` field is added to the example Dict.
      split: a dataset split, either a custom tfds.Split or one of the
        tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
        names.
      seed: the seed used as a source of randomness.
      validation_percent: the percent of the training set to use as a validation
        set.
      shuffle_buffer_size: the number of examples to use in the shuffle buffer
        for tf.data.Dataset.shuffle().
      num_parallel_parser_calls: the number of parallel threads to use while
        preprocessing in tf.data.Dataset.map().
      drop_remainder: whether or not to drop the last batch of data if the
        number of points is not exactly equal to the batch size. This option
        needs to be True for running on TPUs.
      normalize: whether or not to normalize each image by the CIFAR dataset
        mean and stddev.
      try_gcs: Whether or not to try to use the GCS stored versions of dataset
        files.
      download_data: Whether or not to download data before loading.
      use_bfloat16: Whether or not to load the data in bfloat16 or float32.
      aug_params: hyperparameters for the data augmentation pre-processing.
      data_dir: Directory to read/write data, which is passed to the tfds
        dataset_builder as the data_dir parameter.
      is_training: Whether or not the given `split` is the training split. Only
        required when the passed split is not one of ['train', 'validation',
        'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
    """
        self._normalize = normalize
        dataset_builder = tfds.builder(name,
                                       try_gcs=try_gcs,
                                       data_dir=data_dir)
        if is_training is None:
            is_training = split in ['train', tfds.Split.TRAIN]
        new_split = base.get_validation_percent_split(dataset_builder,
                                                      validation_percent,
                                                      split)
        super(_CifarDataset, self).__init__(
            name=name,
            dataset_builder=dataset_builder,
            split=new_split,
            seed=seed,
            is_training=is_training,
            shuffle_buffer_size=shuffle_buffer_size,
            num_parallel_parser_calls=num_parallel_parser_calls,
            drop_remainder=drop_remainder,
            fingerprint_key=fingerprint_key,
            download_data=download_data,
            cache=True)

        self._use_bfloat16 = use_bfloat16
        if aug_params is None:
            aug_params = {}
        self._adaptive_mixup = aug_params.get('adaptive_mixup', False)
        ensemble_size = aug_params.get('ensemble_size', 1)
        if self._adaptive_mixup and 'mixup_coeff' not in aug_params:
            # Hard target in the first epoch!
            aug_params['mixup_coeff'] = tf.ones([ensemble_size, 10])
        self._aug_params = aug_params
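
A sketch of driving the augmentation path above through aug_params, again assuming a public wrapper such as ub.datasets.Cifar10Dataset exposes these arguments; the dictionary keys come from the constructor body ('adaptive_mixup', 'ensemble_size'):

import uncertainty_baselines as ub

# With adaptive_mixup enabled and no 'mixup_coeff' provided, the constructor
# above initializes hard targets (tf.ones) for the first epoch.
train_builder = ub.datasets.Cifar10Dataset(
    split='train',
    aug_params={'adaptive_mixup': True, 'ensemble_size': 4},
    use_bfloat16=False)
train_dataset = train_builder.load(batch_size=128)
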
    def __init__(
        self,
        split: str,
        validation_percent: float = 0.0,
        shuffle_buffer_size: Optional[int] = 1,
        num_parallel_parser_calls: int = 1,
        try_gcs: bool = False,
        download_data: bool = False,
        data_dir: Optional[str] = None,
        is_training: Optional[bool] = None,
        use_bfloat16: bool = False,
        normalize_input: bool = False,
        image_height: int = 1024,
        image_width: int = 2048,
        one_hot: bool = False,
        include_file_name: bool = False,
    ):
        """Create an Cityscapes tf.data.Dataset builder.

    Args:
      split: a dataset split, either a custom tfds.Split or one of the
        tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
        names.
      validation_percent: the percent of the training set to use as a validation
        set.
      shuffle_buffer_size: the number of examples to use in the shuffle buffer
        for tf.data.Dataset.shuffle().
      num_parallel_parser_calls: the number of parallel threads to use while
        preprocessing in tf.data.Dataset.map().
      try_gcs: Whether or not to try to use the GCS stored versions of dataset
        files.
      download_data: Whether or not to download data before loading.
      data_dir: Directory to read/write data, which is passed to the tfds
        dataset_builder as the data_dir parameter.
      is_training: Whether or not the given `split` is the training split. Only
        required when the passed split is not one of ['train', 'validation',
        'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
      use_bfloat16: Whether or not to use bfloat16 or float32 images.
      normalize_input: Whether or not to normalize images by the ImageNet mean
        and stddev.
      image_height: The height of the image in pixels.
      image_width: The width of the image in pixels.
      one_hot: whether or not to use one-hot labels.
      include_file_name: Whether or not to include a string file_name field in
        each example. Since this field is a string, it is not compatible with
        TPUs.
    """
        name = 'cityscapes'
        dataset_builder = tfds.builder(name,
                                       try_gcs=try_gcs,
                                       data_dir=data_dir)
        if is_training is None:
            is_training = split in ['train', tfds.Split.TRAIN]
        new_split = base.get_validation_percent_split(
            dataset_builder,
            validation_percent,
            split,
            test_split=tfds.Split.VALIDATION)

        super().__init__(name=name,
                         dataset_builder=dataset_builder,
                         split=new_split,
                         is_training=is_training,
                         shuffle_buffer_size=shuffle_buffer_size,
                         num_parallel_parser_calls=num_parallel_parser_calls,
                         download_data=download_data)

        self._use_bfloat16 = use_bfloat16
        self._normalize_input = normalize_input
        self._image_height = image_height
        self._image_width = image_width
        self._one_hot = one_hot
        self._include_file_name = include_file_name
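
A usage sketch for the Cityscapes builder, assuming it is exported as ub.datasets.CityscapesDataset:

import uncertainty_baselines as ub

# Full-resolution Cityscapes with one-hot segmentation labels; file names are
# excluded because string fields are not TPU-compatible.
builder = ub.datasets.CityscapesDataset(
    split='train',
    image_height=1024,
    image_width=2048,
    one_hot=True,
    include_file_name=False)
dataset = builder.load(batch_size=8)
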
    def __init__(self,
                 split: str,
                 seed: Optional[Union[int, tf.Tensor]] = None,
                 validation_percent: float = 0.0,
                 shuffle_buffer_size: Optional[int] = 16384,
                 num_parallel_parser_calls: int = 64,
                 try_gcs: bool = False,
                 download_data: bool = False,
                 is_training: Optional[bool] = None,
                 preprocessing_type: str = 'resnet',
                 use_bfloat16: bool = False,
                 normalize_input: bool = False,
                 image_size: int = 224,
                 resnet_preprocessing_resize_method: Optional[str] = None,
                 ensemble_size: int = 1,
                 one_hot: bool = False,
                 mixup_params: Optional[Dict[str, Any]] = None,
                 run_mixup: bool = False,
                 **unused_kwargs: Dict[str, Any]):
        """Create an ImageNet tf.data.Dataset builder.

    Args:
      split: a dataset split, either a custom tfds.Split or one of the
        tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
        names.
      seed: the seed used as a source of randomness.
      validation_percent: the percent of the training set to use as a validation
        set.
      shuffle_buffer_size: the number of examples to use in the shuffle buffer
        for tf.data.Dataset.shuffle().
      num_parallel_parser_calls: the number of parallel threads to use while
        preprocessing in tf.data.Dataset.map().
      try_gcs: Whether or not to try to use the GCS stored versions of dataset
        files.
      download_data: Whether or not to download data before loading.
      is_training: Whether or not the given `split` is the training split. Only
        required when the passed split is not one of ['train', 'validation',
        'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
      preprocessing_type: Which type of preprocessing to apply, either
        'inception' or 'resnet'.
      use_bfloat16: Whether or not to use bfloat16 or float32 images.
      normalize_input: Whether or not to normalize images by the ImageNet mean
        and stddev.
      image_size: The size of the image in pixels.
      resnet_preprocessing_resize_method: Optional string for the resize method
        to use for resnet preprocessing.
      ensemble_size: `int` for number of ensemble members used in Mixup.
      one_hot: whether or not to use one-hot labels.
      mixup_params: hparams of mixup.
      run_mixup: An explicit flag for whether or not to run mixup if
        `mixup_params['mixup_alpha'] > 0`. By default, mixup will only be run in
        training mode if `mixup_params['mixup_alpha'] > 0`.
      **unused_kwargs: Ignored.
    """
        name = 'imagenet2012'
        dataset_builder = tfds.builder(name, try_gcs=try_gcs)
        if is_training is None:
            is_training = split in ['train', tfds.Split.TRAIN]
        new_split = base.get_validation_percent_split(
            dataset_builder,
            validation_percent,
            split,
            test_split=tfds.Split.VALIDATION)
        if preprocessing_type == 'inception':
            decoders = {
                'image': tfds.decode.SkipDecoding(),
            }
        else:
            decoders = None
        super(ImageNetDataset, self).__init__(
            name=name,
            dataset_builder=dataset_builder,
            split=new_split,
            is_training=is_training,
            shuffle_buffer_size=shuffle_buffer_size,
            num_parallel_parser_calls=num_parallel_parser_calls,
            fingerprint_key='file_name',
            download_data=download_data,
            decoders=decoders)
        self._preprocessing_type = preprocessing_type
        self._use_bfloat16 = use_bfloat16
        self._normalize_input = normalize_input
        self._image_size = image_size
        self._resnet_preprocessing_resize_method = resnet_preprocessing_resize_method
        self._run_mixup = run_mixup

        self.ensemble_size = ensemble_size
        self._one_hot = one_hot
        if mixup_params is None:
            mixup_params = {}
        self._mixup_params = mixup_params
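
A usage sketch for the ImageNet builder, assuming it is exported as ub.datasets.ImageNetDataset; the mixup_params key 'mixup_alpha' is taken from the run_mixup docstring above:

import uncertainty_baselines as ub

# Inception-style preprocessing with one-hot labels and mixup enabled.
train_builder = ub.datasets.ImageNetDataset(
    split='train',
    preprocessing_type='inception',
    image_size=224,
    one_hot=True,
    mixup_params={'mixup_alpha': 0.2})
train_dataset = train_builder.load(batch_size=256)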