def __init__(self,
             split: str,
             validation_percent: float = 0.0,
             shuffle_buffer_size: Optional[int] = None,
             num_parallel_parser_calls: int = 64,
             try_gcs: bool = False,
             download_data: bool = False,
             **unused_kwargs: Any):
  """Create an MNIST tf.data.Dataset builder.

  Args:
    split: a dataset split, either a custom tfds.Split or one of the
      tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
      names.
    validation_percent: the percent of the training set to use as a validation
      set.
    shuffle_buffer_size: the number of examples to use in the shuffle buffer
      for tf.data.Dataset.shuffle().
    num_parallel_parser_calls: the number of parallel threads to use while
      preprocessing in tf.data.Dataset.map().
    try_gcs: Whether or not to try to use the GCS stored versions of dataset
      files.
    download_data: Whether or not to download data before loading.
    **unused_kwargs: Ignored.
  """
  name = 'mnist'
  dataset_builder = tfds.builder(name, try_gcs=try_gcs)
  # Carve a validation set out of the training split if requested.
  split = base.get_validation_percent_split(dataset_builder,
                                            validation_percent, split)
  super().__init__(
      name=name,
      dataset_builder=dataset_builder,
      split=split,
      shuffle_buffer_size=shuffle_buffer_size,
      num_parallel_parser_calls=num_parallel_parser_calls,
      download_data=download_data)
def __init__(self,
             split: str,
             validation_percent: float = 0.0,
             shuffle_buffer_size: Optional[int] = None,
             num_parallel_parser_calls: int = 64,
             drop_remainder: bool = True,
             try_gcs: bool = False,
             download_data: bool = False,
             data_dir: Optional[str] = None,
             normalize_by_cifar: bool = False,
             is_training: Optional[bool] = None):
  """Create an SVHN tf.data.Dataset builder.

  Args:
    split: a dataset split, either a custom tfds.Split or one of the
      tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
      names.
    validation_percent: the percent of the training set to use as a
      validation set.
    shuffle_buffer_size: the number of examples to use in the shuffle buffer
      for tf.data.Dataset.shuffle().
    num_parallel_parser_calls: the number of parallel threads to use while
      preprocessing in tf.data.Dataset.map().
    drop_remainder: whether or not to drop the last batch of data if the
      number of points is not exactly equal to the batch size. This option
      needs to be True for running on TPUs.
    try_gcs: Whether or not to try to use the GCS stored versions of dataset
      files.
    download_data: Whether or not to download data before loading.
    data_dir: Directory to read/write data, that is passed to the tfds
      dataset_builder as a data_dir parameter.
    normalize_by_cifar: whether or not to normalize each image by the CIFAR
      dataset mean and stddev.
    is_training: Whether or not the given `split` is the training split. Only
      required when the passed split is not one of ['train', 'validation',
      'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
  """
  self._normalize_by_cifar = normalize_by_cifar
  builder = tfds.builder('svhn_cropped', try_gcs=try_gcs, data_dir=data_dir)
  if is_training is None:
    # Infer training mode from the split name when not given explicitly.
    is_training = split in ['train', tfds.Split.TRAIN]
  resolved_split = base.get_validation_percent_split(
      builder, validation_percent, split)
  super().__init__(
      name='svhn_cropped',
      dataset_builder=builder,
      split=resolved_split,
      is_training=is_training,
      shuffle_buffer_size=shuffle_buffer_size,
      num_parallel_parser_calls=num_parallel_parser_calls,
      drop_remainder=drop_remainder,
      download_data=download_data)
def __init__(
    self,
    name: str,
    fingerprint_key: str,
    split: str,
    validation_percent: float = 0.0,
    shuffle_buffer_size: Optional[int] = None,
    num_parallel_parser_calls: int = 64,
    drop_remainder: bool = True,
    normalize: bool = True,
    try_gcs: bool = False,
    download_data: bool = False,
    **unused_kwargs: Dict[str, Any]):
  """Create a CIFAR10 or CIFAR100 tf.data.Dataset builder.

  Args:
    name: the name of this dataset, either 'cifar10' or 'cifar100'.
    fingerprint_key: The name of the feature holding a string that will be
      used to create an element id using a fingerprinting function. If None,
      then `ds.enumerate()` is added before the `ds.map(preprocessing_fn)` is
      called and an `id` field is added to the example Dict.
    split: a dataset split, either a custom tfds.Split or one of the
      tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
      names.
    validation_percent: the percent of the training set to use as a validation
      set.
    shuffle_buffer_size: the number of examples to use in the shuffle buffer
      for tf.data.Dataset.shuffle().
    num_parallel_parser_calls: the number of parallel threads to use while
      preprocessing in tf.data.Dataset.map().
    drop_remainder: whether or not to drop the last batch of data if the
      number of points is not exactly equal to the batch size. This option
      needs to be True for running on TPUs.
    normalize: whether or not to normalize each image by the CIFAR dataset
      mean and stddev.
    try_gcs: Whether or not to try to use the GCS stored versions of dataset
      files.
    download_data: Whether or not to download data before loading.
    **unused_kwargs: Ignored.
  """
  self._normalize = normalize
  dataset_builder = tfds.builder(name, try_gcs=try_gcs)
  # Carve a validation set out of the training split if requested.
  split = base.get_validation_percent_split(
      dataset_builder, validation_percent, split)
  super().__init__(
      name=name,
      dataset_builder=dataset_builder,
      split=split,
      shuffle_buffer_size=shuffle_buffer_size,
      num_parallel_parser_calls=num_parallel_parser_calls,
      drop_remainder=drop_remainder,
      fingerprint_key=fingerprint_key,
      download_data=download_data)
def __init__(self,
             split: str,
             seed: Optional[Union[int, tf.Tensor]] = None,
             validation_percent: float = 0.0,
             shuffle_buffer_size: Optional[int] = None,
             num_parallel_parser_calls: int = 64,
             try_gcs: bool = False,
             download_data: bool = False,
             data_dir: Optional[str] = None,
             is_training: Optional[bool] = None):
  """Create a Places-365 tf.data.Dataset builder.

  Args:
    split: a dataset split, either a custom tfds.Split or one of the
      tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
      names.
    seed: the seed used as a source of randomness.
    validation_percent: the percent of the training set to use as a
      validation set.
    shuffle_buffer_size: the number of examples to use in the shuffle buffer
      for tf.data.Dataset.shuffle().
    num_parallel_parser_calls: the number of parallel threads to use while
      preprocessing in tf.data.Dataset.map().
    try_gcs: Whether or not to try to use the GCS stored versions of dataset
      files.
    download_data: Whether or not to download data before loading.
    data_dir: Directory to read/write data, that is passed to the tfds
      dataset_builder as a data_dir parameter.
    is_training: Whether or not the given `split` is the training split. Only
      required when the passed split is not one of ['train', 'validation',
      'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
  """
  builder = tfds.builder('places365_small', try_gcs=try_gcs,
                         data_dir=data_dir)
  if is_training is None:
    # Infer training mode from the split name when not given explicitly.
    is_training = split in ['train', tfds.Split.TRAIN]
  # The tfds VALIDATION split is used as the test split here (test_split=).
  resolved_split = base.get_validation_percent_split(
      builder, validation_percent, split, test_split=tfds.Split.VALIDATION)
  super().__init__(
      name='places365_small',
      dataset_builder=builder,
      split=resolved_split,
      seed=seed,
      is_training=is_training,
      shuffle_buffer_size=shuffle_buffer_size,
      num_parallel_parser_calls=num_parallel_parser_calls,
      download_data=download_data)
def __init__(self,
             split: Union[float, str],
             seed: Optional[Union[int, tf.Tensor]] = None,
             validation_percent: float = 0.0,
             shuffle_buffer_size: Optional[int] = None,
             num_parallel_parser_calls: int = 64,
             data_dir: Optional[str] = None,
             is_training: Optional[bool] = None):
  """Create a Criteo tf.data.Dataset builder.

  Args:
    split: a dataset split, either a custom tfds.Split or one of the
      tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
      names. For Criteo it can also be a float to represent the level of data
      augmentation.
    seed: the seed used as a source of randomness.
    validation_percent: the percent of the training set to use as a
      validation set.
    shuffle_buffer_size: the number of examples to use in the shuffle buffer
      for tf.data.Dataset.shuffle().
    num_parallel_parser_calls: the number of parallel threads to use while
      preprocessing in tf.data.Dataset.map().
    data_dir: Path to a directory containing the Criteo datasets, with
      filenames train-*-of-*', 'validate.tfr', 'test.tfr'.
    is_training: Whether or not the given `split` is the training split. Only
      required when the passed split is not one of ['train', 'validation',
      'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
  """
  # A float `split` encodes a corruption level: remember it for use during
  # preprocessing and evaluate on the test set instead.
  self._corruption_level = split if isinstance(split, float) else None
  if self._corruption_level is not None:
    split = 'test'
  dataset_builder = _CriteoDatasetBuilder(data_dir=data_dir)
  if is_training is None:
    # Infer training mode from the split name when not given explicitly.
    is_training = split in ['train', tfds.Split.TRAIN]
  super().__init__(
      name='criteo',
      dataset_builder=dataset_builder,
      split=base.get_validation_percent_split(dataset_builder,
                                              validation_percent, split),
      seed=seed,
      is_training=is_training,
      shuffle_buffer_size=shuffle_buffer_size,
      num_parallel_parser_calls=num_parallel_parser_calls,
      download_data=False)
def __init__(
    self,
    split: str,
    validation_percent: float = 0.0,
    shuffle_buffer_size: Optional[int] = None,
    num_parallel_parser_calls: int = 64,
    try_gcs: bool = False,
    download_data: bool = False,
    normalize_by_cifar: bool = False,
    is_training: Optional[bool] = None,
    **unused_kwargs: Any):
  """Create an SVHN tf.data.Dataset builder.

  Args:
    split: a dataset split, either a custom tfds.Split or one of the
      tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
      names.
    validation_percent: the percent of the training set to use as a validation
      set.
    shuffle_buffer_size: the number of examples to use in the shuffle buffer
      for tf.data.Dataset.shuffle().
    num_parallel_parser_calls: the number of parallel threads to use while
      preprocessing in tf.data.Dataset.map().
    try_gcs: Whether or not to try to use the GCS stored versions of dataset
      files.
    download_data: Whether or not to download data before loading.
    normalize_by_cifar: whether or not to normalize each image by the CIFAR
      dataset mean and stddev.
    is_training: Whether or not the given `split` is the training split. Only
      required when the passed split is not one of ['train', 'validation',
      'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
    **unused_kwargs: Ignored.
  """
  self._normalize_by_cifar = normalize_by_cifar
  name = 'svhn_cropped'
  dataset_builder = tfds.builder(name, try_gcs=try_gcs)
  if is_training is None:
    # Infer training mode from the split name when not given explicitly.
    is_training = split in ['train', tfds.Split.TRAIN]
  new_split = base.get_validation_percent_split(
      dataset_builder, validation_percent, split)
  super().__init__(
      name=name,
      dataset_builder=dataset_builder,
      split=new_split,
      is_training=is_training,
      shuffle_buffer_size=shuffle_buffer_size,
      num_parallel_parser_calls=num_parallel_parser_calls,
      download_data=download_data)
def __init__(self,
             name: str,
             split: str,
             validation_percent: float = 0.0,
             shuffle_buffer_size: Optional[int] = None,
             num_parallel_parser_calls: int = 64,
             normalize: bool = True,
             try_gcs: bool = False,
             download_data: bool = False,
             **unused_kwargs: Dict[str, Any]):
  """Create a CIFAR10 or CIFAR100 tf.data.Dataset builder.

  Args:
    name: the name of this dataset, either 'cifar10' or 'cifar100'.
    split: a dataset split, either a custom tfds.Split or one of the
      tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
      names.
    validation_percent: the percent of the training set to use as a validation
      set.
    shuffle_buffer_size: the number of examples to use in the shuffle buffer
      for tf.data.Dataset.shuffle().
    num_parallel_parser_calls: the number of parallel threads to use while
      preprocessing in tf.data.Dataset.map().
    normalize: whether or not to normalize each image by the CIFAR dataset
      mean and stddev.
    try_gcs: Whether or not to try to use the GCS stored versions of dataset
      files.
    download_data: Whether or not to download data before loading.
    **unused_kwargs: Ignored.
  """
  self._normalize = normalize
  dataset_builder = tfds.builder(name, try_gcs=try_gcs)
  # Carve a validation set out of the training split if requested.
  split = base.get_validation_percent_split(dataset_builder,
                                            validation_percent, split)
  super().__init__(
      name=name,
      dataset_builder=dataset_builder,
      split=split,
      shuffle_buffer_size=shuffle_buffer_size,
      num_parallel_parser_calls=num_parallel_parser_calls,
      fingerprint_key='id',
      download_data=download_data)
def __init__(self,
             name: str,
             fingerprint_key: str,
             split: str,
             seed: Optional[Union[int, tf.Tensor]] = None,
             validation_percent: float = 0.0,
             shuffle_buffer_size: Optional[int] = None,
             num_parallel_parser_calls: int = 64,
             drop_remainder: bool = True,
             normalize: bool = True,
             try_gcs: bool = False,
             download_data: bool = False,
             use_bfloat16: bool = False,
             aug_params: Optional[Dict[str, Any]] = None,
             data_dir: Optional[str] = None,
             is_training: Optional[bool] = None,
             **unused_kwargs: Dict[str, Any]):
  """Create a CIFAR10 or CIFAR100 tf.data.Dataset builder.

  Args:
    name: the name of this dataset, either 'cifar10' or 'cifar100'.
    fingerprint_key: The name of the feature holding a string that will be
      used to create an element id using a fingerprinting function. If None,
      then `ds.enumerate()` is added before the `ds.map(preprocessing_fn)` is
      called and an `id` field is added to the example Dict.
    split: a dataset split, either a custom tfds.Split or one of the
      tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
      names.
    seed: the seed used as a source of randomness.
    validation_percent: the percent of the training set to use as a validation
      set.
    shuffle_buffer_size: the number of examples to use in the shuffle buffer
      for tf.data.Dataset.shuffle().
    num_parallel_parser_calls: the number of parallel threads to use while
      preprocessing in tf.data.Dataset.map().
    drop_remainder: whether or not to drop the last batch of data if the
      number of points is not exactly equal to the batch size. This option
      needs to be True for running on TPUs.
    normalize: whether or not to normalize each image by the CIFAR dataset
      mean and stddev.
    try_gcs: Whether or not to try to use the GCS stored versions of dataset
      files.
    download_data: Whether or not to download data before loading.
    use_bfloat16: Whether or not to load the data in bfloat16 or float32.
    aug_params: hyperparameters for the data augmentation pre-processing.
    data_dir: Directory to read/write data, that is passed to the tfds
      dataset_builder as a data_dir parameter.
    is_training: Whether or not the given `split` is the training split. Only
      required when the passed split is not one of ['train', 'validation',
      'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
    **unused_kwargs: Ignored.
  """
  self._normalize = normalize
  dataset_builder = tfds.builder(name, try_gcs=try_gcs, data_dir=data_dir)
  if is_training is None:
    # Infer training mode from the split name when not given explicitly.
    is_training = split in ['train', tfds.Split.TRAIN]
  new_split = base.get_validation_percent_split(dataset_builder,
                                                validation_percent, split)
  super().__init__(
      name=name,
      dataset_builder=dataset_builder,
      split=new_split,
      seed=seed,
      is_training=is_training,
      shuffle_buffer_size=shuffle_buffer_size,
      num_parallel_parser_calls=num_parallel_parser_calls,
      drop_remainder=drop_remainder,
      fingerprint_key=fingerprint_key,
      download_data=download_data,
      cache=True)

  self._use_bfloat16 = use_bfloat16
  if aug_params is None:
    aug_params = {}
  self._adaptive_mixup = aug_params.get('adaptive_mixup', False)
  ensemble_size = aug_params.get('ensemble_size', 1)
  if self._adaptive_mixup and 'mixup_coeff' not in aug_params:
    # Hard target in the first epoch!
    # NOTE(review): the class count is hard-coded to 10, which only matches
    # cifar10 — confirm whether adaptive mixup is ever used with cifar100.
    aug_params['mixup_coeff'] = tf.ones([ensemble_size, 10])
  self._aug_params = aug_params
def __init__(
    self,
    split: str,
    validation_percent: float = 0.0,
    shuffle_buffer_size: Optional[int] = 1,
    num_parallel_parser_calls: int = 1,
    try_gcs: bool = False,
    download_data: bool = False,
    data_dir: Optional[str] = None,
    is_training: Optional[bool] = None,
    use_bfloat16: bool = False,
    normalize_input: bool = False,
    image_height: int = 1024,
    image_width: int = 2048,
    one_hot: bool = False,
    include_file_name: bool = False,
):
  """Create a Cityscapes tf.data.Dataset builder.

  Args:
    split: a dataset split, either a custom tfds.Split or one of the
      tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
      names.
    validation_percent: the percent of the training set to use as a
      validation set.
    shuffle_buffer_size: the number of examples to use in the shuffle buffer
      for tf.data.Dataset.shuffle().
    num_parallel_parser_calls: the number of parallel threads to use while
      preprocessing in tf.data.Dataset.map().
    try_gcs: Whether or not to try to use the GCS stored versions of dataset
      files.
    download_data: Whether or not to download data before loading.
    data_dir: Directory to read/write data, that is passed to the tfds
      dataset_builder as a data_dir parameter.
    is_training: Whether or not the given `split` is the training split. Only
      required when the passed split is not one of ['train', 'validation',
      'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
    use_bfloat16: Whether or not to use bfloat16 or float32 images.
    normalize_input: Whether or not to normalize images by the ImageNet mean
      and stddev.
    image_height: The height of the image in pixels.
    image_width: The width of the image in pixels.
    one_hot: whether or not to use one-hot labels.
    include_file_name: Whether or not to include a string file_name field in
      each example. Since this field is a string, it is not compatible with
      TPUs.
  """
  builder = tfds.builder('cityscapes', try_gcs=try_gcs, data_dir=data_dir)
  if is_training is None:
    # Infer training mode from the split name when not given explicitly.
    is_training = split in ['train', tfds.Split.TRAIN]
  # The tfds VALIDATION split is used as the test split here (test_split=).
  resolved_split = base.get_validation_percent_split(
      builder, validation_percent, split, test_split=tfds.Split.VALIDATION)
  super().__init__(
      name='cityscapes',
      dataset_builder=builder,
      split=resolved_split,
      is_training=is_training,
      shuffle_buffer_size=shuffle_buffer_size,
      num_parallel_parser_calls=num_parallel_parser_calls,
      download_data=download_data)

  self._use_bfloat16 = use_bfloat16
  self._normalize_input = normalize_input
  self._image_height = image_height
  self._image_width = image_width
  self._one_hot = one_hot
  self._include_file_name = include_file_name
def __init__(self,
             split: str,
             seed: Optional[Union[int, tf.Tensor]] = None,
             validation_percent: float = 0.0,
             shuffle_buffer_size: Optional[int] = 16384,
             num_parallel_parser_calls: int = 64,
             try_gcs: bool = False,
             download_data: bool = False,
             is_training: Optional[bool] = None,
             preprocessing_type: str = 'resnet',
             use_bfloat16: bool = False,
             normalize_input: bool = False,
             image_size: int = 224,
             resnet_preprocessing_resize_method: Optional[str] = None,
             ensemble_size: int = 1,
             one_hot: bool = False,
             mixup_params: Optional[Dict[str, Any]] = None,
             run_mixup: bool = False,
             **unused_kwargs: Dict[str, Any]):
  """Create an ImageNet tf.data.Dataset builder.

  Args:
    split: a dataset split, either a custom tfds.Split or one of the
      tfds.Split enums [TRAIN, VALIDATION, TEST] or their lowercase string
      names.
    seed: the seed used as a source of randomness.
    validation_percent: the percent of the training set to use as a validation
      set.
    shuffle_buffer_size: the number of examples to use in the shuffle buffer
      for tf.data.Dataset.shuffle().
    num_parallel_parser_calls: the number of parallel threads to use while
      preprocessing in tf.data.Dataset.map().
    try_gcs: Whether or not to try to use the GCS stored versions of dataset
      files.
    download_data: Whether or not to download data before loading.
    is_training: Whether or not the given `split` is the training split. Only
      required when the passed split is not one of ['train', 'validation',
      'test', tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST].
    preprocessing_type: Which type of preprocessing to apply, either
      'inception' or 'resnet'.
    use_bfloat16: Whether or not to use bfloat16 or float32 images.
    normalize_input: Whether or not to normalize images by the ImageNet mean
      and stddev.
    image_size: The size of the image in pixels.
    resnet_preprocessing_resize_method: Optional string for the resize method
      to use for resnet preprocessing.
    ensemble_size: `int` for number of ensemble members used in Mixup.
    one_hot: whether or not to use one-hot labels.
    mixup_params: hparams of mixup.
    run_mixup: An explicit flag of whether or not to run mixup if
      `mixup_params['mixup_alpha'] > 0`. By default, mixup will only be run
      in training mode if `mixup_params['mixup_alpha'] > 0`.
    **unused_kwargs: Ignored.
  """
  name = 'imagenet2012'
  dataset_builder = tfds.builder(name, try_gcs=try_gcs)
  if is_training is None:
    # Infer training mode from the split name when not given explicitly.
    is_training = split in ['train', tfds.Split.TRAIN]
  # The tfds VALIDATION split is used as the test split here (test_split=).
  new_split = base.get_validation_percent_split(
      dataset_builder,
      validation_percent,
      split,
      test_split=tfds.Split.VALIDATION)
  if preprocessing_type == 'inception':
    # Inception preprocessing decodes the raw image bytes itself.
    decoders = {
        'image': tfds.decode.SkipDecoding(),
    }
  else:
    decoders = None
  super().__init__(
      name=name,
      dataset_builder=dataset_builder,
      split=new_split,
      is_training=is_training,
      shuffle_buffer_size=shuffle_buffer_size,
      num_parallel_parser_calls=num_parallel_parser_calls,
      fingerprint_key='file_name',
      download_data=download_data,
      decoders=decoders)

  self._preprocessing_type = preprocessing_type
  self._use_bfloat16 = use_bfloat16
  self._normalize_input = normalize_input
  self._image_size = image_size
  self._resnet_preprocessing_resize_method = resnet_preprocessing_resize_method
  self._run_mixup = run_mixup

  self.ensemble_size = ensemble_size
  self._one_hot = one_hot
  if mixup_params is None:
    mixup_params = {}
  self._mixup_params = mixup_params