def create_dataset(self,
                   batch_size=64,
                   drop_remainder=False,
                   shuffle=1000,
                   prefetch=tf.data.experimental.AUTOTUNE,
                   cache='',
                   parallel=None,
                   partition='train',
                   inc_labels=False,
                   seed=1) -> tf.data.Dataset:
    r""" Create a batched `tf.data.Dataset` for one partition of the data.

    Arguments:
      batch_size : number of consecutive elements combined into one batch.
      drop_remainder : if True, drop the last batch when it has fewer than
        `batch_size` elements.
      shuffle : size of the shuffle buffer (in single examples). If `None`
        or not greater than 0, shuffling is turned off.
      prefetch : buffer size given to `Dataset.prefetch`. If `None`,
        prefetching is turned off.
      cache : forwarded as `str(cache)` to `Dataset.cache`. If `None`,
        caching is turned off.
      parallel : number of parallel calls used for the image/label
        conversion map (`None` means sequential). The semi-supervised mask
        is always generated in a separate, sequential map because it draws
        from a stateful random generator.
      partition : {'train', 'valid', 'test'}
      inc_labels : a Boolean or Scalar. If True, return both image and
        label, otherwise, only image is returned.
        If a scalar is provided, it indicates the fraction of labelled
        data in the mask.
      seed : seed of the mask random generator and of the shuffle buffer.

    Return :
      tensorflow.data.Dataset :
        image - `(tf.float32, (None, 64, 64, 3))`
        label - `(tf.float32, (None, 66))`
        mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
      where, `mask=1` means labelled data, and `mask=0` means unlabelled
      data. When the mask is present each element is a dictionary
      `dict(inputs=(image, label), mask=mask)`.
    """
    X, y = get_partition(partition,
                         train=self.train,
                         valid=self.valid,
                         test=self.test)
    inc_labels = float(inc_labels)
    gen = tf.random.experimental.Generator.from_seed(seed=seed)

    def _convert(*data):
        # stateless conversion, safe to run with `num_parallel_calls`
        image = tf.cast(data[0], tf.float32)
        image = self.normalize_255(image)
        if inc_labels:
            return image, tf.cast(data[1], tf.float32)
        return image

    def _masking(image, label):
        # semi-supervised mask: True with probability `inc_labels`
        mask = gen.uniform(shape=(1,)) < inc_labels
        return dict(inputs=(image, label), mask=mask)

    ds = tf.data.Dataset.from_tensor_slices(X)
    if inc_labels:
        ds = tf.data.Dataset.zip((ds, tf.data.Dataset.from_tensor_slices(y)))
    # previously `parallel` was accepted but silently ignored
    ds = ds.map(_convert, num_parallel_calls=parallel)
    if 0. < inc_labels < 1.:
        # kept sequential: the generator `gen` is stateful
        ds = ds.map(_masking)
    if cache is not None:
        ds = ds.cache(str(cache))
    # shuffle must be called after cache
    if shuffle is not None and shuffle > 0:
        ds = ds.shuffle(int(shuffle), seed=seed, reshuffle_each_iteration=True)
    ds = ds.batch(batch_size, drop_remainder)
    if prefetch is not None:
        ds = ds.prefetch(prefetch)
    return ds
def create_dataset(self,
                   batch_size=64,
                   drop_remainder=False,
                   shuffle=1000,
                   prefetch=tf.data.experimental.AUTOTUNE,
                   cache='',
                   parallel=tf.data.experimental.AUTOTUNE,
                   partition='train',
                   inc_labels=True,
                   seed=1) -> tf.data.Dataset:
    r""" Create tensorflow dataset for train, valid and test.

    Images are read one at a time from `self.images`, divided by 255 and
    clipped to `[1e-6, 1 - 1e-6]`.

    Arguments:
      batch_size : number of consecutive elements combined into one batch.
      drop_remainder : if True, drop the last batch when it has fewer than
        `batch_size` elements.
      shuffle : size of the shuffle buffer. Shuffling is applied after
        batching, so the buffer counts batches. If `None` or not greater
        than 0, shuffling is turned off.
      prefetch : buffer size given to `Dataset.prefetch`. If `None`,
        prefetching is turned off.
      cache : forwarded as `str(cache)` to `Dataset.cache`. If `None`,
        caching is turned off.
      parallel : number of parallel calls of the per-batch map.
      partition : {'train', 'valid', 'test'}
      inc_labels : a Boolean or Scalar. If True, return both image and
        label, otherwise, only image is returned.
        If a scalar is provided, it indicates the fraction of labelled
        data in the mask.
      seed : seed of the random generator that draws the mask.

    Return :
      tensorflow.data.Dataset :
        image - `(tf.float32, (None, 64, 64, 3))`
        label - `(tf.float32, (None, 6))`
        mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
      where, `mask=1` means labelled data, and `mask=0` means unlabelled
      data. When the mask is present each element is a dictionary
      `dict(inputs=(image, label), mask=mask)`.
    """
    # element signature of the generator; a tuple is required when both
    # images and labels are produced
    image_spec = tf.TensorShape(self.images.shape[1:])
    label_spec = tf.TensorShape(self.factors.shape[1:])
    if inc_labels:
        element_types = (tf.float32, tf.float32)
        element_shapes = (image_spec, label_spec)
    else:
        element_types = tf.float32
        element_shapes = image_spec
    label_fraction = float(inc_labels)
    gen = tf.random.experimental.Generator.from_seed(seed=seed)

    # indices of the requested partition
    sample_indices = get_partition(
        partition,
        train=self.train_indices,
        valid=self.valid_indices,
        test=self.test_indices,
    )

    def iter_samples():
        # yields one example at a time, as float32 tensors
        for idx in sample_indices:
            pixels = tf.cast(self.images[idx], tf.float32)
            pixels = tf.clip_by_value(pixels / 255., 1e-6, 1. - 1e-6)
            if label_fraction:
                yield pixels, tf.cast(self.factors[idx], dtype=tf.float32)
            else:
                yield pixels

    def attach_mask(*batch):
        # runs on whole batches: optionally adds the semi-supervised mask
        if not label_fraction:
            return batch[0]
        images, labels = batch
        if not (0. < label_fraction < 1.):
            return images, labels
        n_samples = tf.shape(images)[0]
        mask = gen.uniform(shape=(n_samples, 1)) < label_fraction
        return dict(inputs=(images, labels), mask=mask)

    ds = tf.data.Dataset.from_generator(iter_samples,
                                        output_types=element_types,
                                        output_shapes=element_shapes)
    ds = ds.batch(batch_size, drop_remainder)
    ds = ds.map(attach_mask, parallel)
    if cache is not None:
        ds = ds.cache(str(cache))
    if shuffle is not None and shuffle > 0:
        ds = ds.shuffle(shuffle)
    if prefetch is not None:
        ds = ds.prefetch(prefetch)
    return ds
def create_dataset(self,
                   batch_size=64,
                   drop_remainder=False,
                   shuffle=1000,
                   prefetch=tf.data.experimental.AUTOTUNE,
                   cache='',
                   parallel=tf.data.experimental.AUTOTUNE,
                   partition='train',
                   inc_labels=True,
                   seed=1) -> tf.data.Dataset:
    r""" Create a batched `tf.data.Dataset` from the STL10 binary files.

    Arguments:
      batch_size : number of consecutive elements combined into one batch.
      drop_remainder : if True, drop the last batch when it has fewer than
        `batch_size` elements.
      shuffle : shuffle buffer size per source file; the actual buffer is
        `int(shuffle) * number_of_image_files`. If `None` or not greater
        than 0, shuffling is turned off.
      prefetch : buffer size given to `Dataset.prefetch`. If `None`,
        prefetching is turned off.
      cache : forwarded as `str(cache)` to `Dataset.cache`. If `None`,
        caching is turned off.
      parallel : number of parallel calls of the image resize map.
      partition : {'train', 'train_labelled', 'test', 'unlabeled',
        'unlabelled'}
        - 'train' : combination of both train and unlabelled images
        - 'train_labelled' : only the labelled train data
      inc_labels : a Boolean or Scalar. If True, return both image and
        label, otherwise, only image is returned.
        If a scalar is provided, it indicates the fraction of labelled
        data in the mask.
      seed : seed of the random generator that draws the mask.

    Return :
      tensorflow.data.Dataset :
        image - `(tf.float32, (None, 64, 64, 3))`
        label - `(tf.float32, (None, 10))`
        mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1., or if any
          of the loaded label arrays is all zeros (unlabelled images)
      where, `mask=1` means labelled data, and `mask=0` means unlabelled
      data. When the mask is present each element is a dictionary
      `dict(inputs=(image, label), mask=mask)`.
    """
    image_size = self.image_size
    # a size of 96 disables resizing
    # NOTE(review): presumably 96 is the native STL10 resolution - confirm
    if isinstance(image_size, Number) and image_size == 96:
        image_size = None
    ### select partition
    images_path, labels_path = get_partition(
        partition,
        train=((self.bin_files['train_X'], self.bin_files['unlabeled_X']),
               self.bin_files['train_y']),
        train_labelled=(self.bin_files['train_X'], self.bin_files['train_y']),
        test=(self.bin_files['test_X'], self.bin_files['test_y']),
        unlabeled=(self.bin_files['unlabeled_X'], None),
        unlabelled=(self.bin_files['unlabeled_X'], None),
    )
    # one uint8 array per image file ('train' provides two files)
    X = [
        np.reshape(np.fromfile(path, dtype=np.uint8),
                   (-1,) + STL10.IMAGE_SHAPE)
        for path in tf.nest.flatten(images_path)
    ]
    is_unlabelled = (labels_path is None)
    inc_labels = float(inc_labels)
    gen = tf.random.experimental.Generator.from_seed(seed=seed)
    ### load the labels, all-zeros rows stand for unlabelled images
    y = None
    if inc_labels:
        if is_unlabelled:
            y = [
                np.zeros(shape=(X[0].shape[0], self.n_labels),
                         dtype=np.float32)
            ]
        else:
            # labels in the file start at 1, shift them to start at 0
            y = np.fromfile(labels_path, dtype=np.uint8) - 1
            y = [one_hot(y, self.n_labels).astype(np.float32)]
            if len(X) == 2:  # combination of train and unlabelled set
                y.append(
                    np.zeros(shape=(X[1].shape[0], self.n_labels),
                             dtype=np.float32))
        assert len(y) == len(X)

    ### read and resize the data
    def resize(img):
        img = tf.cast(img, tf.float32)
        img = self.normalize_255(img)
        # swap the first and last axes
        # NOTE(review): presumably converts the stored channel-first,
        # column-major layout to (height, width, channel) - confirm
        img = tf.transpose(img, perm=(2, 1, 0))
        if image_size is not None:
            img = tf.image.resize(img, (image_size, image_size),
                                  preserve_aspect_ratio=True,
                                  antialias=False)
        return img

    def masking(image, label):
        # labelled only if selected by the random draw AND the label is
        # not the all-zeros placeholder
        mask = tf.logical_and(
            gen.uniform(shape=(1,)) < inc_labels,
            tf.reduce_sum(label) > 0.)
        return dict(inputs=(image, label), mask=mask)

    ### processing
    datasets = None
    # `y` only exists when labels are requested; the previous code iterated
    # over it unconditionally and raised NameError for `inc_labels=False`
    must_masking = bool(inc_labels) and any(np.all(i == 0.) for i in y)
    for x_i, y_i in zip(X, y if inc_labels else X):
        images = tf.data.Dataset.from_tensor_slices(x_i).map(resize, parallel)
        if inc_labels:
            labels = tf.data.Dataset.from_tensor_slices(y_i)
            images = tf.data.Dataset.zip((images, labels))
            if 0. < inc_labels < 1. or must_masking:  # semi-supervised mask
                images = images.map(masking)
        datasets = images if datasets is None else datasets.concatenate(
            images)
    # cache data
    if cache is not None:
        datasets = datasets.cache(str(cache))
    # shuffle must be called after cache
    if shuffle is not None and shuffle > 0:
        datasets = datasets.shuffle(int(shuffle) * len(X))
    datasets = datasets.batch(batch_size, drop_remainder)
    if prefetch is not None:
        datasets = datasets.prefetch(prefetch)
    return datasets
def create_dataset(self,
                   batch_size=64,
                   drop_remainder=False,
                   shuffle=1000,
                   prefetch=tf.data.experimental.AUTOTUNE,
                   cache='',
                   parallel=tf.data.experimental.AUTOTUNE,
                   partition='train',
                   inc_labels=True,
                   seed=1) -> tf.data.Dataset:
    r""" Create a batched `tf.data.Dataset` of JPEG images and attributes.

    The default argument will downsize and crop the image to square size
    (64, 64)

    Arguments:
      batch_size : number of consecutive elements combined into one batch.
      drop_remainder : if True, drop the last batch when it has fewer than
        `batch_size` elements.
      shuffle : size of the shuffle buffer (in single examples). If `None`
        or not greater than 0, shuffling is turned off.
      prefetch : buffer size given to `Dataset.prefetch`. If `None`,
        prefetching is turned off.
      cache : forwarded as `str(cache)` to `Dataset.cache`. If `None`,
        caching is turned off.
      parallel : number of parallel calls of the image reading map.
      partition : {'train', 'valid', 'test'}
      inc_labels : a Boolean or Scalar. If True, return both image and
        label, otherwise, only image is returned.
        If a scalar is provided, it indicates the fraction of labelled
        data in the mask.
      seed : seed of the random generator that draws the mask.

    Return :
      tensorflow.data.Dataset :
        image - `(tf.float32, (None, 64, 64, 3))`
        label - `(tf.float32, (None, 40))`
        mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
      where, `mask=1` means labelled data, and `mask=0` means unlabelled
      data. When the mask is present each element is a dictionary
      `dict(inputs=(image, label), mask=mask)`.
    """
    source_shape = self.original_shape
    target_size = self.image_size
    if target_size is not None:
        target_size = int(target_size)
        # height that results from scaling the width down to `target_size`
        scale = float(target_size) / source_shape[1]
        resized_height = int(scale * source_shape[0])
        # square window of side `target_size`, vertically centred
        crop_top = (resized_height - target_size) // 2
    label_fraction = float(inc_labels)
    gen = tf.random.experimental.Generator.from_seed(seed=seed)

    def load_image(path):
        raw = tf.io.read_file(path)
        pixels = tf.io.decode_jpeg(raw)
        pixels.set_shape(source_shape)
        pixels = self.normalize_255(tf.cast(pixels, tf.float32))
        if target_size is None:
            return pixels
        pixels = tf.image.resize(pixels, (resized_height, target_size),
                                 preserve_aspect_ratio=True,
                                 antialias=False)
        if self.square_image:
            pixels = tf.image.crop_to_bounding_box(pixels, crop_top, 0,
                                                   target_size, target_size)
        return pixels

    def attach_mask(image, label):
        # semi-supervised mask: True with probability `label_fraction`
        is_labelled = gen.uniform(shape=(1,)) < label_fraction
        return dict(inputs=(image, label), mask=is_labelled)

    ### select partition
    file_paths, attributes = get_partition(
        partition,
        train=(self.train_files, self.train_attr),
        valid=(self.valid_files, self.valid_attr),
        test=(self.test_files, self.test_attr),
    )
    # convert [-1, 1] to [0., 1.]
    attributes = (attributes + 1.) / 2
    ds = tf.data.Dataset.from_tensor_slices(file_paths)
    ds = ds.map(load_image, parallel)
    if label_fraction:
        label_ds = tf.data.Dataset.from_tensor_slices(attributes)
        ds = tf.data.Dataset.zip((ds, label_ds))
        if 0. < label_fraction < 1.:
            ds = ds.map(attach_mask)
    if cache is not None:
        ds = ds.cache(str(cache))
    # shuffle must be called after cache
    if shuffle is not None and shuffle > 0:
        ds = ds.shuffle(int(shuffle))
    ds = ds.batch(batch_size, drop_remainder)
    if prefetch is not None:
        ds = ds.prefetch(prefetch)
    return ds