Exemple #1
0
    def create_dataset(self,
                       batch_size=64,
                       drop_remainder=False,
                       shuffle=1000,
                       prefetch=tf.data.experimental.AUTOTUNE,
                       cache='',
                       parallel=None,
                       partition='train',
                       inc_labels=False,
                       seed=1) -> tf.data.Dataset:
        r""" Create a `tf.data.Dataset` for one partition of the data.

        Arguments:
          batch_size : number of consecutive elements combined in one batch.
          drop_remainder : if True, drop the last batch when it has fewer than
            `batch_size` elements.
          shuffle : size of the shuffle buffer. If `None` or smaller or equal
            0, turn off shuffling.
          prefetch : buffer size given to `Dataset.prefetch`. If `None`, turn
            off prefetching.
          cache : converted with `str` and given to `Dataset.cache` (an empty
            string caches in memory). If `None`, turn off caching.
          parallel : number of elements processed in parallel by the
            preprocessing `map` (`num_parallel_calls`). If `None` (default),
            elements are processed sequentially.
          partition : {'train', 'valid', 'test'}
          inc_labels : a Boolean or Scalar. If True, return both image and
            label, otherwise, only image is returned.
            If a scalar is provided, it indicate the percent of labelled data
            in the mask.
          seed : seed of the random generator that draws the mask, and of the
            shuffling.

        Return :
          tensorflow.data.Dataset, its elements are one of:
            - `image` if `inc_labels` is 0 (or False)
            - `dict(inputs=(image, label), mask=mask)` if 0. < inc_labels < 1.
            - `(image, label)` otherwise
          with
            image - `(tf.float32, (None, 64, 64, 3))`
            label - `(tf.float32, (None, 66))`
            mask  - `(tf.bool, (None, 1))`
          where, `mask=1` mean labelled data, and `mask=0` for unlabelled data
        """
        X, y = get_partition(partition,
                             train=self.train,
                             valid=self.valid,
                             test=self.test)
        # True/False become 1.0/0.0, a scalar keeps its value
        inc_labels = float(inc_labels)
        gen = tf.random.experimental.Generator.from_seed(seed=seed)

        def _process(*data):
            # `data` is (image,) or (image, label) depending on `inc_labels`
            image = tf.cast(data[0], tf.float32)
            image = self.normalize_255(image)
            if inc_labels:
                label = tf.cast(data[1], tf.float32)
                if 0. < inc_labels < 1.:  # semi-supervised mask
                    # one draw per example, labelled with probability
                    # `inc_labels`
                    mask = gen.uniform(shape=(1, )) < inc_labels
                    return dict(inputs=(image, label), mask=mask)
                return image, label
            return image

        ds = tf.data.Dataset.from_tensor_slices(X)
        if inc_labels:
            ds = tf.data.Dataset.zip(
                (ds, tf.data.Dataset.from_tensor_slices(y)))
        # honor `parallel` (it was silently ignored before); `None` keeps the
        # sequential behaviour
        ds = ds.map(_process, num_parallel_calls=parallel)
        if cache is not None:
            # the map above runs before the cache, so drawn masks are cached
            ds = ds.cache(str(cache))
        # shuffle must be called after cache
        if shuffle is not None and shuffle > 0:
            ds = ds.shuffle(int(shuffle),
                            seed=seed,
                            reshuffle_each_iteration=True)
        ds = ds.batch(batch_size, drop_remainder)
        if prefetch is not None:
            ds = ds.prefetch(prefetch)
        return ds
Exemple #2
0
    def create_dataset(self,
                       batch_size=64,
                       drop_remainder=False,
                       shuffle=1000,
                       prefetch=tf.data.experimental.AUTOTUNE,
                       cache='',
                       parallel=tf.data.experimental.AUTOTUNE,
                       partition='train',
                       inc_labels=True,
                       seed=1) -> tf.data.Dataset:
        r""" Create tensorflow dataset for train, valid and test
        The images are divided by 255 and clipped to [1e-6, 1 - 1e-6]

        Arguments:
          batch_size: A tf.int64 scalar tf.Tensor, representing the number of
            consecutive elements of this dataset to combine in a single batch.
          drop_remainder: A tf.bool scalar tf.Tensor, representing whether the
            last batch should be dropped in the case it has fewer than
            batch_size elements; the default behavior is not to drop the
            smaller batch.
          shuffle: size of the shuffle buffer. The shuffling is applied after
            batching, hence, the buffer holds batches (not single examples).
            If `None` or smaller or equal 0, turn off shuffling
          prefetch:  A tf.int64 scalar tf.Tensor, representing the maximum
            number of elements that will be buffered when prefetching.
            If `None`, turn off prefetching
          cache: A tf.string scalar tf.Tensor, representing the name of a
            directory on the filesystem to use for caching elements in this
            Dataset. If a filename is not provided, the dataset will be cached
            in memory. If `None`, turn off caching
          parallel: A tf.int32 scalar tf.Tensor, representing the number
            elements to process asynchronously in parallel. If not specified,
            elements will be processed sequentially. If the value
            `tf.data.experimental.AUTOTUNE` is used, then the number of
            parallel calls is set dynamically based on available CPU.
          partition : {'train', 'valid', 'test'}
          inc_labels : a Boolean or Scalar. If True, return both image and
            label, otherwise, only image is returned.
            If a scalar is provided, it indicate the percent of labelled data
            in the mask.
          seed : seed of the random generator that draws the mask, and of the
            shuffling.

        Return :
          tensorflow.data.Dataset, its elements are one of:
            - `image` if `inc_labels` is 0 (or False)
            - `dict(inputs=(image, label), mask=mask)` if 0. < inc_labels < 1.
            - `(image, label)` otherwise
          with
            image - `(tf.float32, (None, 64, 64, 3))`
            label - `(tf.float32, (None, 6))`
            mask  - `(tf.bool, (None, 1))`
          where, `mask=1` mean labelled data, and `mask=0` for unlabelled data
        """
        # both images and labels, note: a tuple must be used here
        types = (tf.float32, tf.float32)
        shapes = (tf.TensorShape(self.images.shape[1:]),
                  tf.TensorShape(self.factors.shape[1:]))
        if not inc_labels:
            types = types[0]
            shapes = shapes[0]
        # True/False become 1.0/0.0, a scalar keeps its value
        inc_labels = float(inc_labels)
        gen = tf.random.experimental.Generator.from_seed(seed=seed)

        def gen_data(indices):
            r""" Yield the scaled image (and its factors) of each index """
            for i in indices:
                img = self.images[i]
                img = tf.cast(img, tf.float32)
                # scale to [0, 1] then keep the values strictly inside (0, 1)
                img = tf.clip_by_value(img / 255., 1e-6, 1. - 1e-6)
                if inc_labels:
                    yield img, tf.cast(self.factors[i], dtype=tf.float32)
                else:
                    yield img

        def process(*ims):
            r""" Attach the semi-supervised mask to a batch if needed, the
            images are already scaled in `gen_data` """
            if inc_labels:
                ims, lab = ims
                if 0. < inc_labels < 1.:  # semi-supervised mask
                    # one draw per example of the batch
                    mask = gen.uniform(shape=(tf.shape(ims)[0],
                                              1)) < inc_labels
                    return dict(inputs=(ims, lab), mask=mask)
                return ims, lab
            return ims[0]

        ### get the right partition
        indices = get_partition(
            partition,
            train=self.train_indices,
            valid=self.valid_indices,
            test=self.test_indices,
        )
        ds = tf.data.Dataset.from_generator(partial(gen_data, indices),
                                            output_types=types,
                                            output_shapes=shapes)
        ds = ds.batch(batch_size, drop_remainder).map(process, parallel)
        if cache is not None:
            ds = ds.cache(str(cache))
        # shuffle must be called after cache; seeded so that the order of the
        # batches is reproducible for a given `seed`
        if shuffle is not None and shuffle > 0:
            ds = ds.shuffle(int(shuffle),
                            seed=seed,
                            reshuffle_each_iteration=True)
        if prefetch is not None:
            ds = ds.prefetch(prefetch)
        return ds
Exemple #3
0
    def create_dataset(self,
                       batch_size=64,
                       drop_remainder=False,
                       shuffle=1000,
                       prefetch=tf.data.experimental.AUTOTUNE,
                       cache='',
                       parallel=tf.data.experimental.AUTOTUNE,
                       partition='train',
                       inc_labels=True,
                       seed=1) -> tf.data.Dataset:
        r""" Create a `tf.data.Dataset` from the binary files of one partition.

        Arguments:
          batch_size : number of consecutive elements combined in one batch.
          drop_remainder : if True, drop the last batch when it has fewer than
            `batch_size` elements.
          shuffle : base size of the shuffle buffer, it is multiplied by the
            number of image files of the partition. If `None` or smaller or
            equal 0, turn off shuffling.
          prefetch : buffer size given to `Dataset.prefetch`. If `None`, turn
            off prefetching.
          cache : converted with `str` and given to `Dataset.cache` (an empty
            string caches in memory). If `None`, turn off caching.
          parallel : number of images decoded and resized in parallel.
          partition : {'train', 'train_labelled', 'test', 'unlabeled',
            'unlabelled'}
            - 'train' : combination of both train and unlabelled
            - 'train_labelled' : only the train data
            NOTE(review): no 'valid' option is given to `get_partition` here,
            confirm how it handles that name.
          inc_labels : a Boolean or Scalar. If True, return both image and
            label, otherwise, only image is returned.
            If a scalar is provided, it indicate the percent of labelled data
            in the mask.
          seed : seed of the random generator that draws the mask, and of the
            shuffling.

        Return :
          tensorflow.data.Dataset, its elements are one of:
            - `image` if `inc_labels` is 0 (or False)
            - `dict(inputs=(image, label), mask=mask)` if 0. < inc_labels < 1.
              or if the partition contains unlabelled data
            - `(image, label)` otherwise
          with
            image - `tf.float32`, resized to `self.image_size` unless that
              size is 96
            label - `(tf.float32, (None, self.n_labels))`, one-hot, all zeros
              for unlabelled data
            mask  - `(tf.bool, (None, 1))`
          where, `mask=1` mean labelled data, and `mask=0` for unlabelled data
        """
        image_size = self.image_size
        # a size of 96 skips the resizing step
        if isinstance(image_size, Number) and image_size == 96:
            image_size = None
        ### select partition
        images_path, labels_path = get_partition(
            partition,
            train=((self.bin_files['train_X'], self.bin_files['unlabeled_X']),
                   self.bin_files['train_y']),
            train_labelled=(self.bin_files['train_X'],
                            self.bin_files['train_y']),
            test=(self.bin_files['test_X'], self.bin_files['test_y']),
            unlabeled=(self.bin_files['unlabeled_X'], None),
            unlabelled=(self.bin_files['unlabeled_X'], None),
        )

        # one uint8 array per image file (two files for 'train')
        X = [
            np.reshape(np.fromfile(path, dtype=np.uint8),
                       (-1, ) + STL10.IMAGE_SHAPE)
            for path in tf.nest.flatten(images_path)
        ]
        is_unlabelled = (labels_path is None)
        # True/False become 1.0/0.0, a scalar keeps its value
        inc_labels = float(inc_labels)
        gen = tf.random.experimental.Generator.from_seed(seed=seed)
        # load the labels, `y` stays empty when the labels are not requested
        y = []
        if inc_labels:
            if is_unlabelled:
                # all-zeros labels stand for "no label"
                y = [
                    np.zeros(shape=(X[0].shape[0], self.n_labels),
                             dtype=np.float32)
                ]
            else:
                # the stored labels start from 1, shift them to start from 0
                y = np.fromfile(labels_path, dtype=np.uint8) - 1
                y = [one_hot(y, self.n_labels).astype(np.float32)]
                if len(X) == 2:  # combined of both train and unlabelled set
                    y.append(
                        np.zeros(shape=(X[1].shape[0], self.n_labels),
                                 dtype=np.float32))
            assert len(y) == len(X)

        ### read and resize the data
        def resize(img):
            img = tf.cast(img, tf.float32)
            img = self.normalize_255(img)
            # reverse the order of the axes of the stored image
            img = tf.transpose(img, perm=(2, 1, 0))
            if image_size is not None:
                img = tf.image.resize(img, (image_size, image_size),
                                      preserve_aspect_ratio=True,
                                      antialias=False)
            return img

        def masking(image, label):
            # labelled only if the draw succeeds and the label is not all zeros
            mask = tf.logical_and(
                gen.uniform(shape=(1, )) < inc_labels,
                tf.reduce_sum(label) > 0.)
            return dict(inputs=(image, label), mask=mask)

        ### processing
        datasets = None
        # guarded by `inc_labels`: there is no label to inspect otherwise
        # (this line raised NameError on `y` when inc_labels was False)
        must_masking = bool(inc_labels) and any(
            np.all(i == 0.) for i in y)
        for x_i, y_i in zip(X, y if inc_labels else X):
            images = tf.data.Dataset.from_tensor_slices(x_i).map(
                resize, parallel)
            if inc_labels:
                labels = tf.data.Dataset.from_tensor_slices(y_i)
                images = tf.data.Dataset.zip((images, labels))
                if 0. < inc_labels < 1. or must_masking:  # semi-supervised mask
                    images = images.map(masking)
            datasets = images if datasets is None else datasets.concatenate(
                images)
        # cache data
        if cache is not None:
            datasets = datasets.cache(str(cache))
        # shuffle must be called after cache; seeded so that the order is
        # reproducible for a given `seed`
        if shuffle is not None and shuffle > 0:
            datasets = datasets.shuffle(int(shuffle) * len(X),
                                        seed=seed,
                                        reshuffle_each_iteration=True)
        datasets = datasets.batch(batch_size, drop_remainder)
        if prefetch is not None:
            datasets = datasets.prefetch(prefetch)
        # return
        return datasets
Exemple #4
0
    def create_dataset(self,
                       batch_size=64,
                       drop_remainder=False,
                       shuffle=1000,
                       prefetch=tf.data.experimental.AUTOTUNE,
                       cache='',
                       parallel=tf.data.experimental.AUTOTUNE,
                       partition='train',
                       inc_labels=True,
                       seed=1) -> tf.data.Dataset:
        r""" The default argument will downsize and crop the image to square
        size (64, 64)

        Arguments:
          batch_size : number of consecutive elements combined in one batch.
          drop_remainder : if True, drop the last batch when it has fewer than
            `batch_size` elements.
          shuffle : size of the shuffle buffer. If `None` or smaller or equal
            0, turn off shuffling.
          prefetch : buffer size given to `Dataset.prefetch`. If `None`, turn
            off prefetching.
          cache : converted with `str` and given to `Dataset.cache` (an empty
            string caches in memory). If `None`, turn off caching.
          parallel : number of images read and resized in parallel.
          partition : {'train', 'valid', 'test'}
          inc_labels : a Boolean or Scalar. If True, return both image and
            label, otherwise, only image is returned.
            If a scalar is provided, it indicate the percent of labelled data
            in the mask.
          seed : seed of the random generator that draws the mask, and of the
            shuffling.

        Return :
          tensorflow.data.Dataset, its elements are one of:
            - `image` if `inc_labels` is 0 (or False)
            - `dict(inputs=(image, label), mask=mask)` if 0. < inc_labels < 1.
            - `(image, label)` otherwise
          with
            image - `(tf.float32, (None, 64, 64, 3))`
            label - `(tf.float32, (None, 40))`
            mask  - `(tf.bool, (None, 1))`
          where, `mask=1` mean labelled data, and `mask=0` for unlabelled data
        """
        image_shape = self.original_shape
        image_size = self.image_size
        if image_size is not None:
            image_size = int(image_size)
            # height that keeps the aspect ratio once the width is image_size
            height = int(float(image_size) / image_shape[1] * image_shape[0])
            # offset_height, offset_width, target_height, target_width
            crop_offset = ((height - image_size) // 2, 0, image_size,
                           image_size)
        # True/False become 1.0/0.0, a scalar keeps its value
        inc_labels = float(inc_labels)
        gen = tf.random.experimental.Generator.from_seed(seed=seed)

        def read(path):
            img = tf.io.decode_jpeg(tf.io.read_file(path))
            img.set_shape(image_shape)
            img = tf.cast(img, tf.float32)
            img = self.normalize_255(img)
            if image_size is not None:
                img = tf.image.resize(img, (height, image_size),
                                      preserve_aspect_ratio=True,
                                      antialias=False)
                if self.square_image:
                    # keep the vertically centered square
                    img = tf.image.crop_to_bounding_box(img, *crop_offset)
            return img

        # named `masking` so the result does not shadow the function itself
        def masking(image, label):
            # one draw per example, labelled with probability `inc_labels`
            is_labelled = gen.uniform(shape=(1, )) < inc_labels
            return dict(inputs=(image, label), mask=is_labelled)

        ### select partition
        images, attrs = get_partition(
            partition,
            train=(self.train_files, self.train_attr),
            valid=(self.valid_files, self.valid_attr),
            test=(self.test_files, self.test_attr),
        )
        # convert [-1, 1] to [0., 1.]
        attrs = (attrs + 1.) / 2
        images = tf.data.Dataset.from_tensor_slices(images).map(read, parallel)
        if inc_labels:
            attrs = tf.data.Dataset.from_tensor_slices(attrs)
            images = tf.data.Dataset.zip((images, attrs))
            if 0. < inc_labels < 1.:  # semi-supervised mask
                images = images.map(masking)

        if cache is not None:
            images = images.cache(str(cache))
        # shuffle must be called after cache; seeded so that the order is
        # reproducible for a given `seed`
        if shuffle is not None and shuffle > 0:
            images = images.shuffle(int(shuffle),
                                    seed=seed,
                                    reshuffle_each_iteration=True)
        images = images.batch(batch_size, drop_remainder)
        if prefetch is not None:
            images = images.prefetch(prefetch)
        return images