Code example #1
 def create_dataset(self,
                    batch_size=64,
                    drop_remainder=False,
                    shuffle=1000,
                    prefetch=tf.data.experimental.AUTOTUNE,
                    cache='',
                    parallel=None,
                    partition='train',
                    inc_labels=False,
                    seed=1) -> tf.data.Dataset:
     x = get_partition(partition,
                       train=self.train,
                       valid=self.valid,
                       test=self.test)
     x = tf.SparseTensor(indices=sorted(zip(*x.nonzero())),
                         values=x.data,
                         dense_shape=x.shape)
     x = tf.data.Dataset.from_tensor_slices(x)
     if cache is not None:
         x = x.cache(str(cache))
     # shuffle must be called after cache
     if shuffle is not None and shuffle > 0:
         x = x.shuffle(int(shuffle))
     x = x.batch(batch_size, drop_remainder)
     x = x.map(lambda y: tf.cast(tf.sparse.to_dense(y), tf.float32))
     if prefetch is not None:
         x = x.prefetch(prefetch)
     return x
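
The core pattern in this example — turning a SciPy sparse matrix into a `tf.SparseTensor` via `sorted(zip(*x.nonzero()))` and slicing it into a `tf.data.Dataset` — can be tried in isolation. A minimal sketch with a synthetic matrix (names and sizes here are illustrative only):

import tensorflow as tf
from scipy import sparse

# synthetic stand-in for one data partition
m = sparse.random(8, 5, density=0.3, format='csr', random_state=0)

# tf.SparseTensor requires lexicographically sorted (row, col) indices
st = tf.SparseTensor(indices=sorted(zip(*m.nonzero())),
                     values=m.data,
                     dense_shape=m.shape)
ds = tf.data.Dataset.from_tensor_slices(st)
ds = ds.batch(4).map(lambda y: tf.cast(tf.sparse.to_dense(y), tf.float32))
for batch in ds:
    print(batch.shape)  # (4, 5) twice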
Code example #2
File: _base.py  Project: sycomix/odin-ai
 def transform(
         self,
         documents: Optional[Union[str, List[str]]] = None) -> spmatrix:
     r""" Vectorize the input documents """
     # cached transformed dataset
     if isinstance(documents, string_types) and \
       documents in ('train', 'valid', 'test'):
         attr_name = f'_x_{documents}'
         if hasattr(self, attr_name):
             return getattr(self, attr_name)
         x = self.transform(
             get_partition(documents,
                           train=self.train_text,
                           valid=self.valid_text,
                           test=self.test_text))
         setattr(self, attr_name, x)
         return x
     # other data
     if self.algorithm in ('tf', 'tfidf', 'count'):
         x = self.tokenizer.transform(documents)
        # sorted() ensures the right ordering for TensorFlow SparseTensor
     else:
         if isinstance(documents, Generator):
             documents = [i for i in documents]
         x = sparse.csr_matrix(
             [i.ids for i in self.encode(documents, post_process=True)])
     return x
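
For the partition shortcut above, the transformed matrix is memoized on the instance (e.g. `_x_train`), so repeated calls are free. A hedged usage sketch, where `vec` is a hypothetical fitted instance of this class:

x_train = vec.transform('train')              # computed once, stored as vec._x_train
assert vec.transform('train') is x_train      # second call returns the cached matrix
x_new = vec.transform(['some new document'])  # non-partition input goes the normal route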
Code example #3
 def create_dataset(self,
                    partition: Literal['train', 'valid', 'test'] = 'train',
                    *,
                    batch_size: Optional[int] = 32,
                    drop_remainder: bool = False,
                    shuffle: int = 1000,
                    cache: Optional[str] = '',
                    prefetch: Optional[int] = tf.data.experimental.AUTOTUNE,
                    parallel: Optional[int] = tf.data.experimental.AUTOTUNE,
                    label_percent: Union[bool, float] = False,
                    seed: int = 1) -> tf.data.Dataset:
     x = get_partition(partition,
                       train=self.train,
                       valid=self.valid,
                       test=self.test)
     x = tf.SparseTensor(indices=sorted(zip(*x.nonzero())),
                         values=x.data,
                         dense_shape=x.shape)
     x = tf.data.Dataset.from_tensor_slices(x)
     if cache is not None:
         x = x.cache(str(cache))
     # shuffle must be called after cache
     if shuffle is not None and shuffle > 0:
         x = x.shuffle(int(shuffle),
                       seed=seed,
                       reshuffle_each_iteration=True)
     if batch_size is not None:
         x = x.batch(batch_size, drop_remainder)
     x = x.map(lambda y: tf.cast(tf.sparse.to_dense(y), tf.float32))
     if prefetch is not None:
         x = x.prefetch(prefetch)
     return x
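
Compared to example #1, this variant makes everything after `partition` keyword-only and allows `batch_size=None`, in which case batching is skipped and each element is a single dense row. A hypothetical usage sketch, assuming `data` is an instance of this class:

train_ds = data.create_dataset('train', batch_size=32, shuffle=1000, seed=1)
valid_ds = data.create_dataset('valid', batch_size=None, shuffle=0)  # unbatched rows
for row in valid_ds.take(2):
    print(row.shape)  # (n_features,)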
Code example #4
File: _base.py  Project: Daisey666/odin-ai
    def create_dataset(self,
                       batch_size: int = 64,
                       drop_remainder: bool = False,
                       shuffle: Optional[int] = 1000,
                       prefetch: int = tf.data.experimental.AUTOTUNE,
                       cache: str = '',
                       parallel: Optional[int] = None,
                       partition: str = 'train',
                       inc_labels: bool = False,
                       seed: int = 1) -> tf.data.Dataset:
        for attr in ('x', 'y', 'xvar', 'yvar'):
            assert hasattr(self, attr)
            assert getattr(self, attr) is not None
        # split train, valid, test data
        if not hasattr(self, 'train_ids') or self.train_ids is None:
            # the split always uses seed=1, so the partitions stay identical across calls
            rand = np.random.RandomState(seed=1)
            n = self.x.shape[0]
            ids = rand.permutation(n)
            self.train_ids = ids[:int(0.85 * n)]
            self.valid_ids = ids[int(0.85 * n):int(0.9 * n)]
            self.test_ids = ids[int(0.9 * n):]
        ids = get_partition(partition,
                            train=self.train_ids,
                            valid=self.valid_ids,
                            test=self.test_ids)
        is_sparse_x = isinstance(self.x, sparse.spmatrix)
        is_sparse_y = isinstance(self.y, sparse.spmatrix)
        x = _tensor(self.x[ids])
        y = _tensor(self.y[ids])
        gen = tf.random.experimental.Generator.from_seed(seed=seed)

        def _process(*data):
            data = list(data)
            if is_sparse_x:
                data[0] = tf.sparse.to_dense(data[0])
            if is_sparse_y and len(data) > 1:
                data[1] = tf.sparse.to_dense(data[1])
            data = tuple(data)
            if inc_labels:
                if 0. < inc_labels < 1.:  # semi-supervised mask
                    mask = gen.uniform(shape=(1, )) < inc_labels
                    return dict(inputs=data, mask=mask)
            return data[0] if len(data) == 1 else data

        ds = x
        if inc_labels > 0.:
            ds = tf.data.Dataset.zip((x, y))
        ds = ds.map(_process, parallel)
        if cache is not None:
            ds = ds.cache(str(cache))
        # shuffle must be called after cache
        if shuffle is not None and shuffle > 0:
            ds = ds.shuffle(int(shuffle),
                            seed=seed,
                            reshuffle_each_iteration=True)
        ds = ds.batch(batch_size, drop_remainder)
        if prefetch is not None:
            ds = ds.prefetch(prefetch)
        return ds
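
When `0. < inc_labels < 1.`, each element is a dict with `inputs=(x, y)` and a random `mask` drawn per example before batching. A hedged usage sketch, assuming `data` is an instance of this class:

ds = data.create_dataset(batch_size=64, inc_labels=0.1, partition='train')
for batch in ds.take(1):
    (x, y), mask = batch['inputs'], batch['mask']
    print(x.shape, y.shape, mask.shape)  # e.g. (64, n_x), (64, n_y), (64, 1)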
Code example #5
File: _base.py  Project: trungnt13/odin-ai
 def _build_stratified_map(self, partition) -> Dict[int, List[int]]:
     name = f'_{self.name}_{partition}'
     path = os.path.join(get_cache_path(), name)
     if not os.path.exists(path):
         ds = get_partition(partition,
                            train=self.train,
                            valid=self.valid,
                            test=self.test)
         y_map = defaultdict(list)
         for i, (_, y) in enumerate(ds):
              key = np.argmax(y) if y.shape.ndims > 0 else y.numpy()
              y_map[key].append(i)
         with open(path, 'wb') as f:
             pickle.dump(y_map, f)
         setattr(self, name, y_map)
     if not hasattr(self, name):
         with open(path, 'rb') as f:
             setattr(self, name, pickle.load(f))
     return getattr(self, name)
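
The resulting map is `label -> list of example indices`, which is what example #8 later uses for stratified sampling. A small standalone sketch of drawing a class-balanced subset from such a map:

import numpy as np

def stratified_indices(y_map, n_per_class, seed=1):
    # draw n_per_class indices without replacement from every class bucket
    rand = np.random.RandomState(seed)
    return np.concatenate([
        rand.choice(ids, size=n_per_class, replace=False)
        for ids in y_map.values()
    ])

y_map = {0: [0, 3, 5, 9], 1: [1, 2, 4, 6, 7, 8]}  # toy label -> indices map
print(stratified_indices(y_map, n_per_class=2))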
Code example #6
  def create_dataset(self,
                     batch_size=64,
                     drop_remainder=False,
                     shuffle=1000,
                     prefetch=tf.data.experimental.AUTOTUNE,
                     cache='',
                     parallel=None,
                     partition='train',
                     inc_labels=False,
                     seed=1) -> tf.data.Dataset:
    r"""
    Arguments:
      partition : {'train', 'valid', 'test'}
      inc_labels : a Boolean or Scalar. If True, return both image and label,
        otherwise, only the image is returned.
        If a scalar is provided, it indicates the percentage of labelled data
        in the mask.

    Return :
      tensorflow.data.Dataset :
        image - `(tf.float32, (None, 28, 28, 1))`
        label - `(tf.float32, (None, 10))`
        mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
      where `mask=1` means labelled data and `mask=0` means unlabelled data
    """
    ds = get_partition(partition,
                       train=self.train,
                       valid=self.valid,
                       test=self.test)
    struct = tf.data.experimental.get_structure(ds)
    if len(struct) == 1:
      inc_labels = False
    ids = tf.range(self.n_labels, dtype=tf.float32)
    inc_labels = float(inc_labels)
    gen = tf.random.experimental.Generator.from_seed(seed=seed)

    def _process_dict(data):
      image = tf.cast(data['image'], tf.float32)
      if not self.is_binary:
        image = self.normalize_255(image)
      if inc_labels:
        label = tf.cast(data['label'], tf.float32)
        if len(label.shape) == 0:  # convert to one-hot
          label = tf.cast(ids == label, tf.float32)
        if 0. < inc_labels < 1.:  # semi-supervised mask
          mask = gen.uniform(shape=(1,)) < inc_labels
          return dict(inputs=(image, label), mask=mask)
        return image, label
      return image

    def _process_tuple(*data):
      image = tf.cast(data[0], tf.float32)
      if not self.is_binary:
        image = self.normalize_255(image)
      if inc_labels:
        label = tf.cast(data[1], tf.float32)
        if len(label.shape) == 0:  # convert to one-hot
          label = tf.cast(ids == label, tf.float32)
        if 0. < inc_labels < 1.:  # semi-supervised mask
          mask = gen.uniform(shape=(1,)) < inc_labels
          return dict(inputs=(image, label), mask=mask)
        return image, label
      return image

    ds = ds.map(_process_dict if isinstance(struct, dict) else _process_tuple,
                parallel)
    if cache is not None:
      ds = ds.cache(str(cache))
    # shuffle must be called after cache
    if shuffle is not None and shuffle > 0:
      ds = ds.shuffle(int(shuffle))
    ds = ds.batch(batch_size, drop_remainder)
    if prefetch is not None:
      ds = ds.prefetch(prefetch)
    return ds
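
The one-hot conversion here relies on broadcasting a scalar label against `tf.range(n_labels)`; a minimal standalone check of that pattern (values are illustrative):

import tensorflow as tf

n_labels = 10
ids = tf.range(n_labels, dtype=tf.float32)
label = tf.constant(3.0)
one_hot = tf.cast(ids == label, tf.float32)  # elementwise equality in TF2
print(one_hot.numpy())  # [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]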
Code example #7
File: _base.py  Project: sycomix/odin-ai
    def create_dataset(self,
                       batch_size=64,
                       drop_remainder=False,
                       shuffle=1000,
                       prefetch=tf.data.experimental.AUTOTUNE,
                       cache='',
                       parallel=None,
                       partition='train',
                       inc_labels=False,
                       seed=1) -> tf.data.Dataset:
        r"""
    Arguments:
      partition : {'train', 'valid', 'test'}
      inc_labels : a Boolean or Scalar. If True, return both image and label,
        otherwise, only the image is returned.
        If a scalar is provided, it indicates the percentage of labelled data
        in the mask.

    Return :
      tensorflow.data.Dataset :
        image - `(tf.float32, (None, 64, 64, 1))`
        label - `(tf.float32, (None, 5))`
        mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
      where `mask=1` means labelled data and `mask=0` means unlabelled data
    """
        inc_labels = float(inc_labels)
        gen = tf.random.experimental.Generator.from_seed(seed=seed)
        x = self.transform(partition)
        y = get_partition(partition,
                          train=self.train_labels,
                          valid=self.valid_labels,
                          test=self.test_labels)
        # remove empty docs
        indices = np.array(np.sum(x, axis=-1) > 0).ravel()
        x = x[indices]
        if len(y) > 0:
            y = y[indices]
        # convert to one-hot
        if inc_labels > 0 and len(y) > 0 and y.ndim == 1:
            y = one_hot(y, self.n_labels)

        def _process(*data):
            data = tuple([
                tf.cast(
                    tf.sparse.to_dense(i)
                    if isinstance(i, tf.SparseTensor) else i, tf.float32)
                for i in data
            ])
            if inc_labels:
                if 0. < inc_labels < 1.:  # semi-supervised mask
                    mask = gen.uniform(shape=(1, )) < inc_labels
                    return dict(inputs=tuple(data), mask=mask)
                return data
            return data[0]

        # prepare the sparse matrices
        if isinstance(x, spmatrix):
            x = tf.SparseTensor(indices=sorted(zip(*x.nonzero())),
                                values=x.data,
                                dense_shape=x.shape)
        ds = tf.data.Dataset.from_tensor_slices(x)
        if inc_labels > 0:
            if isinstance(y, spmatrix):
                y = tf.SparseTensor(indices=sorted(zip(*y.nonzero())),
                                    values=y.data,
                                    dense_shape=y.shape)
            y = tf.data.Dataset.from_tensor_slices(y)
            ds = tf.data.Dataset.zip((ds, y))
        # configure the dataset
        ds = ds.map(_process, parallel)
        if cache is not None:
            ds = ds.cache(str(cache))
        # shuffle must be called after cache
        if shuffle is not None and shuffle > 0:
            ds = ds.shuffle(int(shuffle))
        ds = ds.batch(batch_size, drop_remainder)
        if prefetch is not None:
            ds = ds.prefetch(prefetch)
        return ds
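
The empty-document filter above works because summing a SciPy sparse matrix along the last axis yields an `(n, 1)` matrix of row totals; a minimal standalone sketch of that step:

import numpy as np
from scipy import sparse

x = sparse.csr_matrix(np.array([[1, 0, 2],
                                [0, 0, 0],    # empty document
                                [0, 3, 0]]))
keep = np.array(np.sum(x, axis=-1) > 0).ravel()  # boolean row mask
print(keep)               # [ True False  True]
print(x[keep].toarray())  # rows with at least one non-zero term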
Code example #8
File: _base.py  Project: trungnt13/odin-ai
    def create_dataset(
        self,
        partition: Partition = 'train',
        *,
        batch_size: Optional[int] = 64,
        drop_remainder: bool = False,
        shuffle: int = 1000,
        cache: Optional[str] = '',
        prefetch: Optional[int] = tf.data.AUTOTUNE,
        parallel: Optional[int] = tf.data.AUTOTUNE,
        label_percent: Union[bool, int, float] = 0.0,
        oversample_ratio: Union[bool, float] = 0.0,
        fixed_oversample: bool = False,
        normalize: Literal['probs', 'tanh', 'raster'] = 'probs',
        seed: int = 1,
    ) -> tf.data.Dataset:
        """Create `tensorflow.data.Dataset` for the loaded dataset

    Parameters
    ----------
    partition : {'train', 'valid', 'test'}
        which partition of the data to load, by default 'train'
    batch_size : Optional[int], optional
        batch size, or `None` to skip batching, by default 64
    drop_remainder : bool, optional
        drop the last incomplete batch, by default False
    shuffle : int, optional
        shuffle buffer size, `0` or `None` disables shuffling, by default 1000
    cache : Optional[str], optional
        cache filename, `''` caches in memory, `None` disables caching, by default ''
    prefetch : Optional[int], optional
        prefetch buffer size, `None` disables prefetching, by default tf.data.AUTOTUNE
    parallel : Optional[int], optional
        number of parallel calls for mapping, by default tf.data.AUTOTUNE
    label_percent : Union[bool, int, float], optional
        If `1.0` or `True`, return both image and label, i.e. supervised task.
        If `0.0` or `False`, only the image is returned, i.e. unsupervised task.
        If a scalar in `(0, 1)`, it indicates the percentage of labelled data,
        i.e. semi-supervised task.
        If an integer `> 1`, the exact number of labelled samples, by default `0.0`
    oversample_ratio : Union[bool, float], optional
        a float in the range `[0, 1]` indicating the ratio between
        unlabelled and labelled data in each minibatch.
        If `0` or `False`, use the default label-unlabel ratio.
        If `1` or `True`, repeat all the labelled data in every minibatch.
        Otherwise, the number is the percentage of labelled data in each minibatch,
        by default 0.0.
    fixed_oversample : bool
        if `True`, the number of labelled samples stays the same in each
        minibatch after oversampling, by default `False`
    normalize : Literal['probs', 'tanh', 'raster']
        image normalizing method, by default 'probs'
    seed : int, optional
        random seed for shuffling and sampling, by default 1

    Note
    ----
    `fixed_oversample` triggers caching of the whole supervised dataset in memory,
    which could lead to OOM on the GPU.

    Returns
    -------
    If `0. < label_percent < 1.`, return a dictionary
      image - `(tf.float32, (None, 28, 28, 1))`
      label - `(tf.float32, (None, 10))`
      mask  - `(tf.bool, (None, 1))`
    if `label_percent = 0`, return a single image
    if `label_percent = 1`, return a tuple of (image, label)
    """
        ds = get_partition(partition,
                           train=self.train,
                           valid=self.valid,
                           test=self.test)
        ######## check labels available
        struct = as_tuple(tf.data.experimental.get_structure(ds))
        has_labels = False
        if len(struct) == 1:
            label_percent = 0.0
        else:
            has_labels = True
        label_percent = float(label_percent)
        assert 0. <= oversample_ratio <= 1., \
          f'oversample_ratio must be in [0, 1] given: {oversample_ratio}'
        # which task
        task = 'unsupervised'
        if label_percent == 1.0:
            task = 'supervised'
        elif 0. < label_percent < 1. or label_percent > 1.:
            task = 'semi'
        ######## prepare the labeled data
        rand = np.random.RandomState(seed=seed)
        length = tf.data.experimental.cardinality(ds).numpy()
        x_labeled, y_labeled, mask_labeled, ds_supervised = [], [], None, None
        if task == 'semi':
            n_labeled = int(label_percent * length \
                              if 0. < label_percent < 1. else int(label_percent))
            n_unlabeled = length - n_labeled
            n_per_classes = int(n_labeled / len(self.labels))
            # for binary labels we could do stratified sampling
            if self.label_type == 'categorical':
                y_map = self._build_stratified_map(partition)
                labeled_ids = np.stack([
                    rand.choice(v, size=n_per_classes, replace=False)
                    for k, v in y_map.items()
                ])
                is_labeled = np.full((length,), False, dtype=bool)
                is_labeled[labeled_ids] = True
            # just pseudo-random sampling
            else:
                is_labeled = np.array([True] * n_labeled + [False] *
                                      (length - n_labeled))
                rand.shuffle(is_labeled)
            # add labeling flag to the dataset
            ds = tf.data.Dataset.zip(
                (tf.data.Dataset.from_tensor_slices(is_labeled), ds))
            # repeat the label data in every minibatch
            if oversample_ratio == 1.0:
                x_labeled, y_labeled = _extract_labeled_examples(
                    ds,
                    n_labeled=n_labeled,
                    normalize_method=partial(self.normalize,
                                             normalize=normalize))
                if y_labeled.shape.ndims == 1:
                    y_labeled = tf.one_hot(y_labeled, len(self.labels))
                mask_labeled = tf.cast(tf.ones([x_labeled.shape[0]]), tf.bool)
                ds = ds.filter(lambda i, x: tf.logical_not(i))
            # mixing the label into minibatch
            elif oversample_ratio > 0.0:
                ds_unsupervised = ds.filter(lambda i, x: tf.logical_not(i))
                # only sampling if not fixed amount of labels per minibatch
                if not fixed_oversample:
                    ds_supervised = ds.filter(lambda i, x: i)
                    ds_supervised = ds_supervised.repeat(
                        int(np.ceil(n_unlabeled / n_labeled)))
                    ds = tf.data.experimental.sample_from_datasets(
                        [ds_unsupervised, ds_supervised],
                        weights=[1. - oversample_ratio, oversample_ratio],
                        seed=seed)
                # cache the labeled data for fixed amount of labels per minibatch
                else:
                    # for some reason sample_from_datasets slows down significantly
                    # if we sample from a single dataset that was split into two by
                    # filtering, with one of the halves repeated
                    # (e.g. 7000 samples/s dropped to 1000 samples/s)
                    x_labeled, y_labeled = _extract_labeled_examples(
                        ds, n_labeled=n_labeled, normalize_method=None)
                    mask_labeled = tf.cast(tf.ones([x_labeled.shape[0]]),
                                           tf.bool)
                    ds_supervised = tf.data.Dataset.from_tensor_slices(
                        (mask_labeled, (x_labeled, y_labeled)))
                    n_repeat = int(
                        np.ceil(oversample_ratio * n_unlabeled /
                                (1 - oversample_ratio) / n_labeled))
                    ds_supervised = ds_supervised.shuffle(
                        min(n_labeled, 1000),
                        seed=seed,
                        reshuffle_each_iteration=True,
                    ).repeat(n_repeat)
                    ds = ds_unsupervised
            # default ratio
            else:
                fixed_oversample = False
        ######## other cases
        elif task == 'unsupervised':
            ds = ds.map(lambda *x: (False, x))
        elif task == 'supervised':
            ds = ds.map(lambda *x: (True, x))
        else:
            raise ValueError(f'Unknown task type "{task}".')

        def _process(mask, data):
            images = tf.cast(data[0], tf.float32)
            # normalize the image
            images = self.normalize(images, normalize)
            if has_labels:
                labels = data[1]
                # convert to one-hot
                if len(labels.shape) == 1:
                    labels = tf.one_hot(labels, len(self.labels))
            # unsupervised task
            if task == 'unsupervised':
                return images
            # supervised task
            elif task == 'supervised':
                return images, labels
            # semi-supervised task
            if oversample_ratio == 1.0:
                return images, x_labeled, y_labeled
            X_sup = tf.boolean_mask(images, mask, 0)
            y_sup = tf.boolean_mask(labels, mask, 0)
            X_uns = tf.boolean_mask(images, tf.logical_not(mask), 0)
            return X_uns, X_sup, y_sup

        # shuffle must be called after cache
        if shuffle is not None and shuffle > 0:
            ds = ds.shuffle(buffer_size=int(shuffle),
                            seed=seed,
                            reshuffle_each_iteration=True)
        # for mixing unsupervised and supervised data
        if task == 'semi' and fixed_oversample:
            if shuffle is not None and shuffle > 0:
                ds_supervised = ds_supervised.shuffle(
                    buffer_size=int(shuffle),
                    seed=seed,
                    reshuffle_each_iteration=True)
            if batch_size is not None and batch_size > 0:
                n_sup = int(np.ceil(batch_size * oversample_ratio))
                batch_size = batch_size - n_sup
                ds_supervised = ds_supervised.batch(
                    n_sup, drop_remainder=drop_remainder)
                ds = ds.batch(batch_size, drop_remainder=drop_remainder)
            ds = tf.data.Dataset.zip((ds, ds_supervised))

            def merge_semi(uns, sup):
                m_uns, (x_uns, y_uns) = uns
                m_sup, (x_sup, y_sup) = sup
                return (tf.concat([m_uns, m_sup], 0),
                        (tf.concat([x_uns, x_sup], 0),
                         tf.concat([y_uns, y_sup], 0)))

            ds = ds.map(merge_semi)
        # process as normal
        elif batch_size is not None and batch_size > 0:
            ds = ds.batch(batch_size, drop_remainder=drop_remainder)
        # map cache and prefetch
        ds = ds.map(_process, num_parallel_calls=parallel)
        if cache is not None:
            ds = ds.cache(filename=str(cache))
        if prefetch is not None:
            ds = ds.prefetch(buffer_size=prefetch)
        ds: tf.data.Dataset
        return ds
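
A hedged usage sketch of the semi-supervised path (assuming `data` is a loaded instance of this class): with `0 < label_percent < 1` and `0 < oversample_ratio < 1`, each batch comes back as the tuple `(X_uns, X_sup, y_sup)` produced by `_process`:

ds = data.create_dataset('train',
                         batch_size=64,
                         label_percent=0.1,     # 10% of examples keep their labels
                         oversample_ratio=0.5,  # half of every minibatch is labelled
                         fixed_oversample=True,
                         seed=1)
for X_uns, X_sup, y_sup in ds.take(1):
    print(X_uns.shape, X_sup.shape, y_sup.shape)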