def create_dataset(self,
                   batch_size=64,
                   drop_remainder=False,
                   shuffle=1000,
                   prefetch=tf.data.experimental.AUTOTUNE,
                   cache='',
                   parallel=None,
                   partition='train',
                   inc_labels=False,
                   seed=1) -> tf.data.Dataset:
  x = get_partition(partition, train=self.train, valid=self.valid, test=self.test)
  x = tf.SparseTensor(indices=sorted(zip(*x.nonzero())),
                      values=x.data,
                      dense_shape=x.shape)
  x = tf.data.Dataset.from_tensor_slices(x)
  if cache is not None:
    x = x.cache(str(cache))
  # shuffle must be called after cache
  if shuffle is not None and shuffle > 0:
    x = x.shuffle(int(shuffle))
  x = x.batch(batch_size, drop_remainder)
  x = x.map(lambda y: tf.cast(tf.sparse.to_dense(y), tf.float32), parallel)
  if prefetch is not None:
    x = x.prefetch(prefetch)
  return x
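# Illustrative sketch (not part of the library): the conversion above relies
# on `sorted(zip(*x.nonzero()))` producing row-major ordered indices, which
# `tf.SparseTensor` requires before the tensor can be sliced row-by-row.
# A minimal standalone reproduction, assuming only scipy and tensorflow:
import tensorflow as tf
from scipy import sparse

mat = sparse.random(8, 5, density=0.3, format='csr', random_state=0)
# note: for a CSR matrix, `mat.data` is already in row-major order, so it
# stays aligned with the sorted indices
st = tf.SparseTensor(indices=sorted(zip(*mat.nonzero())),
                     values=mat.data,
                     dense_shape=mat.shape)
ds = tf.data.Dataset.from_tensor_slices(st)
for row in ds.take(2):  # each element is a sparse row of shape (5,)
  print(tf.sparse.to_dense(row).numpy())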
def transform(self,
              documents: Optional[Union[str, List[str]]] = None) -> spmatrix:
  r""" Vectorize the input documents """
  # cached transformed dataset
  if isinstance(documents, string_types) and \
      documents in ('train', 'valid', 'test'):
    attr_name = f'_x_{documents}'
    if hasattr(self, attr_name):
      return getattr(self, attr_name)
    x = self.transform(
        get_partition(documents,
                      train=self.train_text,
                      valid=self.valid_text,
                      test=self.test_text))
    setattr(self, attr_name, x)
    return x
  # other data
  if self.algorithm in ('tf', 'tfidf', 'count'):
    x = self.tokenizer.transform(documents)
  else:
    if isinstance(documents, Generator):
      documents = list(documents)
    x = sparse.csr_matrix(
        [i.ids for i in self.encode(documents, post_process=True)])
  return x
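# Usage sketch (hypothetical `nlp` object, not part of the library): passing
# a partition name pays the vectorization cost once and caches the result on
# a `_x_<partition>` attribute; raw documents bypass the cache entirely:
x_train = nlp.transform('train')  # computed, then cached as `nlp._x_train`
x_train = nlp.transform('train')  # served from the cached attribute
x_new = nlp.transform(['a brand new document'])  # never cached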
def create_dataset(self,
                   partition: Literal['train', 'valid', 'test'] = 'train',
                   *,
                   batch_size: Optional[int] = 32,
                   drop_remainder: bool = False,
                   shuffle: int = 1000,
                   cache: Optional[str] = '',
                   prefetch: Optional[int] = tf.data.experimental.AUTOTUNE,
                   parallel: Optional[int] = tf.data.experimental.AUTOTUNE,
                   label_percent: Union[bool, float] = False,
                   seed: int = 1) -> tf.data.Dataset:
  x = get_partition(partition, train=self.train, valid=self.valid, test=self.test)
  x = tf.SparseTensor(indices=sorted(zip(*x.nonzero())),
                      values=x.data,
                      dense_shape=x.shape)
  x = tf.data.Dataset.from_tensor_slices(x)
  if cache is not None:
    x = x.cache(str(cache))
  # shuffle must be called after cache
  if shuffle is not None and shuffle > 0:
    x = x.shuffle(int(shuffle), seed=seed, reshuffle_each_iteration=True)
  if batch_size is not None:
    x = x.batch(batch_size, drop_remainder)
  x = x.map(lambda y: tf.cast(tf.sparse.to_dense(y), tf.float32), parallel)
  if prefetch is not None:
    x = x.prefetch(prefetch)
  return x
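# Illustrative sketch (not part of the library): batching the sparse rows
# *before* `tf.sparse.to_dense` (as done above) densifies one minibatch at a
# time, so the full matrix never has to be materialized in memory.
# Standalone, assuming only scipy and tensorflow:
import tensorflow as tf
from scipy import sparse

mat = sparse.random(1000, 50, density=0.05, format='csr', random_state=1)
st = tf.SparseTensor(indices=sorted(zip(*mat.nonzero())),
                     values=mat.data,
                     dense_shape=mat.shape)
ds = (tf.data.Dataset.from_tensor_slices(st)
      .batch(32)
      .map(lambda y: tf.cast(tf.sparse.to_dense(y), tf.float32)))
for batch in ds.take(1):
  print(batch.shape)  # (32, 50)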
def create_dataset(self,
                   batch_size: int = 64,
                   drop_remainder: bool = False,
                   shuffle: Optional[int] = 1000,
                   prefetch: int = tf.data.experimental.AUTOTUNE,
                   cache: str = '',
                   parallel: Optional[int] = None,
                   partition: str = 'train',
                   inc_labels: Union[bool, float] = False,
                   seed: int = 1) -> tf.data.Dataset:
  for attr in ('x', 'y', 'xvar', 'yvar'):
    assert hasattr(self, attr)
    assert getattr(self, attr) is not None
  # split train, valid, test data (the split uses a fixed seed, so the
  # partitions are deterministic and independent of `seed`)
  if not hasattr(self, 'train_ids') or self.train_ids is None:
    rand = np.random.RandomState(seed=1)
    n = self.x.shape[0]
    ids = rand.permutation(n)
    self.train_ids = ids[:int(0.85 * n)]
    self.valid_ids = ids[int(0.85 * n):int(0.9 * n)]
    self.test_ids = ids[int(0.9 * n):]
  ids = get_partition(partition,
                      train=self.train_ids,
                      valid=self.valid_ids,
                      test=self.test_ids)
  is_sparse_x = isinstance(self.x, sparse.spmatrix)
  is_sparse_y = isinstance(self.y, sparse.spmatrix)
  x = _tensor(self.x[ids])
  y = _tensor(self.y[ids])
  gen = tf.random.experimental.Generator.from_seed(seed=seed)

  def _process(*data):
    data = list(data)
    if is_sparse_x:
      data[0] = tf.sparse.to_dense(data[0])
    if is_sparse_y and len(data) > 1:
      data[1] = tf.sparse.to_dense(data[1])
    data = tuple(data)
    if inc_labels:
      if 0. < inc_labels < 1.:  # semi-supervised mask
        mask = gen.uniform(shape=(1,)) < inc_labels
        return dict(inputs=data, mask=mask)
    return data[0] if len(data) == 1 else data

  ds = x
  if inc_labels > 0.:
    ds = tf.data.Dataset.zip((x, y))
  ds = ds.map(_process, parallel)
  if cache is not None:
    ds = ds.cache(str(cache))
  # shuffle must be called after cache
  if shuffle is not None and shuffle > 0:
    ds = ds.shuffle(int(shuffle), seed=seed, reshuffle_each_iteration=True)
  ds = ds.batch(batch_size, drop_remainder)
  if prefetch is not None:
    ds = ds.prefetch(prefetch)
  return ds
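# Usage sketch (hypothetical `data` object, not part of the library): the
# split above is a fixed 85/5/10 permutation, so every call sees the same
# partitions. With a fractional `inc_labels`, each element is a dictionary
# carrying a per-example supervision mask (batched to shape (batch, 1)):
train = data.create_dataset(partition='train', inc_labels=0.1, batch_size=64)
for batch in train.take(1):
  x, y = batch['inputs']
  mask = batch['mask']  # roughly 10% of the rows are True (labelled)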
def _build_stratified_map(self, partition) -> Dict[int, List[int]]:
  name = f'_{self.name}_{partition}'
  path = os.path.join(get_cache_path(), name)
  if not os.path.exists(path):
    ds = get_partition(partition, train=self.train, valid=self.valid, test=self.test)
    y_map = defaultdict(list)
    for i, (_, y) in enumerate(ds):
      y_map[np.argmax(y) if y.shape.ndims > 0 else y.numpy()].append(i)
    with open(path, 'wb') as f:
      pickle.dump(y_map, f)
    setattr(self, name, y_map)
  if not hasattr(self, name):
    with open(path, 'rb') as f:
      setattr(self, name, pickle.load(f))
  return getattr(self, name)
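# Usage sketch (hypothetical `data` object, not part of the library): the
# returned mapping lets callers draw a class-balanced subset of example
# indices, e.g. 10 labelled samples per class:
import numpy as np

y_map = data._build_stratified_map('train')  # {class_index: [example ids]}
rand = np.random.RandomState(seed=1)
labeled_ids = np.concatenate([
    rand.choice(ids, size=10, replace=False) for ids in y_map.values()
])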
def create_dataset(self,
                   batch_size=64,
                   drop_remainder=False,
                   shuffle=1000,
                   prefetch=tf.data.experimental.AUTOTUNE,
                   cache='',
                   parallel=None,
                   partition='train',
                   inc_labels=False,
                   seed=1) -> tf.data.Dataset:
  r"""
  Arguments:
    partition : {'train', 'valid', 'test'}
    inc_labels : a Boolean or Scalar. If True, return both image and label,
      otherwise, only the image is returned.
      If a scalar is provided, it indicates the percent of labelled data
      in the mask.

  Return :
    tensorflow.data.Dataset :
      image - `(tf.float32, (None, 28, 28, 1))`
      label - `(tf.float32, (None, 10))`
      mask  - `(tf.bool, (None, 1))` if `0. < inc_labels < 1.`
    where `mask=1` means labelled data and `mask=0` means unlabelled data
  """
  ds = get_partition(partition, train=self.train, valid=self.valid, test=self.test)
  struct = tf.data.experimental.get_structure(ds)
  if len(struct) == 1:
    inc_labels = False
  ids = tf.range(self.n_labels, dtype=tf.float32)
  inc_labels = float(inc_labels)
  gen = tf.random.experimental.Generator.from_seed(seed=seed)

  def _process_dict(data):
    image = tf.cast(data['image'], tf.float32)
    if not self.is_binary:
      image = self.normalize_255(image)
    if inc_labels:
      label = tf.cast(data['label'], tf.float32)
      if len(label.shape) == 0:  # convert to one-hot
        label = tf.cast(ids == label, tf.float32)
      if 0. < inc_labels < 1.:  # semi-supervised mask
        mask = gen.uniform(shape=(1,)) < inc_labels
        return dict(inputs=(image, label), mask=mask)
      return image, label
    return image

  def _process_tuple(*data):
    image = tf.cast(data[0], tf.float32)
    if not self.is_binary:
      image = self.normalize_255(image)
    if inc_labels:
      label = tf.cast(data[1], tf.float32)
      if len(label.shape) == 0:  # convert to one-hot
        label = tf.cast(ids == label, tf.float32)
      if 0. < inc_labels < 1.:  # semi-supervised mask
        mask = gen.uniform(shape=(1,)) < inc_labels
        return dict(inputs=(image, label), mask=mask)
      return image, label
    return image

  ds = ds.map(_process_dict if isinstance(struct, dict) else _process_tuple,
              parallel)
  if cache is not None:
    ds = ds.cache(str(cache))
  # shuffle must be called after cache
  if shuffle is not None and shuffle > 0:
    ds = ds.shuffle(int(shuffle))
  ds = ds.batch(batch_size, drop_remainder)
  if prefetch is not None:
    ds = ds.prefetch(prefetch)
  return ds
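# Illustrative sketch (not part of the library): the one-hot conversion above
# broadcasts a scalar label against `ids = tf.range(n_labels)`; the
# elementwise equality yields a boolean vector with a single True at the
# label position:
import tensorflow as tf

ids = tf.range(10, dtype=tf.float32)
label = tf.constant(3.0)
onehot = tf.cast(ids == label, tf.float32)
print(onehot.numpy())  # [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]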
def create_dataset(self,
                   batch_size=64,
                   drop_remainder=False,
                   shuffle=1000,
                   prefetch=tf.data.experimental.AUTOTUNE,
                   cache='',
                   parallel=None,
                   partition='train',
                   inc_labels=False,
                   seed=1) -> tf.data.Dataset:
  r"""
  Arguments:
    partition : {'train', 'valid', 'test'}
    inc_labels : a Boolean or Scalar. If True, return both image and label,
      otherwise, only the image is returned.
      If a scalar is provided, it indicates the percent of labelled data
      in the mask.

  Return :
    tensorflow.data.Dataset :
      image - `(tf.float32, (None, 64, 64, 1))`
      label - `(tf.float32, (None, 5))`
      mask  - `(tf.bool, (None, 1))` if `0. < inc_labels < 1.`
    where `mask=1` means labelled data and `mask=0` means unlabelled data
  """
  inc_labels = float(inc_labels)
  gen = tf.random.experimental.Generator.from_seed(seed=seed)
  x = self.transform(partition)
  y = get_partition(partition,
                    train=self.train_labels,
                    valid=self.valid_labels,
                    test=self.test_labels)
  # remove empty docs
  indices = np.array(np.sum(x, axis=-1) > 0).ravel()
  x = x[indices]
  if len(y) > 0:
    y = y[indices]
  # convert to one-hot
  if inc_labels > 0 and len(y) > 0 and y.ndim == 1:
    y = one_hot(y, self.n_labels)

  def _process(*data):
    data = tuple([
        tf.cast(
            tf.sparse.to_dense(i) if isinstance(i, tf.SparseTensor) else i,
            tf.float32) for i in data
    ])
    if inc_labels:
      if 0. < inc_labels < 1.:  # semi-supervised mask
        mask = gen.uniform(shape=(1,)) < inc_labels
        return dict(inputs=tuple(data), mask=mask)
      return data
    return data[0]

  # prepare the sparse matrices
  if isinstance(x, spmatrix):
    x = tf.SparseTensor(indices=sorted(zip(*x.nonzero())),
                        values=x.data,
                        dense_shape=x.shape)
  ds = tf.data.Dataset.from_tensor_slices(x)
  if inc_labels > 0:
    if isinstance(y, spmatrix):
      y = tf.SparseTensor(indices=sorted(zip(*y.nonzero())),
                          values=y.data,
                          dense_shape=y.shape)
    y = tf.data.Dataset.from_tensor_slices(y)
    ds = tf.data.Dataset.zip((ds, y))
  # configure the dataset
  ds = ds.map(_process, parallel)
  if cache is not None:
    ds = ds.cache(str(cache))
  # shuffle must be called after cache
  if shuffle is not None and shuffle > 0:
    ds = ds.shuffle(int(shuffle))
  ds = ds.batch(batch_size, drop_remainder)
  if prefetch is not None:
    ds = ds.prefetch(prefetch)
  return ds
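# Illustrative sketch (not part of the library): dropping all-zero rows from
# a scipy sparse matrix, as done above for empty documents. Standalone,
# assuming only numpy and scipy:
import numpy as np
from scipy import sparse

x = sparse.csr_matrix(np.array([[1, 0, 2], [0, 0, 0], [0, 3, 0]]))
keep = np.array(np.sum(x, axis=-1) > 0).ravel()
print(keep)   # [ True False  True]
x = x[keep]   # shape (2, 3): the empty middle row is removed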
def create_dataset(
    self,
    partition: Partition = 'train',
    *,
    batch_size: Optional[int] = 64,
    drop_remainder: bool = False,
    shuffle: int = 1000,
    cache: Optional[str] = '',
    prefetch: Optional[int] = tf.data.AUTOTUNE,
    parallel: Optional[int] = tf.data.AUTOTUNE,
    label_percent: Union[bool, int, float] = 0.0,
    oversample_ratio: Union[bool, float] = 0.0,
    fixed_oversample: bool = False,
    normalize: Literal['probs', 'tanh', 'raster'] = 'probs',
    seed: int = 1,
) -> tf.data.Dataset:
  """Create `tensorflow.data.Dataset` for the loaded dataset

  Parameters
  ----------
  partition : {'train', 'valid', 'test'}
      which data partition to load, by default 'train'
  batch_size : Optional[int], optional
      number of examples per minibatch, no batching if `None`, by default 64
  drop_remainder : bool, optional
      drop the last batch if it has fewer than `batch_size` examples,
      by default False
  shuffle : int, optional
      size of the shuffle buffer, no shuffling if `0` or `None`,
      by default 1000
  cache : Optional[str], optional
      path for caching the dataset, `''` for in-memory caching and `None`
      to disable caching, by default ''
  prefetch : Optional[int], optional
      number of prefetched batches, by default tf.data.AUTOTUNE
  parallel : Optional[int], optional
      number of parallel calls for `map`, by default tf.data.AUTOTUNE
  label_percent : Union[bool, int, float], optional
      If `1.0` or `True`, return both image and label, i.e. supervised task.
      If `0.0` or `False`, only image is returned, i.e. unsupervised task.
      If a scalar in `(0, 1)`, it indicates the percent of labelled data,
      i.e. semi-supervised task.
      If an integer `> 1`, the exact number of labelled samples,
      by default `0.0`
  oversample_ratio : Union[bool, float], optional
      a float number within the range `[0, 1]`, the ratio of labelled data
      mixed into each minibatch.
      If `0` or `False`, use the default label-unlabel ratio.
      If `1` or `True`, repeat all the labelled data in every minibatch.
      Otherwise, the number is the percent of labelled data per minibatch,
      by default 0.0
  fixed_oversample : bool
      if `True`, the number of labelled samples remains the same in each
      minibatch after oversampling, by default `False`
  normalize : Literal['probs', 'tanh', 'raster']
      image normalizing method, by default 'probs'
  seed : int, optional
      random seed for shuffling and sampling, by default 1

  Note
  ----
  `fixed_oversample` causes the whole supervised subset to be cached in
  memory, which could lead to OOM on GPU.

  Returns
  -------
  If `0. < label_percent < 1.`, return a dictionary
    image - `(tf.float32, (None, 28, 28, 1))`
    label - `(tf.float32, (None, 10))`
    mask  - `(tf.bool, (None, 1))`
  if `label_percent = 0`, return a single image
  if `label_percent = 1`, return a tuple of (image, label)
  """
  ds = get_partition(partition, train=self.train, valid=self.valid, test=self.test)
  ######## check labels available
  struct = as_tuple(tf.data.experimental.get_structure(ds))
  has_labels = False
  if len(struct) == 1:
    label_percent = 0.0
  else:
    has_labels = True
  label_percent = float(label_percent)
  assert 0. <= oversample_ratio <= 1., \
    f'oversample_ratio must be in [0, 1] given: {oversample_ratio}'
  # which task
  task = 'unsupervised'
  if label_percent == 1.0:
    task = 'supervised'
  elif 0. < label_percent < 1. or label_percent > 1.:
    task = 'semi'
  ######## prepare the labeled data
  rand = np.random.RandomState(seed=seed)
  length = tf.data.experimental.cardinality(ds).numpy()
  x_labeled, y_labeled, mask_labeled, ds_supervised = [], [], None, None
  if task == 'semi':
    n_labeled = int(label_percent * length
                    if 0. < label_percent < 1. else int(label_percent))
    n_unlabeled = length - n_labeled
    n_per_classes = int(n_labeled / len(self.labels))
    # for categorical labels we can do stratified sampling
    if self.label_type == 'categorical':
      y_map = self._build_stratified_map(partition)
      labeled_ids = np.stack([
          rand.choice(v, size=n_per_classes, replace=False)
          for k, v in y_map.items()
      ])
      is_labeled = np.full((length,), False, dtype=bool)
      is_labeled[labeled_ids] = True
    # just pseudo-random sampling
    else:
      is_labeled = np.array([True] * n_labeled +
                            [False] * (length - n_labeled))
      rand.shuffle(is_labeled)
    # add labeling flag to the dataset
    ds = tf.data.Dataset.zip(
        (tf.data.Dataset.from_tensor_slices(is_labeled), ds))
    # repeat the label data in every minibatch
    if oversample_ratio == 1.0:
      x_labeled, y_labeled = _extract_labeled_examples(
          ds,
          n_labeled=n_labeled,
          normalize_method=partial(self.normalize, normalize=normalize))
      if y_labeled.shape.ndims == 1:
        y_labeled = tf.one_hot(y_labeled, len(self.labels))
      mask_labeled = tf.cast(tf.ones([x_labeled.shape[0]]), tf.bool)
      ds = ds.filter(lambda i, x: tf.logical_not(i))
    # mixing the label into minibatch
    elif oversample_ratio > 0.0:
      ds_unsupervised = ds.filter(lambda i, x: tf.logical_not(i))
      # only sampling if not fixed amount of labels per minibatch
      if not fixed_oversample:
        ds_supervised = ds.filter(lambda i, x: i)
        ds_supervised = ds_supervised.repeat(
            int(np.ceil(n_unlabeled / n_labeled)))
        ds = tf.data.experimental.sample_from_datasets(
            [ds_unsupervised, ds_supervised],
            weights=[1. - oversample_ratio, oversample_ratio],
            seed=seed)
      # cache the labeled data for fixed amount of labels per minibatch
      else:
        # for some reason sample_from_datasets slows down significantly
        # if we sample from a single dataset that was split into two by
        # filtering, one of which is repeated
        # (e.g. 7000 samples/s dropped down to 1000 samples/s)
        x_labeled, y_labeled = _extract_labeled_examples(
            ds, n_labeled=n_labeled, normalize_method=None)
        mask_labeled = tf.cast(tf.ones([x_labeled.shape[0]]), tf.bool)
        ds_supervised = tf.data.Dataset.from_tensor_slices(
            (mask_labeled, (x_labeled, y_labeled)))
        n_repeat = int(
            np.ceil(oversample_ratio * n_unlabeled /
                    (1 - oversample_ratio) / n_labeled))
        ds_supervised = ds_supervised.shuffle(
            min(n_labeled, 1000),
            seed=seed,
            reshuffle_each_iteration=True,
        ).repeat(n_repeat)
        ds = ds_unsupervised
    # default ratio
    else:
      fixed_oversample = False
  ######## other cases
  elif task == 'unsupervised':
    ds = ds.map(lambda *x: (False, x))
  elif task == 'supervised':
    ds = ds.map(lambda *x: (True, x))
  else:
    raise ValueError(f'Unknown task type "{task}".')

  def _process(mask, data):
    images = tf.cast(data[0], tf.float32)
    # normalize the image
    images = self.normalize(images, normalize)
    if has_labels:
      labels = data[1]
      # convert to one-hot
      if len(labels.shape) == 1:
        labels = tf.one_hot(labels, len(self.labels))
    # unsupervised task
    if task == 'unsupervised':
      return images
    # supervised task
    elif task == 'supervised':
      return images, labels
    # semi-supervised task
    if oversample_ratio == 1.0:
      return images, x_labeled, y_labeled
    X_sup = tf.boolean_mask(images, mask, 0)
    y_sup = tf.boolean_mask(labels, mask, 0)
    X_uns = tf.boolean_mask(images, tf.logical_not(mask), 0)
    return X_uns, X_sup, y_sup

  # shuffle must be called after cache
  if shuffle is not None and shuffle > 0:
    ds = ds.shuffle(buffer_size=int(shuffle),
                    seed=seed,
                    reshuffle_each_iteration=True)
  # for mixing unsupervised and supervised data
  if task == 'semi' and fixed_oversample:
    if shuffle is not None and shuffle > 0:
      ds_supervised = ds_supervised.shuffle(buffer_size=int(shuffle),
                                            seed=seed,
                                            reshuffle_each_iteration=True)
    if batch_size is not None and batch_size > 0:
      n_sup = int(np.ceil(batch_size * oversample_ratio))
      batch_size = batch_size - n_sup
      ds_supervised = ds_supervised.batch(n_sup, drop_remainder=drop_remainder)
      ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    ds = tf.data.Dataset.zip((ds, ds_supervised))

    def merge_semi(uns, sup):
      m_uns, (x_uns, y_uns) = uns
      m_sup, (x_sup, y_sup) = sup
      return (tf.concat([m_uns, m_sup], 0),
              (tf.concat([x_uns, x_sup], 0), tf.concat([y_uns, y_sup], 0)))

    ds = ds.map(merge_semi)
  # process as normal
  elif batch_size is not None and batch_size > 0:
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
  # map, cache, and prefetch
  ds = ds.map(_process, num_parallel_calls=parallel)
  if cache is not None:
    ds = ds.cache(filename=str(cache))
  if prefetch is not None:
    ds = ds.prefetch(buffer_size=prefetch)
  ds: tf.data.Dataset
  return ds
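# Illustrative sketch (not part of the library): the non-fixed oversampling
# branch above interleaves the unlabelled stream with the repeated labelled
# stream via `sample_from_datasets`. A minimal standalone reproduction of
# that mixing strategy, using the same `experimental` API as the code above:
import tensorflow as tf

unlabeled = tf.data.Dataset.range(0, 90)             # stand-in unlabelled data
labeled = tf.data.Dataset.range(90, 100).repeat(9)   # few labels, repeated
mixed = tf.data.experimental.sample_from_datasets(
    [unlabeled, labeled], weights=[0.5, 0.5], seed=1)
# each minibatch now holds roughly half labelled examples
print(next(iter(mixed.batch(10))).numpy())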