def transform(self, X, n_components=None):
  # ====== check number of components ====== #
  # specified percentage of explained variance
  if n_components is not None:
    # percentage of variances
    if n_components < 1.:
      _ = np.cumsum(self.explained_variance_ratio_)
      n_components = (_ > n_components).nonzero()[0][0] + 1
    # specific number of components
    else:
      n_components = int(n_components)
  # ====== other info ====== #
  n = X.shape[0]
  if self.batch_size is None:
    batch_size = 12 * len(self.mean_)
  else:
    batch_size = self.batch_size
  # ====== start transforming ====== #
  X_transformed = []
  for start, end in minibatch(n=n, batch_size=batch_size):
    x = super(MiniBatchPCA, self).transform(X=X[start:end])
    if n_components is not None:
      x = x[:, :n_components]
    X_transformed.append(x)
  return np.concatenate(X_transformed, axis=0)

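# Worked illustration (not from the original code) of the fractional
# `n_components` rule in `transform` above: with hypothetical explained
# variance ratios, the number of components is the first index where the
# cumulative ratio exceeds the requested fraction, plus one.
import numpy as np

explained_variance_ratio = np.array([0.6, 0.25, 0.10, 0.05])
fraction = 0.9
cumulative = np.cumsum(explained_variance_ratio)  # [0.6, 0.85, 0.95, 1.0]
n_components = (cumulative > fraction).nonzero()[0][0] + 1
print(n_components)  # 3 -> the first 3 components cover 95% of the variance
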
def is_binary(x: np.ndarray):
  r""" A binary array contains only 0 or 1 """
  for s, e in minibatch(batch_size=1024, n=len(x)):
    y = x[s:e]
    # any value other than 0. or 1. makes the array non-binary
    if not np.all(np.isin(np.unique(y), (0., 1.))):
      return False
  return True

def is_discrete(x: np.ndarray):
  r""" A discrete array contains only integer values """
  # integer dtypes are discrete by definition; otherwise check the values
  # batch-wise for a non-zero fractional part
  if not np.issubdtype(x.dtype, np.integer):
    for s, e in minibatch(batch_size=1024, n=len(x)):
      y = x[s:e]
      if np.any(y.astype(np.int32) != y.astype(np.float32)):
        return False
  return True

def sparsity_percentage(x, batch_size=1024):
  n_zeros = 0
  n_total = np.prod(x.shape)
  for start, end in minibatch(batch_size=batch_size, n=x.shape[0], seed=None):
    y = x[start:end]
    if hasattr(y, 'count_nonzero'):  # sparse matrix
      n_nonzeros = y.count_nonzero()
    else:
      n_nonzeros = np.count_nonzero(y)
    n_zeros += np.prod(y.shape) - n_nonzeros
  return n_zeros / n_total

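# A minimal usage sketch (not part of the original module) for the three
# helpers above; it assumes `minibatch` from `odin.utils` is importable,
# since all of them iterate the input in (start, end) chunks.
import numpy as np

rng = np.random.RandomState(1)
x = np.zeros((1000, 20), dtype=np.float32)
x[rng.rand(1000, 20) > 0.9] = 1.0  # ~10% ones, the rest zeros

print(is_binary(x))            # True: only 0.0 and 1.0 occur
print(is_discrete(x))          # True: every value is a whole number
print(sparsity_percentage(x))  # ~0.9: fraction of zero entries
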
def make_dnn_prediction(functions, X, batch_size=256, title=''):
  return_list = True
  if not isinstance(functions, (tuple, list)):
    functions = [functions]
    return_list = False
  n_functions = len(functions)
  results = [[] for i in range(n_functions)]
  # ====== prepare progress bar ====== #
  n_samples = len(X)
  prog = Progbar(target=n_samples,
                 print_summary=True,
                 name="Making prediction: %s" % str(title))
  # ====== for feeder ====== #
  if isinstance(X, F.Feeder):
    y_true = []
    for x, y in X.set_batch(batch_size=batch_size):
      for res, fn in zip(results, functions):
        res.append(fn(x))
      prog.add(x.shape[0])
      y_true.append(np.argmax(y, axis=-1) if y.ndim == 2 else y)
    results = [np.concatenate(res, axis=0) for res in results]
    y_true = np.concatenate(y_true, axis=0)
    if return_list:
      return results, y_true
    return results[0], y_true
  # ====== for numpy array ====== #
  else:
    for start, end in minibatch(batch_size=batch_size, n=n_samples):
      y = X[start:end]
      for res, fn in zip(results, functions):
        res.append(fn(y))
      prog.add(end - start)
    results = [np.concatenate(res, axis=0) for res in results]
    if return_list:
      return results
    return results[0]

def evaluate(model, ds, args):
  test = ds.create_dataset('test', batch_size=32)
  # === 1. marginalized llk
  n_mcmc = 100
  llk = []
  kl = []
  for x in tqdm(test.take(10)):
    qz = model.encode(x, training=False)
    batch_llk = []
    for s, e in minibatch(8, n_mcmc):
      n = e - s
      z = qz.sample(n)
      z = tf.reshape(z, (-1, z.shape[-1]))
      px = model.decode(z, training=False)
      # llk
      batch_llk.append(
          tf.reshape(px.log_prob(tf.tile(x, (n, 1, 1, 1))), (n, -1)))
      # kl
    batch_llk = tf.concat(batch_llk, 0)
    llk.append(batch_llk)
  llk = tf.concat(llk, axis=-1)
  print(llk.shape)
  print('LLK:', tf.reduce_mean(tf.reduce_logsumexp(llk, 0)))

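# For reference (not part of the original code): the quantity the loop above
# approximates is the importance-weighted estimate of the marginal
# log-likelihood,
#
#   log p(x) ~= logsumexp_{s=1..S}[ log p(x|z_s) + log p(z_s) - log q(z_s|x) ] - log S,
#   z_s ~ q(z|x),
#
# with S = n_mcmc. The loop only accumulates the decoder term log p(x|z_s)
# for each sample before the final logsumexp over the S dimension.
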
def fast_kmeans(
    X,
    *,
    n_clusters: int = 8,
    max_iter: int = 300,
    tol: float = 0.0001,
    n_init: int = 10,
    random_state: int = 1,
    init: Literal['scalable-k-means++', 'k-means||',
                  'random'] = 'scalable-k-means++',
    oversampling_factor: float = 2.0,
    max_samples_per_batch: int = 32768,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'auto',
) -> MiniBatchKMeans:
  """KMeans clustering

  Parameters
  ----------
  n_clusters : int (default = 8)
      The number of centroids or clusters you want.
  max_iter : int (default = 300)
      The more iterations of EM, the more accurate, but slower.
  tol : float64 (default = 1e-4)
      Stopping criterion when centroid means do not change much.
  random_state : int (default = 1)
      If you want results to be the same when you restart Python, select a
      state.
  init : {'scalable-k-means++', 'k-means||', 'random' or an ndarray}
      (default = 'scalable-k-means++')
      'scalable-k-means++' or 'k-means||': Uses fast and stable scalable
      kmeans++ initialization.
      'random': Choose `n_clusters` observations (rows) at random from the
      data for the initial centroids.
      If an ndarray is passed, it should be of shape (n_clusters, n_features)
      and gives the initial centers.
  oversampling_factor : float (default = 2.0)
      The amount of points to sample in scalable k-means++ initialization for
      potential centroids. Increasing this value can lead to better initial
      centroids at the cost of memory. The total number of centroids sampled
      in scalable k-means++ is oversampling_factor * n_clusters * 8.
  max_samples_per_batch : int (default = 32768)
      The number of data samples to use for batches of the pairwise distance
      computation. This computation is done throughout both fit and predict.
      The default should suit most cases. The total number of elements in the
      batched pairwise distance computation is
      max_samples_per_batch * n_clusters. It might become necessary to lower
      this number when n_clusters becomes prohibitively large.
  """
  kwargs = dict(locals())
  X = kwargs.pop('X')
  kwargs.pop('framework')
  ## fine-tuning the kwargs
  cuml = _check_cuml(framework)
  if cuml:
    from cuml.cluster import KMeans
    kwargs.pop('n_init')
  else:
    kwargs.pop('oversampling_factor')
    kwargs.pop('max_samples_per_batch')
    if kwargs['init'] in ('scalable-k-means++', 'k-means||'):
      kwargs['init'] = 'k-means++'
  ## fitting
  if not cuml:
    from odin.utils import minibatch
    kmean = MiniBatchKMeans(**kwargs)
    for s, e in minibatch(int(max_samples_per_batch),
                          n=X.shape[0],
                          seed=random_state):
      kmean.partial_fit(X[s:e])
  else:
    kmean = KMeans(verbose=False, **kwargs)
    kmean.fit(X)
  return kmean

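# A minimal usage sketch (assumption: no GPU/cuML available, so
# `framework='sklearn'` selects the MiniBatchKMeans + partial_fit path;
# `_check_cuml` and `MiniBatchKMeans` come from this module's imports).
import numpy as np

X = np.random.RandomState(1).randn(5000, 16).astype(np.float32)
kmean = fast_kmeans(X, n_clusters=8, framework='sklearn')
labels = kmean.predict(X)          # cluster index for every sample
centers = kmean.cluster_centers_   # (8, 16) centroid matrix
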
def fast_pca(
    *x,
    n_components: Optional[int] = None,
    algo: Literal['pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'] = 'pca',
    y=None,
    batch_size: int = 1024,
    return_model: bool = False,
    random_state: int = 1,
):
  r""" A shortcut for many different PCA algorithms

  Arguments:
    x : {list, tuple}
      list of matrices for transformation, the first matrix will be used for
      training
    n_components : {None, int}
      number of PCA components
    algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
      different PCA algorithms:
        'ipca' - IncrementalPCA,
        'ppca' - Probabilistic PCA,
        'sppca' - Supervised Probabilistic PCA,
        'plda' - Probabilistic LDA,
        'rpca' - randomized PCA using randomized SVD,
        'pca' - Normal PCA
    y : {numpy.ndarray, None}
      labels, required in case of `sppca` and `plda`
    batch_size : int (default: 1024)
      batch size, only used for IncrementalPCA
    return_model : bool (default: False)
      if True, return the trained PCA model as the FIRST return
  """
  try:
    from cuml.decomposition import PCA as cuPCA
  except ImportError:
    cuPCA = None

  batch_size = int(batch_size)
  algo = str(algo).lower()
  if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
    raise ValueError("`algo` must be one of the following: 'pca', 'ipca', "
                     "'ppca', 'sppca', 'plda', or 'rpca'; but given: '%s'" %
                     algo)
  if algo in ('sppca', 'plda') and y is None:
    raise RuntimeError("`y` must not be None if `algo` is 'sppca' or 'plda'")
  x = flatten_list(x, level=None)
  # ====== check input ====== #
  x_train = x[0]
  x_test = x[1:]
  input_shape = None
  if x_train.ndim > 2:  # only 2D for PCA
    input_shape = (-1,) + x_train.shape[1:]
    new_shape = (-1, np.prod(input_shape[1:]))
    x_train = np.reshape(x_train, new_shape)
    x_test = [np.reshape(x, new_shape) for x in x_test]
    if n_components is not None:  # no need to reshape back
      input_shape = None
  # ====== train PCA ====== #
  if algo == 'sppca':
    pca = SupervisedPPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'plda':
    from odin.ml import PLDA
    pca = PLDA(n_phi=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'pca':
    if x_train.shape[1] > 1000 and x_train.shape[0] > 1e5 and cuPCA is not None:
      pca = cuPCA(n_components=n_components, random_state=random_state)
    else:
      pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  elif algo == 'rpca':
    # we copy the implementation of RandomizedPCA because it is significantly
    # faster than PCA(svd_solver='randomized')
    pca = RandomizedPCA(n_components=n_components,
                        iterated_power=2,
                        random_state=random_state)
    pca.fit(x_train)
  elif algo == 'ipca':
    pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    prog = Progbar(target=x_train.shape[0],
                   print_report=False,
                   print_summary=False,
                   name="Fitting PCA")
    for start, end in minibatch(batch_size=batch_size,
                                n=x_train.shape[0],
                                seed=1234):
      pca.partial_fit(x_train[start:end], check_input=False)
      prog.add(end - start)
  elif algo == 'ppca':
    pca = PPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  # ====== transform ====== #
  x_train = pca.transform(x_train)
  x_test = [pca.transform(x) for x in x_test]
  # reshape back to original shape if necessary
  if input_shape is not None:
    x_train = np.reshape(x_train, input_shape)
    x_test = [np.reshape(x, input_shape) for x in x_test]
  # return the results
  if len(x_test) == 0:
    return x_train if not return_model else (pca, x_train)
  if return_model:
    return tuple([pca, x_train] + x_test)
  del pca
  return tuple([x_train] + x_test)

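# A minimal usage sketch (assumes the module-level imports used by
# `fast_pca`, e.g. IncrementalPCA, Progbar and minibatch, are available):
# fit on the first array, transform every array, here with the incremental
# 'ipca' algorithm so fitting proceeds in mini-batches of `batch_size`.
import numpy as np

x_train = np.random.RandomState(1).randn(2000, 64).astype(np.float32)
x_test = np.random.RandomState(2).randn(500, 64).astype(np.float32)
z_train, z_test = fast_pca(x_train, x_test,
                           n_components=2,
                           algo='ipca',
                           batch_size=512)
print(z_train.shape, z_test.shape)  # (2000, 2) (500, 2)
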
def evaluate(vae: VariationalAutoencoder,
             ds: ImageDataset,
             expdir: str,
             title: str,
             batch_size: int = 64,
             take_count: int = -1,
             n_images: int = 36,
             seed: int = 1):
  n_rows = int(np.sqrt(n_images))
  is_semi = vae.is_semi_supervised()
  is_hierarchical = vae.is_hierarchical()
  ds_kw = dict(batch_size=batch_size, label_percent=1.0, shuffle=False)
  ## prepare
  rand = np.random.RandomState(seed=seed)
  if not os.path.exists(expdir):
    os.makedirs(expdir)
  ## data for training semi-supervised
  train = ds.create_dataset('train', **ds_kw)
  (llkx_train, llky_train, x_org_train, x_rec_train, y_true_train,
   y_pred_train, z_train, pz_train) = _call(vae,
                                            ds=train,
                                            rand=rand,
                                            take_count=take_count,
                                            n_images=n_images,
                                            verbose=True)
  ## data for testing
  test = ds.create_dataset('test', **ds_kw)
  (llkx_test, llky_test, x_org_test, x_rec_test, y_true_test, y_pred_test,
   z_test, pz_test) = _call(vae,
                            ds=test,
                            rand=rand,
                            take_count=take_count,
                            n_images=n_images,
                            verbose=True)
  # === 0. plotting latent-factor pairs
  for idx, z in enumerate(z_test):
    z = z.mean()
    f = y_true_test
    corr_mat = Correlation.Spearman(z, f)  # [n_latents, n_factors]
    plot_latents_pairs(z, f, corr_mat, ds.labels)
    vs.plot_save(f'{expdir}/latent{idx}_factor.pdf', dpi=100, verbose=True)
  # === 0. latent traverse plot
  x_travs = x_org_test
  if x_travs.ndim == 3:  # grayscale image
    x_travs = np.expand_dims(x_travs, -1)
  else:  # color image
    x_travs = np.transpose(x_travs, (0, 2, 3, 1))
  x_travs = x_travs[rand.permutation(x_travs.shape[0])]
  n_visual_samples = 5
  n_traverse_points = 21
  n_top_latents = 10
  plt.figure(figsize=(8, 3 * n_visual_samples))
  for i in range(n_visual_samples):
    images = vae.sample_traverse(x_travs[i:i + 1],
                                 min_val=-np.min(z_test[0].mean()),
                                 max_val=np.max(z_test[0].mean()),
                                 n_best_latents=n_top_latents,
                                 n_traverse_points=n_traverse_points,
                                 mode='linear')
    images = as_tuple(images)[0]
    images = _prepare_images(images.mean().numpy(), normalize=True)
    vs.plot_images(images,
                   grids=(n_top_latents, n_traverse_points),
                   ax=(n_visual_samples, 1, i + 1))
    if i == 0:
      plt.title('Latents traverse')
  plt.tight_layout()
  vs.plot_save(f'{expdir}/latents_traverse.pdf', dpi=180, verbose=True)
  # === 0. prior sampling plot
  images = as_tuple(vae.sample_observation(n=n_images, seed=seed))[0]
  images = _prepare_images(images.mean().numpy(), normalize=True)
  plt.figure(figsize=(5, 5))
  vs.plot_images(images, grids=(n_rows, n_rows), title='Sampled')
  # === 1. reconstruction plot
  plt.figure(figsize=(15, 15))
  vs.plot_images(x_org_train,
                 grids=(n_rows, n_rows),
                 ax=(2, 2, 1),
                 title='[Train]Original')
  vs.plot_images(x_rec_train,
                 grids=(n_rows, n_rows),
                 ax=(2, 2, 2),
                 title='[Train]Reconstructed')
  vs.plot_images(x_org_test,
                 grids=(n_rows, n_rows),
                 ax=(2, 2, 3),
                 title='[Test]Original')
  vs.plot_images(x_rec_test,
                 grids=(n_rows, n_rows),
                 ax=(2, 2, 4),
                 title='[Test]Reconstructed')
  plt.tight_layout()
  ## prepare the labels
  label_type = ds.label_type
  if label_type == 'categorical':
    labels_name = ds.labels
    true = np.argmax(y_true_test, axis=-1)
    labels_true = np.array([labels_name[i] for i in true])
    labels_pred = labels_true
    if is_semi:
      pred = np.argmax(y_pred_test.mean().numpy(), axis=-1)
      labels_pred = np.array([labels_name[i] for i in pred])
  elif label_type == 'factor':  # dsprites, shapes3d
    labels_name = ['cube', 'cylinder', 'sphere', 'round'] \
      if 'shapes3d' in ds.name else ['square', 'ellipse', 'heart']
    true = y_true_test[:, 2].astype('int32')
    labels_true = np.array([labels_name[i] for i in true])
    labels_pred = labels_true
    if is_semi:
      pred = get_ymean(y_pred_test)[:, 2].astype('int32')
      labels_pred = np.array([labels_name[i] for i in pred])
  else:  # CelebA
    raise NotImplementedError
  ## confusion matrix
  if is_semi:
    plt.figure(figsize=(8, 8))
    acc = accuracy_score(y_true=true, y_pred=pred)
    vs.plot_confusion_matrix(cm=confusion_matrix(y_true=true, y_pred=pred),
                             labels=labels_name,
                             cbar=True,
                             fontsize=10,
                             title=f'{title} Acc:{acc:.2f}')
  ## save arrays for later inspections
  with open(f'{expdir}/arrays', 'wb') as f:
    pickle.dump(
        dict(z_train=z_train,
             y_pred_train=y_pred_train,
             y_true_train=y_true_train,
             z_test=z_test,
             y_pred_test=y_pred_test,
             y_true_test=y_true_test,
             labels=labels_name,
             ds=ds.name,
             label_type=label_type), f)
  print(f'Exported arrays to "{expdir}/arrays"')
  ## semi-supervised
  z_mean_train = np.concatenate(
      [z.mean().numpy().reshape(z.batch_shape[0], -1) for z in z_train], -1)
  z_mean_test = np.concatenate(
      [z.mean().numpy().reshape(z.batch_shape[0], -1) for z in z_test], -1)
  # === 2. scatter points latents plot
  n_points = 5000
  ids = rand.permutation(len(labels_true))[:n_points]
  Y_true = labels_true[ids]
  Y_pred = labels_pred[ids]
  # tsne plot
  n_latents = 0 if len(z_train) == 1 else len(z_train)
  for name, X in zip(
      ['all'] + [f'latents{i}' for i in range(n_latents)],
      [z_mean_test[ids]] +
      [z_test[i].mean().numpy()[ids] for i in range(n_latents)]):
    print(f'Plot scatter points for {name}')
    X = X.reshape(X.shape[0], -1)  # flatten to 2D
    X = Pipeline([('zscore', StandardScaler()),
                  ('pca', PCA(min(X.shape[1], 512),
                              random_state=seed))]).fit_transform(X)
    tsne = DimReduce.TSNE(X, n_components=2, framework='sklearn')
    kw = dict(x=tsne[:, 0], y=tsne[:, 1], grid=False, size=12.0, alpha=0.8)
    plt.figure(figsize=(12, 6))
    vs.plot_scatter(color=Y_true,
                    title=f'[True]{title}-{name}',
                    ax=(1, 2, 1),
                    **kw)
    vs.plot_scatter(color=Y_pred,
                    title=f'[Pred]{title}-{name}',
                    ax=(1, 2, 2),
                    **kw)
  ## save all plot
  vs.plot_save(f'{expdir}/analysis.pdf', dpi=180, verbose=True)
  # === 3. show the latents statistics
  n_latents = len(z_train)
  colors = sns.color_palette(n_colors=len(labels_true))
  styles = dict(grid=False,
                ticks_off=False,
                alpha=0.6,
                xlabel='mean',
                ylabel='stddev')

  # scatter between latents and labels (assume categorical distribution)
  def _show_latents_labels(Z, Y, title):
    plt.figure(figsize=(5 * n_latents, 5), dpi=150)
    for idx, z in enumerate(Z):
      if len(z.batch_shape) == 0:
        mean = np.repeat(np.expand_dims(z.mean(), 0), Y.shape[0], 0)
        stddev = z.sample(Y.shape[0]) - mean
      else:
        mean = flatten(z.mean())
        stddev = flatten(z.stddev())
      y = np.argmax(Y, axis=-1)
      data = [[], [], []]
      for y_i, c in zip(np.unique(y), colors):
        mask = (y == y_i)
        data[0].append(np.mean(mean[mask], 0))
        data[1].append(np.mean(stddev[mask], 0))
        data[2].append([labels_true[y_i]] * mean.shape[1])
      vs.plot_scatter(
          x=np.concatenate(data[0], 0),
          y=np.concatenate(data[1], 0),
          color=np.concatenate(data[2], 0),
          ax=(1, n_latents, idx + 1),
          size=15 if mean.shape[1] < 128 else 8,
          title=f'[Test-{title}]#{idx} - {mean.shape[1]} (units)',
          **styles)
    plt.tight_layout()

  # simple scatter mean-stddev each latents
  def _show_latents(Z, title):
    plt.figure(figsize=(3.5 * n_latents, 3.5), dpi=150)
    for idx, z in enumerate(Z):
      mean = flatten(z.mean())
      stddev = flatten(z.stddev())
      if mean.ndim == 2:
        mean = np.mean(mean, 0)
        stddev = np.mean(stddev, 0)
      vs.plot_scatter(
          x=mean,
          y=stddev,
          ax=(1, n_latents, idx + 1),
          size=15 if len(mean) < 128 else 8,
          title=f'[Test-{title}]#{idx} - {len(mean)} (units)',
          **styles)

  _show_latents_labels(z_test, y_true_test, 'post')
  _show_latents_labels(pz_test, y_true_test, 'prior')
  _show_latents(z_test, 'post')
  _show_latents(pz_test, 'prior')
  # KL statistics
  vs.plot_figure()
  for idx, (qz, pz) in enumerate(zip(z_test, pz_test)):
    kl = []
    qz = Normal(loc=qz.mean(), scale=qz.stddev(), name=f'posterior{idx}')
    pz = Normal(loc=pz.mean(), scale=pz.stddev(), name=f'prior{idx}')
    for s, e in minibatch(batch_size=8, n=100):
      z = qz.sample(e - s)  # don't do this in GPU, it explodes!
      kl.append((qz.log_prob(z) - pz.log_prob(z)).numpy())
    kl = np.concatenate(kl, 0)  # (mcmc, batch, event)
    # per sample
    kl_samples = np.sum(kl, as_tuple(list(range(2, kl.ndim))))
    kl_samples = logsumexp(kl_samples, 0)
    plt.subplot(n_latents, 2, idx * 2 + 1)
    sns.histplot(kl_samples, bins=50)
    plt.title(f'Z#{idx} KL per sample (nats)')
    # per latent
    kl_latents = np.mean(flatten(logsumexp(kl, 0)), 0)
    plt.subplot(n_latents, 2, idx * 2 + 2)
    plt.plot(np.sort(kl_latents))
    plt.title(f'Z#{idx} KL per dim (nats)')
  plt.tight_layout()
  vs.plot_save(f'{expdir}/latents.pdf', dpi=180, verbose=True)

# ===========================================================================
update_ops = K.optimizers.Adam(lr=0.001).minimize(loss)
K.initialize_all_variables()
# ====== initialize ====== #
record_train_loss = []
record_valid_loss = []
patience = 3
epoch = 0
# We want the rate to go up but the distortion to go down
while True:
  # ====== training ====== #
  train_losses = []
  prog = Progbar(target=X_train.shape[0], name='Epoch%d' % epoch)
  start_time = timeit.default_timer()
  for start, end in minibatch(batch_size=args.bs,
                              n=X_train.shape[0],
                              seed=K.get_rng().randint(10e8)):
    _ = K.eval(loss,
               feed_dict={X: X_train[start:end]},
               update_after=update_ops)
    prog.add(end - start)
    train_losses.append(_)
  # ====== training log ====== #
  print(ctext("[Epoch %d]" % epoch, 'yellow'),
        '%.2f(s)' % (timeit.default_timer() - start_time))
  print("[Training set] Loss: %.4f" % np.mean(train_losses))
  # ====== validation set ====== #
  code_samples, lo = K.eval([Z, loss], feed_dict={X: X_valid})
  print("[Valid set] Loss: %.4f" % lo)
  # ====== record the history ====== #
  record_train_loss.append(np.mean(train_losses))