def dimension_reduction(*x, algo='pca', **kwargs):
    """Bring one or more inputs down to two feature dimensions.

    The feature count is read from ``x[0].shape[1]`` only:

    * 1 feature  -> a ``ValueError`` is raised,
    * 2 features -> the inputs are passed through untouched,
    * otherwise  -> every input is forwarded to ``fast_pca``, ``fast_tsne``
      or ``fast_umap`` depending on ``algo``.

    Parameters
    ----------
    *x
        Inputs exposing a ``shape`` attribute.
    algo
        'pca', 'tsne' or 'umap'; converted with ``str(...).lower()`` first.
    **kwargs
        Extra keyword arguments handed to the selected reduction function.

    Returns
    -------
    ``x[0]`` when the final ``x`` has length one, otherwise ``x`` itself.

    Raises
    ------
    AssertionError
        If ``algo`` is not one of the supported names.
    ValueError
        If the first input has exactly one feature.
    """
    algo = str(algo).lower()
    assert algo in ('pca', 'tsne', 'umap'), \
        "No support for algorithm: '%s'" % algo

    n_features = x[0].shape[1]
    if n_features == 1:
        raise ValueError("No dimension reduction for input with shape: %s" %
                         str(x[0].shape))

    # Inputs that already have two features skip the reduction entirely.
    if n_features != 2:
        # Every backend receives the same fixed random_state.
        if algo == 'pca':
            x = fast_pca(*x, n_components=2, random_state=1234, **kwargs)
        elif algo == 'tsne':
            x = fast_tsne(*x,
                          n_components=2,
                          perplexity=30.0,
                          learning_rate=200,
                          n_iter=1000,
                          random_state=1234,
                          n_jobs=8,
                          **kwargs)
        else:
            x = fast_umap(*x, random_state=1234, **kwargs)

    # NOTE(review): len(x) is taken on whatever the backend returned; if a
    # backend returns a bare array for a single input this checks the row
    # count rather than the number of inputs - confirm against fast_*.
    return x[0] if len(x) == 1 else x
def extract_pca(p_train, p_test):
    """Collect a 2-component PCA result for each train/test distribution pair.

    ``p_train`` and ``p_test`` are the output and latent distributions; they
    are walked in lockstep with ``zip``, so iteration stops at the shorter
    one. A pair is skipped unless ``train.event_shape[0] > 1``.

    For every remaining pair the squeezed means of both distributions are
    passed to ``fast_pca(..., n_components=2)`` and the LAST element of its
    result is kept (presumably the projection of the test means - confirm
    against ``fast_pca``).

    Returns
    -------
    list
        One entry per pair that passed the ``event_shape`` filter.
    """
    pca = []
    for train, test in zip(p_train, p_test):
        if train.event_shape[0] > 1:
            train_mean = squeeze(train.mean())
            test_mean = squeeze(test.mean())
            projections = fast_pca(train_mean, test_mean, n_components=2)
            pca.append(projections[-1])
    return pca
_ = K.eval(loss, feed_dict={X: X_train[start:end]}, update_after=update_ops) prog.add(end - start) train_losses.append(_) # ====== training log ====== # print(ctext("[Epoch %d]" % epoch, 'yellow'), '%.2f(s)' % (timeit.default_timer() - start_time)) print("[Training set] Loss: %.4f" % np.mean(train_losses)) # ====== validation set ====== # code_samples, lo = K.eval([Z, loss], feed_dict={X: X_valid}) print("[Valid set] Loss: %.4f" % lo) # ====== record the history ====== # record_train_loss.append(np.mean(train_losses)) record_valid_loss.append(lo) # ====== plotting ====== # if args.dim > 2: code_samples = ml.fast_pca(code_samples, n_components=2, random_state=K.get_rng().randint(10e8)) img_samples = f_samples() img_mean = f_X(X_valid[:25]) V.plot_figure(nrow=3, ncol=12) ax = plt.subplot(1, 3, 1) ax.scatter(code_samples[:, 0], code_samples[:, 1], s=2, c=y_valid, alpha=0.3) ax.set_title('Epoch %d' % epoch) ax.set_aspect('equal', 'box') ax.axis('off') ax = plt.subplot(1, 3, 2) ax.imshow(V.tile_raster_images(img_samples), cmap=plt.cm.Greys_r) ax.axis('off')
random_state=5218) scorer.fit(X=X_train, y=y_true['train']) scorer.evaluate(X_test, y_true['test'], labels=labels) # ====== svm scoring ====== # print(ctext("==== '%s'" % "Ivec SVM-scoring", 'cyan')) scorer = ml.Scorer(wccn=True, lda=True, method='svm') scorer.fit(X=X_train, y=y_true['train']) scorer.evaluate(X_test, y_true['test'], labels=labels) # =========================================================================== # Super-vector # =========================================================================== X_train = stats['train'][1] X_test = stats['test'][1] X_train, X_test = ml.fast_pca(X_train, X_test, n_components=args.tdim, algo='ppca', random_state=5218) # ====== GMM scoring ====== # print(ctext("==== '%s'" % "Super-Vector GMM-scoring-ova", 'cyan')) scorer = ml.GMMclassifier(strategy="ova", n_components=3, covariance_type='full', centering=True, wccn=True, unit_length=True, lda=False, concat=False) scorer.fit(X=X_train, y=y_true['train']) scorer.evaluate(X_test, y_true['test'], labels=labels) # ====== plda scoring ====== #
from odin import ml
from odin import visual as vs

# TensorFlow runtime switches, written through os.environ.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

# Seed TensorFlow first, then NumPy's global generator.
tf.random.set_seed(8)
np.random.seed(8)

# Digits data with a 70/30 train/test split.
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# The three embeddings are computed in this order: UMAP, t-SNE, PCA.
X_umap = ml.fast_umap(X_train, X_test)
X_tsne = ml.fast_tsne(X_train, X_test)
X_pca = ml.fast_pca(X_train, X_test, n_components=2)

styles = {'size': 12, 'alpha': 0.6, 'centroids': True}

# One figure per embedding (PCA, t-SNE, UMAP): element 0 is drawn in the
# left panel coloured by y_train, element 1 in the right panel by y_test.
for embedding in (X_pca, X_tsne, X_umap):
    vs.plot_figure(6, 12)
    vs.plot_scatter(x=embedding[0], color=y_train, ax=(1, 2, 1), **styles)
    vs.plot_scatter(x=embedding[1], color=y_test, ax=(1, 2, 2), **styles)
def plot_epoch(task):
    """Render a diagnostic figure for the test data and save it as a PNG.

    The figure is written to ``FIGURE_PATH/latent_<epoch>.png`` and contains,
    from top to bottom of the grid:

    * rows 0-3   : scatter of every latent in ``Z`` (PCA to 2-D if wider),
    * rows 3-6   : PCA scatter of the data, the reconstruction, and the
                   reconstruction with its own separately fitted PCA,
    * rows 6-10  : the first two scatters again, coloured by count-sum,
    * rows 10-13 : count-sum series statistics in three axis/sort settings,
    * rows 13+   : per-sample series statistics for ``n_visual_samples``
                   randomly picked rows, three grid rows per sample.

    Parameters
    ----------
    task
        Object exposing ``curr_epoch``, or ``None`` (treated as epoch 0).

    Plotting only happens when the epoch is below 5 or a multiple of 5.
    Relies on the module-level names ``X_test``, ``y_test``, ``f_z``, ``f_w``,
    ``Z_names``, ``n_classes`` and ``FIGURE_PATH``.

    NOTE(review): the layout of this function was reconstructed from
    whitespace-collapsed source; the nesting shown here is the most plausible
    reading and should be confirmed against the original file.
    """
    if task is None:
        curr_epoch = 0
    else:
        curr_epoch = task.curr_epoch
    # Only plot during the first five epochs and then on every fifth epoch.
    if not (curr_epoch < 5 or curr_epoch % 5 == 0):
        return
    # Local generator with a fixed seed; it supplies the PCA seeds and the
    # sample permutation below, so the order of draws matters.
    rand = np.random.RandomState(seed=1234)
    X, y = X_test, y_test
    n_data = X.shape[0]
    Z = f_z(X)
    W, W_stdev_mcmc, W_stdev_analytic = f_w(X)
    # X and W go through one shared fast_pca call ...
    # (10e8 is the float 1e9, used as the upper bound of the drawn seed)
    X_pca, W_pca_1 = fast_pca(X,
                              W,
                              n_components=2,
                              random_state=rand.randint(10e8))
    # ... and W additionally gets a PCA of its own.
    W_pca_2 = fast_pca(W, n_components=2, random_state=rand.randint(10e8))
    # Per-row totals: X is summed over every axis except the first.
    X_count_sum = np.sum(X, axis=tuple(range(1, X.ndim)))
    W_count_sum = np.sum(W, axis=-1)
    n_visual_samples = 8
    # 13 rows for the summary panels plus 3 rows per visualised sample.
    nrow = 13 + n_visual_samples * 3
    V.plot_figure(nrow=int(nrow * 1.8), ncol=18)
    with V.plot_gridSpec(nrow=nrow + 3, ncol=6, hspace=0.8) as grid:
        # plot the latent space (each panel is two grid columns wide)
        for i, (z, name) in enumerate(zip(Z, Z_names)):
            if z.shape[1] > 2:
                z = fast_pca(z,
                             n_components=2,
                             random_state=rand.randint(10e8))
            ax = V.subplot(grid[:3, (i * 2):(i * 2 + 2)])
            V.plot_scatter(x=z[:, 0],
                           y=z[:, 1],
                           color=y,
                           marker=y,
                           n_samples=4000,
                           ax=ax,
                           legend_enable=False,
                           legend_ncol=n_classes)
            ax.set_title(name, fontsize=12)
        # plot the reconstruction (legend only on the middle panel)
        for i, (x, name) in enumerate(
                zip([X_pca, W_pca_1, W_pca_2], [
                    'Original data', 'Reconstruction',
                    'Reconstruction (separated PCA)'
                ])):
            ax = V.subplot(grid[3:6, (i * 2):(i * 2 + 2)])
            V.plot_scatter(x=x[:, 0],
                           y=x[:, 1],
                           color=y,
                           marker=y,
                           n_samples=4000,
                           ax=ax,
                           legend_enable=i == 1,
                           legend_ncol=n_classes,
                           title=name)
        # plot the reconstruction count sum (two panels, three columns each)
        for i, (x, count_sum, name) in enumerate(
                zip([X_pca, W_pca_1], [X_count_sum, W_count_sum], [
                    'Original data (Count-sum)', 'Reconstruction (Count-sum)'
                ])):
            ax = V.subplot(grid[6:10, (i * 3):(i * 3 + 3)])
            V.plot_scatter(x=x[:, 0],
                           y=x[:, 1],
                           val=count_sum,
                           n_samples=2000,
                           marker=y,
                           ax=ax,
                           size=8,
                           legend_enable=i == 0,
                           legend_ncol=n_classes,
                           title=name,
                           colorbar=True,
                           fontsize=10)
        # plot the count-sum series: everything is summed over axis 0
        count_sum_observed = np.sum(X, axis=0).ravel()
        count_sum_expected = np.sum(W, axis=0)
        count_sum_stdev_explained = np.sum(W_stdev_mcmc, axis=0)
        count_sum_stdev_total = np.sum(W_stdev_analytic, axis=0)
        # Same statistics drawn three times: unsorted linear, sorted linear,
        # sorted log-log.
        for i, kws in enumerate([
                dict(xscale='linear', yscale='linear', sort_by=None),
                dict(xscale='linear', yscale='linear', sort_by='expected'),
                dict(xscale='log', yscale='log', sort_by='expected')
        ]):
            # `ax` is not passed on; presumably plot_series_statistics draws
            # on the current axes - confirm.
            ax = V.subplot(grid[10:10 + 3, (i * 2):(i * 2 + 2)])
            V.plot_series_statistics(count_sum_observed,
                                     count_sum_expected,
                                     explained_stdev=count_sum_stdev_explained,
                                     total_stdev=count_sum_stdev_total,
                                     fontsize=8,
                                     title="Count-sum" if i == 0 else None,
                                     **kws)
        # plot the mean and variances of individual, randomly chosen rows
        curr_grid_index = 13
        ids = rand.permutation(n_data)
        ids = ids[:n_visual_samples]
        for i in ids:
            observed, expected, stdev_explained, stdev_total = \
                X[i], W[i], W_stdev_mcmc[i], W_stdev_analytic[i]
            observed = observed.ravel()
            for j, kws in enumerate([
                    dict(xscale='linear', yscale='linear', sort_by=None),
                    dict(xscale='linear', yscale='linear', sort_by='expected'),
                    dict(xscale='log', yscale='log', sort_by='expected')
            ]):
                ax = V.subplot(grid[curr_grid_index:curr_grid_index + 3,
                                    (j * 2):(j * 2 + 2)])
                V.plot_series_statistics(observed,
                                         expected,
                                         explained_stdev=stdev_explained,
                                         total_stdev=stdev_total,
                                         fontsize=8,
                                         title="Test Sample #%d" %
                                         i if j == 0 else None,
                                         **kws)
            # Move down three grid rows for the next sample.
            curr_grid_index += 3
    V.plot_save(os.path.join(FIGURE_PATH, 'latent_%d.png' % curr_epoch),
                dpi=200,
                log=True)
    # NOTE(review): as laid out here this terminates the process right after
    # the first figure is saved. The original indentation is not visible, so
    # confirm whether this call belongs to the function or is a debugging
    # leftover.
    exit()
def plot_imputation_scatter(self, test=True, pca=False, color_by_library=True):
    """Draw one 2-D scatter per system for the original, corrupted and
    imputed data, and register the figure on this object.

    Parameters
    ----------
    test : bool
        Use the ``*_test`` attributes of the posteriors when true, the
        ``*_train`` attributes otherwise.
    pca : bool
        Project with ``fast_pca`` when true, with ``fast_tsne`` otherwise.
    color_by_library : bool
        When true, points are coloured by the row sum of the normalised
        values; otherwise colour and marker come from ``labels``.

    Returns
    -------
    Whatever ``self._log`` returns for the timing message.

    Notes
    -----
    With more than 5000 rows, 5000 are sampled after calling
    ``np.random.seed(5218)``, which reseeds NumPy's global generator.
    """
    start_time = time.time()
    data_type = 'test' if test else 'train'

    # One panel per posterior, plus the original and the corrupted data.
    n_system = len(self) + 2
    if n_system <= 5:
        nrow, ncol = 1, n_system
    else:
        nrow, ncol = 2, int(np.ceil(n_system / 2))

    # Shared inputs are all read from the first posterior.
    if test:
        X_org = self.posteriors[0].X_test_org
        X_crr = self.posteriors[0].X_test
        y = self.posteriors[0].y_test
    else:
        X_org = self.posteriors[0].X_train_org
        X_crr = self.posteriors[0].X_train
        y = self.posteriors[0].y_train
    labels = self.posteriors[0].labels
    is_binary_classes = self.posteriors[0].is_binary_classes

    imputed_attr = 'V_test' if test else 'V_train'
    allV = [X_org, X_crr]
    allV += [getattr(pos, imputed_attr) for pos in self.posteriors]
    assert X_org.shape == X_crr.shape and all(v.shape == X_org.shape
                                              for v in allV)

    all_names = ["[%s]Original" % data_type, "[%s]Corrupted" % data_type]
    all_names = all_names + [i.short_id_lines for i in self.posteriors]

    # Cap the number of plotted rows at 5000, using the same rows everywhere.
    if len(X_org) > 5000:
        np.random.seed(5218)
        ids = np.random.permutation(X_org.shape[0])[:5000]
        allV = [v[ids] for v in allV]
        y = y[ids]

    # Turn y into integer class indices; non-binary labels are passed
    # through ProbabilisticEmbedding first.
    if not is_binary_classes:
        y = ProbabilisticEmbedding().fit_transform(y)
    y = np.argmax(y, axis=-1)

    # log-normalize everything
    allV = [log_norm(v) for v in allV]

    fig = plt.figure(figsize=(min(20, 5 * ncol) + 2, nrow * 5))
    for position, (name, v) in enumerate(zip(all_names, allV), start=1):
        ax = plt.subplot(nrow, ncol, position)
        # Row sums are taken before the projection replaces `v`.
        n = np.sum(v, axis=-1)
        if pca:
            v = fast_pca(v, n_components=2)
        else:
            v = fast_tsne(v, n_components=2)
        with catch_warnings_ignore(Warning):
            if color_by_library:
                plot_scatter(x=v,
                             val=n,
                             ax=ax,
                             size=8,
                             legend_enable=False,
                             grid=False,
                             title=name)
            else:
                # The legend is shown on the first panel only.
                plot_scatter(x=v,
                             color=[labels[i] for i in y],
                             marker=[labels[i] for i in y],
                             ax=ax,
                             size=8,
                             legend_enable=(position == 1),
                             grid=False,
                             title=name)

    with catch_warnings_ignore(Warning):
        plt.tight_layout()

    figure_name = 'imputation_scatter_%s_%s' % (
        'lib' if color_by_library else 'cell', data_type)
    self.add_figure(figure_name, fig)

    elapsed = ctext(time.time() - start_time, 'lightyellow')
    return self._log('plot_imputation_scatter[%s] %s(s)' %
                     (data_type, elapsed))
# PLDA scorer, fitted and evaluated on the current X_train / X_test.
scorer = ml.PLDA(
    n_phi=TV_DIM // 2,
    n_iter=12,
    centering=True,
    wccn=True,
    unit_length=True,
    random_state=5218,
)
scorer.fit(X=X_train, y=y_true['train'])
scorer.evaluate(X_test, y_true['test'], labels=labels)

# ---------- SVM scoring ---------- #
print(ctext("==== '%s'" % "Ivec SVM-scoring", 'cyan'))
scorer = ml.Scorer(wccn=True, lda=True, method='svm')
scorer.fit(X=X_train, y=y_true['train'])
scorer.evaluate(X_test, y_true['test'], labels=labels)

# =========================================================================== #
# Super-vector
# =========================================================================== #
# Element 1 of the train/test statistics is reduced to `args.tdim`
# components in one shared fast_pca call (algo='ppca').
X_train, X_test = ml.fast_pca(
    stats['train'][1],
    stats['test'][1],
    n_components=args.tdim,
    algo='ppca',
    random_state=5218,
)

# ---------- GMM scoring ---------- #
print(ctext("==== '%s'" % "Super-Vector GMM-scoring-ova", 'cyan'))
scorer = ml.GMMclassifier(
    strategy="ova",
    n_components=3,
    covariance_type='full',
    centering=True,
    wccn=True,
    unit_length=True,
    lda=False,
    concat=False,
)
scorer.fit(X=X_train, y=y_true['train'])
scorer.evaluate(X_test, y_true['test'], labels=labels)

# ---------- PLDA scoring ---------- #
print(ctext("==== '%s'" % "Super-Vector PLDA-scoring", 'cyan'))
scorer = ml.PLDA(
    n_phi=TV_DIM // 2,
    n_iter=12,
    centering=True,
    wccn=True,
    unit_length=True,
    random_state=5218,
)
scorer.fit(X=X_train, y=y_true['train'])
scorer.evaluate(X_test, y_true['test'], labels=labels)