Example #1
def dimension_reduction(*x, algo='pca', **kwargs):
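  """Reduce each input array to 2-D with PCA, t-SNE, or UMAP.

  Arrays that already have two feature columns are passed through
  unchanged; single-column inputs are rejected. Returns one array when a
  single input is given, otherwise the whole collection.
  """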
  algo = str(algo).lower()
  assert algo in ('pca', 'tsne', 'umap'), \
    "Unsupported algorithm: '%s'" % algo
  if x[0].shape[1] == 1:
    raise ValueError("No dimension reduction for input with shape: %s" %
                     str(x[0].shape))
  elif x[0].shape[1] == 2:
    pass
  elif algo == 'tsne':
    x = fast_tsne(*x,
                  n_components=2,
                  perplexity=30.0,
                  learning_rate=200,
                  n_iter=1000,
                  random_state=1234,
                  n_jobs=8,
                  **kwargs)
  elif algo == 'pca':
    x = fast_pca(*x, n_components=2, random_state=1234, **kwargs)
  else:
    x = fast_umap(*x, random_state=1234, **kwargs)
  if len(x) == 1:
    return x[0]
  return x
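
A minimal usage sketch for the helper above, assuming fast_pca, fast_tsne, and fast_umap are in scope (e.g. imported from odin.ml as in the later examples):

import numpy as np

X = np.random.randn(100, 20)
Y = np.random.randn(50, 20)
X2d = dimension_reduction(X)          # single input -> single 2-D array
X2d, Y2d = dimension_reduction(X, Y)  # two inputs -> one projection each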
Example #2
def extract_pca(p_train, p_test):
    # p_train, p_test : lists of output and latent distributions for the
    # train and test sets; fast_pca returns [train_2d, test_2d], so [-1]
    # keeps only the test projection
    pca = [
        fast_pca(squeeze(train.mean()), squeeze(test.mean()),
                 n_components=2)[-1] for train, test in zip(p_train, p_test)
        if train.event_shape[0] > 1
    ]
    return pca
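
A hypothetical call for the helper above, assuming the inputs are tensorflow_probability distributions (anything exposing .mean() and a non-scalar event_shape) and that squeeze and fast_pca are already in scope:

import tensorflow as tf
import tensorflow_probability as tfp

# one distribution per model layer; the shapes here are illustrative
p_train = [tfp.distributions.MultivariateNormalDiag(loc=tf.zeros([100, 10]))]
p_test = [tfp.distributions.MultivariateNormalDiag(loc=tf.zeros([50, 10]))]
pca = extract_pca(p_train, p_test)  # -> list of 2-D test projections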
Example #3
    batch_loss = K.eval(loss, feed_dict={X: X_train[start:end]},
                        update_after=update_ops)
    prog.add(end - start)
    train_losses.append(batch_loss)
  # ====== training log ====== #
  print(ctext("[Epoch %d]" % epoch, 'yellow'),
        '%.2f(s)' % (timeit.default_timer() - start_time))
  print("[Training set] Loss: %.4f" % np.mean(train_losses))
  # ====== validation set ====== #
  code_samples, valid_loss = K.eval([Z, loss], feed_dict={X: X_valid})
  print("[Valid set]    Loss: %.4f" % valid_loss)
  # ====== record the history ====== #
  record_train_loss.append(np.mean(train_losses))
  record_valid_loss.append(valid_loss)
  # ====== plotting ====== #
  if args.dim > 2:
    code_samples = ml.fast_pca(code_samples, n_components=2,
                               random_state=K.get_rng().randint(int(10e8)))
  img_samples = f_samples()
  img_mean = f_X(X_valid[:25])

  V.plot_figure(nrow=3, ncol=12)

  ax = plt.subplot(1, 3, 1)
  ax.scatter(code_samples[:, 0], code_samples[:, 1], s=2, c=y_valid, alpha=0.3)
  ax.set_title('Epoch %d' % epoch)
  ax.set_aspect('equal', 'box')
  ax.axis('off')

  ax = plt.subplot(1, 3, 2)
  ax.imshow(V.tile_raster_images(img_samples), cmap=plt.cm.Greys_r)
  ax.axis('off')
Example #5
 scorer = ml.PLDA(n_phi=TV_DIM // 2, n_iter=12,
                  centering=True, wccn=True, unit_length=True,
                  random_state=5218)
 scorer.fit(X=X_train, y=y_true['train'])
 scorer.evaluate(X_test, y_true['test'], labels=labels)
 # ====== svm scoring ====== #
 print(ctext("==== '%s'" % "Ivec SVM-scoring", 'cyan'))
 scorer = ml.Scorer(wccn=True, lda=True, method='svm')
 scorer.fit(X=X_train, y=y_true['train'])
 scorer.evaluate(X_test, y_true['test'], labels=labels)
 # ===========================================================================
 # Super-vector
 # ===========================================================================
 X_train = stats['train'][1]
 X_test = stats['test'][1]
 X_train, X_test = ml.fast_pca(X_train,
                               X_test,
                               n_components=args.tdim,
                               algo='ppca',
                               random_state=5218)
 # ====== GMM scoring ====== #
 print(ctext("==== '%s'" % "Super-Vector GMM-scoring-ova", 'cyan'))
 scorer = ml.GMMclassifier(strategy="ova",
                           n_components=3,
                           covariance_type='full',
                           centering=True,
                           wccn=True,
                           unit_length=True,
                           lda=False,
                           concat=False)
 scorer.fit(X=X_train, y=y_true['train'])
 scorer.evaluate(X_test, y_true['test'], labels=labels)
 # ====== plda scoring ====== #
Example #6
import os

# these must be set before TensorFlow is imported to take effect
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

import numpy as np
import tensorflow as tf
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

from odin import ml
from odin import visual as vs

tf.random.set_seed(8)
np.random.seed(8)

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_umap = ml.fast_umap(X_train, X_test)
X_tsne = ml.fast_tsne(X_train, X_test)
X_pca = ml.fast_pca(X_train, X_test, n_components=2)
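# each fast_* helper above returns one low-dimensional projection per input
# array, indexed below as [0] (train) and [1] (test)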

styles = dict(size=12, alpha=0.6, centroids=True)

vs.plot_figure(6, 12)
vs.plot_scatter(x=X_pca[0], color=y_train, ax=(1, 2, 1), **styles)
vs.plot_scatter(x=X_pca[1], color=y_test, ax=(1, 2, 2), **styles)

vs.plot_figure(6, 12)
vs.plot_scatter(x=X_tsne[0], color=y_train, ax=(1, 2, 1), **styles)
vs.plot_scatter(x=X_tsne[1], color=y_test, ax=(1, 2, 2), **styles)

vs.plot_figure(6, 12)
vs.plot_scatter(x=X_umap[0], color=y_train, ax=(1, 2, 1), **styles)
vs.plot_scatter(x=X_umap[1], color=y_test, ax=(1, 2, 2), **styles)
Example #7
def plot_epoch(task):
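    # plots the latent spaces, the data-vs-reconstruction PCA, count-sum
    # statistics, and per-sample mean/stdev series, then saves the figure
    # for the current epoch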
    if task is None:
        curr_epoch = 0
    else:
        curr_epoch = task.curr_epoch
        if not (curr_epoch < 5 or curr_epoch % 5 == 0):
            return
    rand = np.random.RandomState(seed=1234)

    X, y = X_test, y_test
    n_data = X.shape[0]
    Z = f_z(X)
    W, W_stdev_mcmc, W_stdev_analytic = f_w(X)

    X_pca, W_pca_1 = fast_pca(X,
                              W,
                              n_components=2,
                              random_state=rand.randint(int(10e8)))
    W_pca_2 = fast_pca(W, n_components=2, random_state=rand.randint(int(10e8)))
    X_count_sum = np.sum(X, axis=tuple(range(1, X.ndim)))
    W_count_sum = np.sum(W, axis=-1)

    n_visual_samples = 8
    nrow = 13 + n_visual_samples * 3
    V.plot_figure(nrow=int(nrow * 1.8), ncol=18)
    with V.plot_gridSpec(nrow=nrow + 3, ncol=6, hspace=0.8) as grid:
        # plot the latent space
        for i, (z, name) in enumerate(zip(Z, Z_names)):
            if z.shape[1] > 2:
                z = fast_pca(z,
                             n_components=2,
                             random_state=rand.randint(int(10e8)))
            ax = V.subplot(grid[:3, (i * 2):(i * 2 + 2)])
            V.plot_scatter(x=z[:, 0],
                           y=z[:, 1],
                           color=y,
                           marker=y,
                           n_samples=4000,
                           ax=ax,
                           legend_enable=False,
                           legend_ncol=n_classes)
            ax.set_title(name, fontsize=12)
        # plot the reconstruction
        for i, (x, name) in enumerate(
                zip([X_pca, W_pca_1, W_pca_2], [
                    'Original data', 'Reconstruction',
                    'Reconstruction (separated PCA)'
                ])):
            ax = V.subplot(grid[3:6, (i * 2):(i * 2 + 2)])
            V.plot_scatter(x=x[:, 0],
                           y=x[:, 1],
                           color=y,
                           marker=y,
                           n_samples=4000,
                           ax=ax,
                           legend_enable=i == 1,
                           legend_ncol=n_classes,
                           title=name)
        # plot the reconstruction count sum
        for i, (x, count_sum, name) in enumerate(
                zip([X_pca, W_pca_1], [X_count_sum, W_count_sum], [
                    'Original data (Count-sum)', 'Reconstruction (Count-sum)'
                ])):
            ax = V.subplot(grid[6:10, (i * 3):(i * 3 + 3)])
            V.plot_scatter(x=x[:, 0],
                           y=x[:, 1],
                           val=count_sum,
                           n_samples=2000,
                           marker=y,
                           ax=ax,
                           size=8,
                           legend_enable=i == 0,
                           legend_ncol=n_classes,
                           title=name,
                           colorbar=True,
                           fontsize=10)
        # plot the count-sum series
        count_sum_observed = np.sum(X, axis=0).ravel()
        count_sum_expected = np.sum(W, axis=0)
        count_sum_stdev_explained = np.sum(W_stdev_mcmc, axis=0)
        count_sum_stdev_total = np.sum(W_stdev_analytic, axis=0)
        for i, kws in enumerate([
                dict(xscale='linear', yscale='linear', sort_by=None),
                dict(xscale='linear', yscale='linear', sort_by='expected'),
                dict(xscale='log', yscale='log', sort_by='expected')
        ]):
            ax = V.subplot(grid[10:10 + 3, (i * 2):(i * 2 + 2)])
            V.plot_series_statistics(count_sum_observed,
                                     count_sum_expected,
                                     explained_stdev=count_sum_stdev_explained,
                                     total_stdev=count_sum_stdev_total,
                                     fontsize=8,
                                     title="Count-sum" if i == 0 else None,
                                     **kws)
        # plot the mean and variances
        curr_grid_index = 13
        ids = rand.permutation(n_data)
        ids = ids[:n_visual_samples]
        for i in ids:
            observed, expected, stdev_explained, stdev_total = \
                X[i], W[i], W_stdev_mcmc[i], W_stdev_analytic[i]
            observed = observed.ravel()
            for j, kws in enumerate([
                    dict(xscale='linear', yscale='linear', sort_by=None),
                    dict(xscale='linear', yscale='linear', sort_by='expected'),
                    dict(xscale='log', yscale='log', sort_by='expected')
            ]):
                ax = V.subplot(grid[curr_grid_index:curr_grid_index + 3,
                                    (j * 2):(j * 2 + 2)])
                V.plot_series_statistics(observed,
                                         expected,
                                         explained_stdev=stdev_explained,
                                         total_stdev=stdev_total,
                                         fontsize=8,
                                         title="Test Sample #%d" %
                                         i if j == 0 else None,
                                         **kws)
            curr_grid_index += 3
    V.plot_save(os.path.join(FIGURE_PATH, 'latent_%d.png' % curr_epoch),
                dpi=200,
                log=True)
Example #8
    def plot_imputation_scatter(self,
                                test=True,
                                pca=False,
                                color_by_library=True):
        start_time = time.time()
        n_system = len(self) + 2  # add the original and the corrupted
        data_type = 'test' if test else 'train'

        if n_system <= 5:
            nrow = 1
            ncol = n_system
        else:
            nrow = 2
            ncol = int(np.ceil(n_system / 2))

        X_org = self.posteriors[0].X_test_org if test else self.posteriors[
            0].X_train_org
        X_crr = self.posteriors[0].X_test if test else self.posteriors[
            0].X_train
        y = self.posteriors[0].y_test if test else self.posteriors[0].y_train
        labels = self.posteriors[0].labels
        is_binary_classes = self.posteriors[0].is_binary_classes
        allV = [X_org, X_crr] + [
            pos.V_test if test else pos.V_train for pos in self.posteriors
        ]
        assert X_org.shape == X_crr.shape and all(v.shape == X_org.shape
                                                  for v in allV)
        all_names = ["[%s]Original" % data_type,
                     "[%s]Corrupted" % data_type
                     ] + [i.short_id_lines for i in self.posteriors]

        # subsample to at most 5000 points to keep the plots readable
        if len(X_org) > 5000:
            np.random.seed(5218)
            ids = np.random.permutation(X_org.shape[0])[:5000]
            allV = [v[ids] for v in allV]
            y = y[ids]

        if is_binary_classes:
            y = np.argmax(y, axis=-1)
        else:
            y = ProbabilisticEmbedding().fit_transform(y)
            y = np.argmax(y, axis=-1)

        # log-normalize everything
        allV = [log_norm(v) for v in allV]

        fig = plt.figure(figsize=(min(20, 5 * ncol) + 2, nrow * 5))
        for idx, (name, v) in enumerate(zip(all_names, allV)):
            ax = plt.subplot(nrow, ncol, idx + 1)
            n = np.sum(v, axis=-1)
            v = fast_pca(v, n_components=2) if pca else fast_tsne(
                v, n_components=2)
            with catch_warnings_ignore(Warning):
                if color_by_library:
                    plot_scatter(x=v,
                                 val=n,
                                 ax=ax,
                                 size=8,
                                 legend_enable=False,
                                 grid=False,
                                 title=name)
                else:
                    plot_scatter(x=v,
                                 color=[labels[i] for i in y],
                                 marker=[labels[i] for i in y],
                                 ax=ax,
                                 size=8,
                                 legend_enable=idx == 0,
                                 grid=False,
                                 title=name)

        with catch_warnings_ignore(Warning):
            plt.tight_layout()
        self.add_figure(
            'imputation_scatter_%s_%s' %
            ('lib' if color_by_library else 'cell', data_type), fig)
        return self._log(
            'plot_imputation_scatter[%s] %s(s)' %
            (data_type, ctext(time.time() - start_time, 'lightyellow')))
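
A hypothetical invocation of the method above; the enclosing analyzer class is not shown in this snippet, so `analyzer` below is a stand-in for an already-fitted instance:

analyzer.plot_imputation_scatter(test=True, pca=False, color_by_library=True)
analyzer.plot_imputation_scatter(test=True, pca=True, color_by_library=False)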
Example #9
 scorer = ml.PLDA(n_phi=TV_DIM // 2, n_iter=12,
                  centering=True, wccn=True, unit_length=True,
                  random_state=5218)
 scorer.fit(X=X_train, y=y_true['train'])
 scorer.evaluate(X_test, y_true['test'], labels=labels)
 # ====== svm scoring ====== #
 print(ctext("==== '%s'" % "Ivec SVM-scoring", 'cyan'))
 scorer = ml.Scorer(wccn=True, lda=True, method='svm')
 scorer.fit(X=X_train, y=y_true['train'])
 scorer.evaluate(X_test, y_true['test'], labels=labels)
 # ===========================================================================
 # Super-vector
 # ===========================================================================
 X_train = stats['train'][1]
 X_test = stats['test'][1]
 X_train, X_test = ml.fast_pca(X_train, X_test, n_components=args.tdim,
                               algo='ppca', random_state=5218)
 # ====== GMM scoring ====== #
 print(ctext("==== '%s'" % "Super-Vector GMM-scoring-ova", 'cyan'))
 scorer = ml.GMMclassifier(strategy="ova",
                           n_components=3, covariance_type='full',
                           centering=True, wccn=True, unit_length=True,
                           lda=False, concat=False)
 scorer.fit(X=X_train, y=y_true['train'])
 scorer.evaluate(X_test, y_true['test'], labels=labels)
 # ====== plda scoring ====== #
 print(ctext("==== '%s'" % "Super-Vector PLDA-scoring", 'cyan'))
 scorer = ml.PLDA(n_phi=TV_DIM // 2, n_iter=12,
                  centering=True, wccn=True, unit_length=True,
                  random_state=5218)
 scorer.fit(X=X_train, y=y_true['train'])
 scorer.evaluate(X_test, y_true['test'], labels=labels)
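
For reference, a self-contained sketch of the fit/evaluate pattern used throughout this example; the synthetic data, shapes, and label names are assumptions standing in for the real i-vector features:

import numpy as np
from odin import ml

X_train = np.random.randn(200, 100)
X_test = np.random.randn(80, 100)
y_train = np.random.randint(0, 5, size=200)
y_test = np.random.randint(0, 5, size=80)
labels = ['class%d' % i for i in range(5)]

scorer = ml.PLDA(n_phi=50, n_iter=12,
                 centering=True, wccn=True, unit_length=True,
                 random_state=5218)
scorer.fit(X=X_train, y=y_train)
scorer.evaluate(X_test, y_test, labels=labels)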