Code Example #1
    def plot_latents_binary_scatter(self, test=True, pca=False):
        start_time = time.time()
        data_type = 'test' if test else 'train'

        if len(self) <= 4:
            nrow = 1
            ncol = len(self)
        else:
            nrow = 2
            ncol = int(np.ceil(len(self) / 2))

        fig = plt.figure(figsize=(min(20, 5 * ncol) + 2, nrow * 5))

        for idx, pos in enumerate(self):
            ax = plt.subplot(nrow, ncol, idx + 1)
            with catch_warnings_ignore(Warning):
                pos.plot_latents_binary_scatter(
                    test=test,
                    ax=ax,
                    legend=True if idx == 0 else False,
                    pca=pca)

        with catch_warnings_ignore(Warning):
            plt.tight_layout()
        self.add_figure('latents_scatter_%s' % data_type, fig)
        return self._log(
            'plot_latents_binary_scatter[%s] %s(s)' %
            (data_type, ctext(time.time() - start_time, 'lightyellow')))
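
A note on the helper itself: every example on this page wraps noisy calls in `catch_warnings_ignore` from `odin.utils`. Its source is not shown here, but a minimal sketch of such a helper, assuming it simply installs temporary "ignore" filters for the given warning categories, could look like this:

import warnings
from contextlib import contextmanager

@contextmanager
def catch_warnings_ignore(*categories):
    # Temporarily ignore the given warning categories; the previous
    # warning-filter state is restored when the with-block exits.
    with warnings.catch_warnings():
        for category in categories:
            warnings.simplefilter('ignore', category)
        yield

Most examples pass a single category, and Code Example #21 below passes two, which the *categories signature above accommodates.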
Code Example #2
File: __init__.py Project: trungnt13/sisua
def get_dataset(dataset_name, override=False, verbose=True) -> SingleCellOMIC:
  r""" Check `get_dataset_meta` for more information

  List of all available datasets: ['call', 'callall', 'mpal', 'mpalall',
    'mpalatac', '100yo', '8klyall', '8kmyall', '8kly', '8kmy', '8k',
    '8kall', 'ecclyall', 'eccly', 'eccmyall', 'eccmy', 'ecc', 'eccall',
    '8kx', '8kxall', 'eccx', 'eccxall', 'vdj1x', 'vdj1xall', 'vdj4x',
    'vdj4xall', 'mpalx', 'mpalxall', 'callx', 'callxall', 'pbmcciteseq',
    'cbmcciteseq', 'pbmc5000', 'facs7', 'facs5', 'facs2', 'pbmcscvi',
    'cortex', 'retina', 'hemato', 'vdj1', 'vdj1all', 'vdj2', 'vdj2all',
    'vdj3', 'vdj3all', 'vdj4', 'vdj4all', 'vdjhs3', 'vdjhs3all', 'vdjhs4',
    'vdjhs4all', 'neuron10k', 'neuron10kall', 'heart10k', 'heart10kall',
    'memoryt', 'memorytall', 'naivet', 'naivetall', 'regulatoryt',
    'regulatorytall', 'cd4t', 'cd4tall', '5k', '5kall', '18k', '18kall',
    '4k', '4kall', '10k', '10kall']

  Return:
    mRNA data : `SingleCellOMIC`
    label data: `SingleCellOMIC`. If label data is not available, then None

  Example:
    gene, prot = get_dataset("cortex")
    X_train, X_test = gene.split(0.8, seed=1234)
    y_train, y_test = prot.split(0.8, seed=1234)
    X_train.assert_matching_cells(y_train)
    X_test.assert_matching_cells(y_test)
  """
  data_meta = get_dataset_meta()
  # ====== special case: get all dataset ====== #
  dataset_name = str(dataset_name).lower().strip()
  if dataset_name not in data_meta:
    raise RuntimeError(
        'Cannot find dataset with name: "%s", all datasets include: %s' %
        (dataset_name, ", ".join(list(data_meta.keys()))))
  with catch_warnings_ignore(FutureWarning):
    ds = data_meta[dataset_name](override=override, verbose=verbose)
  # ******************** create SCO ******************** #
  if isinstance(ds, SingleCellOMIC):
    return ds
  # ******************** return ******************** #
  validating_dataset(ds)
  with catch_warnings_ignore(FutureWarning):
    sc = SingleCellOMIC(X=ds['X'],
                        cell_id=ds['X_row'],
                        gene_id=ds['X_col'],
                        name=dataset_name)
    if 'y' in ds:
      y = ds['y']
      if is_binary_dtype(y):
        sc.add_omic(OMIC.celltype, y, ds['y_col'])
      else:
        sc.add_omic(OMIC.proteomic, y, ds['y_col'])
  return sc
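
The error branch at the top of `get_dataset` is easy to exercise for illustration: an unknown name raises a `RuntimeError` whose message lists every valid key.

try:
    get_dataset('no-such-dataset')
except RuntimeError as e:
    print(e)  # Cannot find dataset with name: "no-such-dataset", ...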
Code Example #3
def _adjust(fig, title, pad=0.02):
    w, h = fig.get_figwidth(), fig.get_figheight()
    fig.set_size_inches(w=w, h=h + 5)
    if title is not None:
        fig.suptitle(title)
    with catch_warnings_ignore(UserWarning):
        fig.tight_layout(rect=[0.0, pad, 1.0, 1.0 - pad])
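
A usage sketch for `_adjust` (the figure and values are made up for illustration, and the imports `_adjust` relies on are assumed to be in scope): it grows the figure five inches taller, sets the suptitle, and reserves a `pad` margin at top and bottom so `tight_layout` does not overlap it.

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(6, 3))
plt.plot([0, 1], [1, 0])
_adjust(fig, title='Example title', pad=0.05)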
Code Example #4
File: helpers.py Project: imito/odin
def validate_features_dataset(output_dataset_path, ds_validation_path):
  ds = F.Dataset(output_dataset_path, read_only=True)
  print(ds)

  features = {}
  for key, val in ds.items():
    if 'indices_' in key:
      name = key.split('_')[-1]
      features[name] = (val, ds[name])

  all_indices = [val[0] for val in features.values()]
  # ====== sampling 250 files ====== #
  all_files = sampling_iter(it=all_indices[0].keys(), k=250,
                            seed=Config.SUPER_SEED)
  all_files = [f for f in all_files
               if all(f in ids for ids in all_indices)]
  print("#Samples:", ctext(len(all_files), 'cyan'))

  # ====== ignore the 20-figures warning ====== #
  with catch_warnings_ignore(RuntimeWarning):
    for file_name in all_files:
      X = {}
      for feat_name, (ids, data) in features.items():
        start, end = ids[file_name]
        X[feat_name] = data[start:end][:].astype('float32')
      V.plot_multiple_features(features=X, fig_width=20,
            title='[%s]%s' % (ds['dsname'][file_name], file_name))

  V.plot_save(ds_validation_path, dpi=12)
Code Example #5
    def test_normalization(self):
        ds = get_dataset('8kmy')
        # ignore overflow warning
        with catch_warnings_ignore(RuntimeWarning):
            ds1 = ds.expm1(omic=OMIC.transcriptomic, inplace=False)
            ds2 = ds.expm1(omic=OMIC.proteomic, inplace=False)
            self.assertTrue(np.all(np.expm1(ds.X) == ds1.X))
            self.assertTrue(
                np.all(
                    np.expm1(ds.numpy(OMIC.proteomic)) == ds2.numpy(
                        OMIC.proteomic)))

        ds1 = ds.normalize(OMIC.transcriptomic,
                           inplace=False,
                           log1p=True,
                           scale=False,
                           total=False)
        ds2 = ds.normalize(OMIC.proteomic,
                           inplace=False,
                           log1p=True,
                           scale=False,
                           total=False)
        self.assertTrue(
            np.all(ds1.numpy(OMIC.transcriptomic) == np.log1p(ds.X)))
        self.assertTrue(
            np.all(ds1.numpy(OMIC.proteomic) == ds.numpy(OMIC.proteomic)))
        self.assertTrue(
            np.all(
                ds2.numpy(OMIC.proteomic) == np.log1p(ds.numpy(
                    OMIC.proteomic))))
        self.assertTrue(
            np.all(
                ds2.numpy(OMIC.transcriptomic) == ds.numpy(
                    OMIC.transcriptomic)))
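
The round-trips asserted above hold because `np.expm1` and `np.log1p` are exact inverses, and both stay accurate for the near-zero values common in sparse count matrices. A standalone check:

import numpy as np

x = np.array([0.0, 1e-12, 3.0, 1e4])
assert np.allclose(np.expm1(np.log1p(x)), x)
# naive log(1 + x) loses a tiny count entirely, log1p keeps it:
print(np.log(1 + 1e-20), np.log1p(1e-20))  # 0.0 vs 1e-20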
Code Example #6
File: histogram_plot.py Project: trungnt13/odin-ai
def _fit_mapping(x: np.ndarray, y: np.ndarray, n_bins: int):
    from odin.utils import catch_warnings_ignore
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import KBinsDiscretizer
    from odin.ml.gmm_classifier import GMMclassifier
    assert x.ndim == 1 and y.ndim == 1
    x = x[:, np.newaxis]
    # already discrete labels, and the number of bins is enough
    if np.all(y == y.astype(np.int32)) and len(np.unique(y)) <= n_bins:
        n_bins = len(np.unique(y))
        model = GMMclassifier(strategy='all',
                              n_components=2,
                              covariance_type='full',
                              n_init=5,
                              random_state=1)
        model.fit(x, y)
    else:
        y = KBinsDiscretizer(n_bins=int(n_bins),
                             encode='ordinal',
                             strategy='uniform').fit_transform(y[:,
                                                                 np.newaxis])
        y = y.ravel().astype(np.int64)
        with catch_warnings_ignore(UserWarning):
            model = GridSearchCV(estimator=LogisticRegression(
                max_iter=500, solver='liblinear', random_state=1),
                                 cv=2,
                                 param_grid=dict(C=np.linspace(0.5, 5, num=5)))
            model.fit(x, y)
    return model, n_bins
Code Example #7
File: helpers.py Project: professorlust/odin-ai
def validate_features_dataset(output_dataset_path, ds_validation_path):
    ds = F.Dataset(output_dataset_path, read_only=True)
    print(ds)

    features = {}
    for key, val in ds.items():
        if 'indices_' in key:
            name = key.split('_')[-1]
            features[name] = (val, ds[name])

    all_indices = [val[0] for val in features.values()]
    # ====== sampling 250 files ====== #
    all_files = sampling_iter(it=all_indices[0].keys(),
                              k=250,
                              seed=Config.SUPER_SEED)
    all_files = [f for f in all_files if all(f in ids for ids in all_indices)]
    print("#Samples:", ctext(len(all_files), 'cyan'))

    # ====== ignore the 20-figures warning ====== #
    with catch_warnings_ignore(RuntimeWarning):
        for file_name in all_files:
            X = {}
            for feat_name, (ids, data) in features.items():
                start, end = ids[file_name]
                X[feat_name] = data[start:end][:].astype('float32')
            V.plot_multiple_features(features=X,
                                     fig_width=20,
                                     title='[%s]%s' %
                                     (ds['dsname'][file_name], file_name))

    V.plot_save(ds_validation_path, dpi=12)
Code Example #8
 def test_visualization_celltype(self):
     sco = get_dataset('cortex')
     for X, var_names, rank_genes, clustering, dendrogram in itertools.product(
         ('cell', 'tran'),
         (None, 10),
         (0, 3),
         ('kmeans', 'louvain', None),
         (True, False)):
         if X == 'cell' and rank_genes > 0:
             continue
         # check louvain available
         if clustering == 'louvain':
             try:
                 import louvain
             except ImportError:
                 continue
         # plotting
         with catch_warnings_ignore(ignore_warnings):
             sco.plot_heatmap(X=X,
                              groupby=OMIC.celltype,
                              var_names=var_names,
                              clustering=clustering,
                              rank_genes=rank_genes)
             sco.plot_dotplot(X=X,
                              groupby=OMIC.celltype,
                              var_names=var_names,
                              clustering=clustering,
                              rank_genes=rank_genes)
             sco.plot_stacked_violins(X=X,
                                      groupby=OMIC.celltype,
                                      var_names=var_names,
                                      clustering=clustering,
                                      rank_genes=rank_genes)
     sco.save_figures('/tmp/tmp2.pdf')
Code Example #9
File: cross_analyze.py Project: trungnt13/sisua
def _analyze(ds_name, model_path, outpath, y_true, all_proteins, verbose):
    from sisua.analysis import Posterior
    with open(model_path, 'rb') as f:
        infer = pickle.load(f)
    ds_infer = infer.configs['dataset']
    ds = [j for i, j in all_datasets if i == ds_name][0]

    # path is a folder
    path = os.path.join(
        outpath, 'data%s_model%s' %
        (ds_name.replace('_', '').upper(), ds_infer.replace('_', '').upper()))
    path = os.path.join(path, infer.short_id)
    if not os.path.exists(path):
        os.mkdir(path)

    # log start
    if verbose:
        print("\nData:%s - Model:%s" %
              (ctext(ds_name, 'yellow'), ctext(ds_infer, 'yellow')))
        print(" Outpath:", ctext(path, 'cyan'))

    # create a mixed Posterior
    pos = Posterior(infer, ds=ds)
    # many figures are created, so matplotlib may emit a RuntimeWarning
    # about the maximum number of open figures
    with catch_warnings_ignore(RuntimeWarning):
        # analysis
        pos.new_figure().plot_latents_binary_scatter(
            size=4).plot_latents_distance_heatmap(
            ).plot_correlation_marker_pairs()
        # protein series
        if infer.is_semi_supervised:
            y_pred = {
                i: j
                for i, j in zip(
                    dict(all_datasets)[ds_infer]['y_col'],
                    infer.predict_y(ds['X']).T) if i in all_proteins
            }
            y_pred = np.hstack(
                [y_pred[i][:, np.newaxis] for i in all_proteins])
            pos.plot_protein_predicted_series(y_true_new=y_true,
                                              y_pred_new=y_pred,
                                              labels_new=all_proteins)
            for prot_name in all_proteins:
                pos.plot_protein_scatter(protein_name=prot_name,
                                         y_true_new=y_true,
                                         y_pred_new=y_pred,
                                         labels_new=all_proteins)
        # save plot and show log
        pos.save_plots(path, dpi=80)
Code Example #10
File: latent_benchmarks.py Project: trungnt13/sisua
def clustering_scores(latent, labels, n_labels, prediction_algorithm='both'):
    """ Clustering Scores:

   * silhouette_score (higher is better, best is 1, worst is -1)
   * adjusted_rand_score (higher is better)
   * normalized_mutual_info_score (higher is better)
   * unsupervised_clustering_accuracy (higher is better)

  Note: the scores are returned as a dict with keys ASW, ARI, NMI and UCA.

  Parameters
  ----------
  labels : categorical labels (i.e. single classes or one-hot encoded)
  prediction_algorithm : {'knn', 'gmm', 'both'}
  """
    # simple normalization to 0-1, then pick the argmax
    if labels.ndim == 2:
        min_val = np.min(labels, axis=0, keepdims=True)
        max_val = np.max(labels, axis=0, keepdims=True)
        labels = (labels - min_val) / (max_val - min_val)
        labels = np.argmax(labels, axis=-1)

    if prediction_algorithm == 'knn':
        km = KMeans(n_labels, n_init=200, random_state=5218)
        labels_pred = km.fit_predict(latent)
    elif prediction_algorithm == 'gmm':
        gmm = GaussianMixture(n_labels, random_state=5218)
        gmm.fit(latent)
        labels_pred = gmm.predict(latent)
    elif prediction_algorithm == 'both':
        score1 = clustering_scores(latent,
                                   labels,
                                   n_labels=n_labels,
                                   prediction_algorithm='knn')
        score2 = clustering_scores(latent,
                                   labels,
                                   n_labels=n_labels,
                                   prediction_algorithm='gmm')
        return {k: (v + score2[k]) / 2 for k, v in score1.items()}
    else:
        raise ValueError("Not support for prediction_algorithm: '%s'" %
                         prediction_algorithm)
    #
    with catch_warnings_ignore(FutureWarning):
        asw_score = silhouette_score(latent, labels)
        ari_score = adjusted_rand_score(labels, labels_pred)
        nmi_score = normalized_mutual_info_score(labels, labels_pred)
        uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
    return dict(ASW=asw_score, ARI=ari_score, NMI=nmi_score, UCA=uca_score)
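
`unsupervised_clustering_accuracy` is not defined in this snippet. A common implementation (a sketch, not necessarily sisua's exact code) matches predicted clusters to true labels with the Hungarian algorithm and returns the accuracy together with the assignment, which is why the caller above indexes `[0]`:

import numpy as np
from scipy.optimize import linear_sum_assignment

def unsupervised_clustering_accuracy(y_true, y_pred):
    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    d = int(max(y_pred.max(), y_true.max())) + 1
    w = np.zeros((d, d), dtype=np.int64)
    for t, p in zip(y_true, y_pred):
        w[p, t] += 1  # contingency table: cluster p vs true label t
    rows, cols = linear_sum_assignment(-w)  # maximize matched counts
    return w[rows, cols].sum() / y_pred.size, list(zip(rows, cols))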
Code Example #11
File: latent_benchmarks.py Project: trungnt13/sisua
def multi_label_adj_Rindex(label_bin_true, label_pred):
    assert label_bin_true.ndim == 2
    assert label_bin_true.shape[1] == len(np.unique(label_pred))
    n_classes = label_bin_true.shape[1]
    with catch_warnings_ignore(Warning):
        scores = []
        for y in label_bin_true.T:
            y = y.astype('int32')
            s = max(
                adjusted_rand_score(labels_true=y,
                                    labels_pred=(
                                        label_pred == i).astype('int32'))
                for i in range(n_classes))
            scores.append(s)
    return scores
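
A synthetic sanity check for the function above, assuming `adjusted_rand_score` is imported at module level as in the other examples from this project. Perfect one-hot labels with matching cluster assignments should score 1.0 for every class:

import numpy as np

y_bin = np.eye(3, dtype='int32')[[0, 1, 2, 0, 1, 2]]  # one-hot ground truth
y_pred = np.array([0, 1, 2, 0, 1, 2])                 # matching clusters
print(multi_label_adj_Rindex(y_bin, y_pred))          # [1.0, 1.0, 1.0]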
Code Example #12
    def test_metrics(self):
        sco = get_dataset('8kmy')
        with catch_warnings_ignore(ConvergenceWarning):
            sco.rank_vars_groups(clustering='kmeans')
            sco.calculate_quality_metrics()
            with sco._swap_omic('prot'):
                sco.rank_vars_groups(clustering='kmeans')
                sco.calculate_quality_metrics()

            if _SCVI:
                sco = get_dataset('cortex')
                sco.rank_vars_groups(clustering='kmeans')
                sco.calculate_quality_metrics()
                with sco._swap_omic('cell'):
                    sco.rank_vars_groups(clustering='kmeans')
                    sco.calculate_quality_metrics()
Code Example #13
File: histogram_plot.py Project: tirkarthi/odin-ai
def _fit(x, y, n_bins):
    from sklearn.preprocessing import KBinsDiscretizer
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from odin.utils import catch_warnings_ignore
    x = x[:, np.newaxis]
    y = KBinsDiscretizer(n_bins=int(n_bins), encode='ordinal').fit_transform(
        y[:, np.newaxis]).ravel().astype(np.int64)
    with catch_warnings_ignore(UserWarning):
        lr = GridSearchCV(estimator=LogisticRegression(max_iter=500,
                                                       solver='liblinear',
                                                       random_state=1234),
                          cv=2,
                          param_grid=dict(C=np.linspace(0.5, 5, num=5)))
        lr.fit(x, y)
    return lr
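
A minimal sketch of exercising `_fit` on synthetic data: a continuous target is discretized into 4 ordinal bins and a small grid search over `C` is run, returning a fitted `GridSearchCV`:

import numpy as np

rng = np.random.RandomState(1)
x = rng.randn(300)
y = 2.0 * x + 0.1 * rng.randn(300)  # continuous target to be binned
lr = _fit(x, y, n_bins=4)
print(lr.best_params_)              # e.g. {'C': 0.5}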
Code Example #14
File: analyze.py Project: imito/odin
  def _report(y_p, y_t, pad=''):
    with catch_warnings_ignore(Warning):
      z_ = np.concatenate(y_p, axis=0)
      z = np.concatenate(y_t, axis=0)
      print(pad, '*** %s ***' % ctext('Frame-level', 'lightcyan'))
      print(pad, "#Samples:", ctext(len(z), 'cyan'))
      print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
      print(pad, "Accuracy:", accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

      z_ = np.concatenate([np.mean(i, axis=0, keepdims=True) for i in y_p],
                          axis=0)
      z = np.array([i[0] for i in y_t])
      print(pad, '*** %s ***' % ctext('Utterance-level', 'lightcyan'))
      print(pad, "#Samples:", ctext(len(z), 'cyan'))
      print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
      print(pad, "Accuracy:", accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))
Code Example #15
File: evaluate.py Project: medical-projects/sisua
def robust_run(method_name, log_text, fn, *args, **kwargs):
    r""" Run an evaluation function and catch exception without interupting the
  execution """
    assert callable(fn)
    with catch_warnings_ignore(UserWarning):
        try:
            fn(*args, **kwargs)
        except Exception as e:
            text = StringIO()
            traceback.print_exception(*sys.exc_info(),
                                      limit=None,
                                      file=text,
                                      chain=True)
            text.seek(0)
            text = text.read().strip()
            text += f"\n{e}"
            SE.write_error(traceback=text,
                           method_name=method_name,
                           config=log_text)
Code Example #16
 def prepare(self):
     with catch_warnings_ignore(RuntimeWarning):
         sco = get_dataset('cortex')
         om1, om2 = sco.omics
         train, test = sco.split(train_percent=0.8, seed=1)
         n_gene = sco.numpy(om1).shape[1]
         n_prot = sco.numpy(om2).shape[1]
         rvs = [
             RandomVariable(n_gene, 'zinbd', om1.name),
             RandomVariable(n_prot, 'onehot', om2.name)
         ]
         all_models = [
             DeepCountAutoencoder, SCALE, SCVI, VariationalAutoEncoder
         ]
         all_configs = [
             NetworkConfig(),
             NetworkConfig(pyramid=True),
             NetworkConfig(use_conv=True),
             NetworkConfig(pyramid=True, use_conv=True)
         ]
         return train, test, rvs, all_models, all_configs
Code Example #17
File: metrics.py Project: tirkarthi/odin-ai
def _clustering_scores(y, X=None, z=None, algo='kmeans', random_state=1):
    n_factors = len(np.unique(y))
    if z is None:
        if algo == 'kmeans':
            model = KMeans(n_factors, n_init=200, random_state=random_state)
        elif algo == 'gmm':
            model = GaussianMixture(n_factors, random_state=random_state)
        elif algo in ('both', 'avg', 'avr', 'average', 'mean'):
            score1 = _clustering_scores(X=X,
                                        y=y,
                                        z=z,
                                        algo='kmeans',
                                        random_state=random_state)
            score2 = _clustering_scores(X=X,
                                        y=y,
                                        z=z,
                                        algo='gmm',
                                        random_state=random_state)
            return {k: (v + score2[k]) / 2 for k, v in score1.items()}
        else:
            raise ValueError("Not support for prediction_algorithm: '%s'" %
                             algo)
        # the scores
        y_pred = model.fit_predict(X)
    else:
        z = z.ravel()
        assert z.shape[0] == y.shape[0], \
          f"predictions must have shape: {y.shape}, but given: {z.shape}"
        y_pred = z
    with catch_warnings_ignore(FutureWarning):
        return dict(
            ASW=silhouette_score(
                X if X is not None else np.expand_dims(z, axis=-1), y),
            ARI=adjusted_rand_score(y, y_pred),
            NMI=normalized_mutual_info_score(y, y_pred),
            UCA=_unsupervised_clustering_accuracy(y, y_pred)[0],
            HOS=homogeneity_score(y, y_pred),
            COS=_cluster_completeness_score(y, y_pred),
        )
Code Example #18
File: analyze.py Project: trungnt13/odin-ai
    def _report(y_p, y_t, pad=''):
        with catch_warnings_ignore(Warning):
            z_ = np.concatenate(y_p, axis=0)
            z = np.concatenate(y_t, axis=0)
            print(pad, '*** %s ***' % ctext('Frame-level', 'lightcyan'))
            print(pad, "#Samples:", ctext(len(z), 'cyan'))
            print(pad, "Log loss:", log_loss(y_true=z,
                                             y_pred=z_,
                                             labels=labels))
            print(pad, "Accuracy:",
                  accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

            z_ = np.concatenate(
                [np.mean(i, axis=0, keepdims=True) for i in y_p], axis=0)
            z = np.array([i[0] for i in y_t])
            print(pad, '*** %s ***' % ctext('Utterance-level', 'lightcyan'))
            print(pad, "#Samples:", ctext(len(z), 'cyan'))
            print(pad, "Log loss:", log_loss(y_true=z,
                                             y_pred=z_,
                                             labels=labels))
            print(pad, "Accuracy:",
                  accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))
Code Example #19
# ===========================================================================
# ====== basic path ====== #
output_dataset_path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)

processor_log_path = os.path.join(EXP_DIR, 'processor_%s.log' % FEATURE_RECIPE)
if os.path.exists(processor_log_path):
    os.remove(processor_log_path)
print("Log path:", ctext(processor_log_path, 'cyan'))

ds_validation_path = os.path.join(EXP_DIR, 'validate_%s.pdf' % FEATURE_RECIPE)
if os.path.exists(ds_validation_path):
    os.remove(ds_validation_path)
print("Validation path:", ctext(ds_validation_path, 'cyan'))

# ====== running the processing ====== #
with catch_warnings_ignore(Warning):
    processor = pp.FeatureProcessor(jobs=ALL_FILES,
                                    path=output_dataset_path,
                                    extractor=recipe,
                                    n_cache=320,
                                    ncpu=NCPU,
                                    override=True,
                                    identifier='name',
                                    log_path=processor_log_path,
                                    stop_on_failure=False)
    processor.run()
# ===========================================================================
# Make some visualization
# ===========================================================================
validate_features_dataset(output_dataset_path, ds_validation_path)
Code Example #20
 def test_save_load_2(self):
     r""" Load and train the model """
     print("*** Test loading model ***")
     from matplotlib import pyplot as plt
     from odin import visual as vs
     import seaborn as sns
     sns.set()
     #
     train, test, rvs, models, configs = self.prepare()
     for (MODEL, network, is_semi, path, log_path, pca_path, stat_path,
          hist_path) in model_iteration(models, configs):
         with open(log_path, 'rb') as f:
             log = pickle.load(f)
         model = load(path)
         with catch_warnings_ignore(UserWarning):
             # test statistics
             plt.figure(figsize=(12, 5))
             text_train, p_train, zmean_train, zvar_train = predict2info(
                 model, train)
             text_test, p_test, zmean_test, zvar_test = predict2info(
                 model, test)
             # check latent mean and variance
             zmean_train1, zmean_test1 = log['zmean']
             zvar_train1, zvar_test1 = log['zvar']
             self.assertTrue(np.allclose(zmean_train, zmean_train1))
             self.assertTrue(np.allclose(zmean_test, zmean_test1))
             self.assertTrue(np.allclose(zvar_train, zvar_train1))
             self.assertTrue(np.allclose(zvar_test, zvar_test1))
             # plotting
             plt.subplot(1, 2, 1)
             plt.plot(tf.math.log(text_train), label='Loaded')
             plt.plot(tf.math.log(log['predict_train']), label='Saved')
             plt.title("Train")
             plt.legend()
             plt.subplot(1, 2, 2)
             plt.plot(tf.math.log(text_test), label='Loaded')
             plt.plot(tf.math.log(log['predict_test']), label='Saved')
             plt.title("Test")
             plt.legend()
             plt.tight_layout()
             vs.plot_save(stat_path, dpi=120, clear_all=True, log=True)
             # test pca
             pca = extract_pca(p_train, p_test)
             plt.figure(figsize=(8, 3 * len(pca)))
             for i, (dist, old,
                     new) in enumerate(zip(p_train, log['pca'], pca)):
                 assert old.shape == new.shape
                 plt.subplot(len(pca), 2, i * 2 + 1)
                 plt.scatter(old[:, 0], old[:, 1], s=4)
                 if i == 0:
                     plt.title('Saved')
                 plt.ylabel(dist.name)
                 #
                 plt.subplot(len(pca), 2, i * 2 + 2)
                 plt.scatter(new[:, 0], new[:, 1], s=4)
                 if i == 0:
                     plt.title('Loaded')
             plt.tight_layout()
             vs.plot_save(pca_path, dpi=120, clear_all=True, log=True)
             #
             model.fit(train, epochs=2, verbose=False)
             model.plot_learning_curves()
             model.save_figures(hist_path)
Code Example #21
 def _initialize(self):
     scm = self.scm
     sco = self.sco_corrupted
     outputs, latents = scm.predict(
         sco.create_dataset(self.scm.output_layers[0].name,
                            batch_size=self.batch_size,
                            shuffle=0,
                            drop_remainder=False),
         sample_shape=self.sample_shape,
         verbose=self.verbose,
     )
     # infer output OMICs
     dim2omic = defaultdict(list)
     for om in self.input_omics:
         dim2omic[self.sco_original.get_dim(om)].append(om)
     for o in tf.nest.flatten(outputs):
         assert isinstance(o, tfd.Distribution), \
           f"SingleCellModel must output Distribution but return {o}"
         name = o.name
         try:
             om = OMIC.parse(name)
         except Exception:
             om = None
         if om is None:
             oms = dim2omic[o.event_shape[0]]
             if len(oms) > 1:
                 raise RuntimeError(
                     f"Cannot infer OMIC type for output {o}")
             om = oms[0]
         self.output_omics.append(om.name)
     # variables' description
     self._n_latents = len(tf.nest.flatten(latents))
     self._n_outputs = len(tf.nest.flatten(outputs))
     ## default inputs
     for om in self.input_omics:
         self.omics_data[(om, 'corrupted')] = sco.get_omic(om)
     # latent is the same for all
     self.omics_data[(OMIC.latent.name,
                      'corrupted')] = tf.nest.flatten(latents)
     # infer if the distribution is imputed
     for l, o in zip(scm.output_layers, tf.nest.flatten(outputs)):
         self.omics_data[(l.name, 'reconstructed')] = o
         is_independent = 0
         if isinstance(o, tfd.Independent):
             is_independent = o.reinterpreted_batch_ndims
             o = o.distribution
         if isinstance(o, tfd.ZeroInflated):
             o = o.count_distribution
         if is_independent > 0:
             o = tfd.Independent(o,
                                 reinterpreted_batch_ndims=is_independent)
         self.omics_data[(l.name, 'imputed')] = o
     ### create the SingleCellOMIC dataset for analysis
     sco = self.sco_original.copy()
     for om in self.input_omics:
         if (om, 'imputed') in self.omics_data:
             data_type = 'imputed'
         elif (om, 'reconstructed') in self.omics_data:
             data_type = 'reconstructed'
         else:
             continue
         data = self.omics_data[(om, data_type)]
         om_new = OMIC.parse(f'i{om}')
         # prepare the new data
         if isinstance(data, tfd.Distribution):
             data = data.mean().numpy()
             if data.ndim == 3:
                 data = np.mean(data, axis=0)
         # find the variable's names
         if om in self.scm.metadata:
             var_names = self.scm.metadata[om]
         else:
             var_names = np.array(
                 [f'{om}{i}' for i in range(data.shape[1])])
         sco.add_omic(omic=om_new, X=data, var_names=var_names)
     # add the latents
     Zs = self.omics_data[('latent', 'corrupted')]
     if len(Zs) > 1:
         means = [z.mean() for z in Zs]
         Zs = self.reduce_latents(means)
     else:
         Zs = Zs[0].mean()
     with catch_warnings_ignore(UserWarning, RuntimeWarning):
         sco.add_omic(omic=OMIC.latent,
                      X=Zs.numpy(),
                      var_names=np.array(
                          [f'Z{i}' for i in range(Zs.shape[1])]))
     # store the extracted SingleCellOMIC dataset
     self._dataset = sco
Code Example #22
    def get_criticizer(self,
                       factor_omic='proteomic',
                       latent_indices=None,
                       n_bins=5,
                       strategy='quantile') -> Criticizer:
        r""" Create a probabilistic criticizer for evaluating the latent codes of
    variational models.

    Arguments:
      factor_omic : instance of OMIC.
        which OMIC type be used as factors (or labels).
      n_bins : int (default=5)
        The number of bins to produce discretized factors.
      strategy : {'uniform', 'quantile', 'kmeans', 'gmm'}, (default='quantile')
        Strategy used to define the widths of the bins.
        uniform - All bins in each feature have identical widths.
        quantile - All bins in each feature have the same number of points.
        kmeans - Values in each bin have the same nearest center of a 1D
          k-means cluster.
    """
        sco = self.dataset
        assert factor_omic in sco.omics, \
          f"factor_omic='{factor_omic}' not found, available are: {sco.omics}"
        factor_omic = OMIC.parse(factor_omic)
        if latent_indices is None:
            key = f"{factor_omic.name}"
        else:
            name = '_'.join(f'{i:d}' for i in latent_indices)
            key = f"{factor_omic.name}{name}"
        # create the Criticizer
        if key not in self._criticizers:
            # check the factors is valid
            factors = sco.numpy(factor_omic)
            factor_names = sco.get_var_names(factor_omic)
            kw = dict(n_bins=int(n_bins), strategy=None)
            # one-hot encoded classes (each row sums to one)
            if np.all(np.sum(factors, axis=1) == 1):
                factors = np.argmax(factors, axis=1)[:, np.newaxis]
                factor_names = np.asarray([factor_omic.name])
            # continuous or discrete cases
            elif factor_omic in (OMIC.proteomic, OMIC.iproteomic, OMIC.pmhc,
                                 OMIC.ipmhc):
                kw['strategy'] = strategy
            # categorical factors
            elif factor_omic in (OMIC.progenitor, OMIC.iprogenitor,
                                 OMIC.celltype, OMIC.icelltype):
                pass
            # unknown factor
            else:
                warnings.warn(
                    f"No support for discretization of OMIC: {factor_omic}",
                    RuntimeWarning)
                return
            # only valid factors with > 1 classes
            ids = [len(np.unique(i)) > 1 for i in factors.T]
            if not any(ids):  # no valid factor found
                warnings.warn(f"Not a valid factor: {factor_omic.name}",
                              RuntimeWarning)
                return
            factors = factors[:, ids]
            factor_names = factor_names[ids]
            # create the criticizer
            crt = Criticizer(vae=self.scm,
                             latent_indices=latent_indices,
                             random_state=self.rand.randint(1e8))
            crt.factor_omic: OMIC = factor_omic
            with catch_warnings_ignore(UserWarning):
                latents = self.omics_data[('latent', 'corrupted')]
                crt.sample_batch(latents=latents,
                                 factors=factors,
                                 factor_names=factor_names,
                                 **kw)
            self._criticizers[key] = crt
        return self._criticizers[key]
Code Example #23
File: gmm_embedding.py Project: trungnt13/odin-ai
    def plot_diagnosis(self, X, labels=None, n_bins=200):
        X, labels, n_classes = self._check_input(X, labels)

        nrow = n_classes
        ncol = 1
        fig = plot_figure(nrow=nrow * 2, ncol=8)
        # add 1 for threshold color
        # add 1 for PDF color
        colors = sns.color_palette(n_colors=self.n_components_per_class + 2)

        for i, (name, (order, gmm)) in enumerate(zip(labels, self._models)):
            start = ncol * i

            means_ = gmm.means_.ravel()[order]
            precision_ = gmm.precisions_.ravel()[order]
            x = self.normalize(X[:, i], test_mode=False)

            # ====== scores ====== #
            # score
            score_llk = gmm.score(x[:, np.newaxis])
            score_bic = gmm.bic(x[:, np.newaxis])
            score_aic = gmm.aic(x[:, np.newaxis])

            # ====== the histogram ====== #
            ax = plt.subplot(nrow, ncol, start + 1)
            count, bins = _draw_hist(x,
                                     ax=ax,
                                     title="[%s] LLK:%.2f BIC:%.2f AIC:%.2f" %
                                     (name, score_llk, score_bic, score_aic),
                                     n_bins=n_bins,
                                     show_yticks=True)

            # ====== draw GMM PDF ====== #
            y_ = np.exp(gmm.score_samples(bins[:, np.newaxis]))
            y_ = (y_ - np.min(y_)) / (np.max(y_) - np.min(y_)) * np.max(count)
            ax.plot(bins,
                    y_,
                    color='red',
                    linestyle='-',
                    linewidth=1.5,
                    alpha=0.6)

            # ====== draw the threshold ====== #
            ci = stats.norm.interval(
                np.abs(self.ci_threshold),
                loc=gmm.means_[order[self.positive_component]],
                scale=np.sqrt(1 /
                              gmm.precisions_[order[self.positive_component]]))
            threshold = ci[0] if self.ci_threshold < 0 else ci[1]
            ids = np.where(bins >= threshold, True, False)
            ax.fill_between(bins[ids],
                            y1=0,
                            y2=np.max(count),
                            facecolor=colors[-2],
                            alpha=0.3)
            ax.text(np.min(bins[ids]), np.min(count), "%.2f" % threshold)

            # ====== plot GMM probability ====== #
            x_ = np.linspace(np.min(bins), np.max(bins), 1200)
            y_ = gmm.predict_proba(x_[:, np.newaxis]) * np.max(count)
            for c, j in zip(colors, y_.T):
                plt.plot(x_,
                         j,
                         color=c,
                         linestyle='--',
                         linewidth=1.8,
                         alpha=0.6)

            # ====== draw the each Gaussian bell ====== #
            ax = ax.twinx()
            _x = np.linspace(start=np.min(x), stop=np.max(x), num=800)
            for c, m, p in zip(colors, means_, precision_):
                with catch_warnings_ignore(Warning):
                    # mlab.normpdf was removed in matplotlib 3.1;
                    # scipy.stats provides the same Gaussian PDF
                    j = stats.norm.pdf(_x, m, np.sqrt(1 / p))
                ax.plot(_x, j, color=c, linestyle='-', linewidth=1)
                ax.scatter(_x[np.argmax(j)],
                           np.max(j),
                           s=66,
                           alpha=0.8,
                           linewidth=0,
                           color=c)
            ax.yaxis.set_ticklabels([])

        fig.tight_layout()
        self.add_figure('diagnosis', fig)
        return self
Code Example #24
File: cross_datasets.py Project: trungnt13/sisua
def train_and_evaluate(model_name, train_ds):
    if model_name == 'dca':
        from sisua.inference import InferenceDCA as Inference
    elif model_name == 'scvae':
        from sisua.inference import InferenceSCVAE as Inference
    elif model_name == 'sisua':
        from sisua.inference import InferenceSISUA as Inference
    elif model_name == 'scvi':
        from sisua.inference import InferenceSCVI as Inference
    else:
        raise NotImplementedError
    from sisua.analysis import Posterior

    outpath = os.path.join(FIGURE_PATH,
                           '%s_train%s' % (model_name, train_ds.upper()))
    if not os.path.exists(outpath):
        os.mkdir(outpath)

    print("\n======== Running experiment ========")
    print("Model     :", ctext(model_name, 'cyan'))
    print("Inference :", ctext(Inference, 'cyan'))
    print("Train data:", ctext(train_ds, 'cyan'))
    print("Out path  :", ctext(outpath, 'cyan'))

    ds, gene, prot = all_datasets[train_ds]
    n_prots = prot.feat_dim
    org_prot = [standardize_protein_name(i) for i in prot.col_name]

    # ====== Main model training ====== #
    if model_name == 'sisua':
        model = Inference(gene_dim=n_genes, prot_dim=n_prots)
    else:
        model = Inference(gene_dim=n_genes)
    model.fit(X=gene.X_train,
              y=prot.X_train if model.is_semi_supervised else None,
              corruption_rate=corruption_rate,
              corruption_dist=corruption_dist,
              n_epoch=n_epoch,
              batch_size=batch_size,
              detail_logging=False)

    # ====== start evaluation ====== #
    for name, (ds, gene, prot) in all_datasets.items():
        y_true = {
            i: j
            for i, j in zip(
                [standardize_protein_name(i)
                 for i in prot.col_name], ds['y'].T) if i in all_proteins
        }
        # preserve the same order of all_proteins
        y_true = np.hstack([y_true[i][:, np.newaxis] for i in all_proteins])
        prot = SingleCellOMIC(matrix=y_true,
                              rowname=ds['X_row'],
                              colname=all_proteins)

        # create a mixed Posterior
        pos = Posterior(model,
                        ds=dict(X_train=gene.X_train,
                                X_test=gene.X_test,
                                X_col=gene.col_name,
                                y_train=prot.X_train,
                                y_test=prot.X_test,
                                y_col=prot.col_name))
        # many figures are created, so matplotlib may emit a RuntimeWarning
        # about the maximum number of open figures
        with catch_warnings_ignore(RuntimeWarning):
            # analysis
            pos.new_figure().plot_latents_binary_scatter(
                size=4).plot_latents_distance_heatmap(
                ).plot_correlation_marker_pairs()
            # protein series
            if model.is_semi_supervised:
                y_true = pos.y_test
                y_pred = model.predict_y(pos.X_test)
                y_pred = {
                    i: j
                    for i, j in zip(org_prot, y_pred.T) if i in all_proteins
                }
                y_pred = np.hstack(
                    [y_pred[i][:, np.newaxis] for i in all_proteins])
                pos.plot_protein_predicted_series(y_true_new=y_true,
                                                  y_pred_new=y_pred,
                                                  labels_new=all_proteins)
                for prot_name in all_proteins:
                    pos.plot_protein_scatter(protein_name=prot_name,
                                             y_true_new=y_true,
                                             y_pred_new=y_pred,
                                             labels_new=all_proteins)
            # save plot and show log
            pos.save_plots(os.path.join(outpath, '%s.pdf' % name), dpi=80)
Code Example #25
File: figures.py Project: Daisey666/odin-ai
def plot_gaussian_mixture(x,
                          gmm,
                          bins=80,
                          fontsize=12,
                          linewidth=2,
                          show_pdf=False,
                          show_probability=False,
                          show_components=True,
                          legend=True,
                          ax=None,
                          title=None):
    import seaborn as sns
    from odin.utils import as_tuple, catch_warnings_ignore
    from scipy import stats
    from sklearn.mixture import GaussianMixture
    ax = to_axis(ax, is_3D=False)
    n_points = int(bins * 12)
    assert gmm.means_.shape[1] == 1, "Only 1-D GMMs are supported"
    x = x.ravel()
    order = np.argsort(gmm.means_.ravel())
    means_ = gmm.means_.ravel()[order]
    precision_ = gmm.precisions_.ravel()[order]
    colors = sns.color_palette(n_colors=gmm.n_components + 2)
    # ====== Histogram ====== #
    count, bins = plot_histogram(x=x,
                                 bins=int(bins),
                                 ax=ax,
                                 normalize=False,
                                 kde=False,
                                 range_0_1=False,
                                 covariance_factor=0.25,
                                 centerlize=False,
                                 fontsize=fontsize,
                                 alpha=0.25,
                                 title=title)
    ax.set_ylabel("Histogram Count", fontsize=fontsize)
    ax.set_xlim((np.min(x), np.max(x)))
    ax.set_xticks(
        np.linspace(start=np.min(x), stop=np.max(x), num=5, dtype='float32'))
    ax.set_yticks(
        np.linspace(start=np.min(count),
                    stop=np.max(count),
                    num=5,
                    dtype='int32'))
    # ====== GMM PDF ====== #
    x_ = np.linspace(np.min(bins), np.max(bins), n_points)
    y_ = np.exp(gmm.score_samples(x_[:, np.newaxis]))
    y_ = (y_ - np.min(y_)) / (np.max(y_) - np.min(y_)) * np.max(count)
    if show_pdf:
        ax.plot(x_,
                y_,
                color='red',
                linestyle='-',
                linewidth=linewidth * 1.2,
                alpha=0.6,
                label="GMM log-likelihood")
    # ====== GMM probability ====== #
    twinx = None
    ymax = 0.0
    if show_probability:
        if twinx is None:
            twinx = ax.twinx()
        y_ = gmm.predict_proba(x_[:, np.newaxis])
        for idx, (c, j) in enumerate(zip(colors, y_.T)):
            twinx.plot(x_,
                       j,
                       color=c,
                       linestyle='--',
                       linewidth=linewidth,
                       alpha=0.8,
                       label=r"$p_{\#%d}(x)$" % idx)
        ymax = max(ymax, np.max(y_))
    # ====== draw the each Gaussian bell ====== #
    if show_components:
        if twinx is None:
            twinx = ax.twinx()
        for idx, (c, m, p) in enumerate(zip(colors, means_, precision_)):
            with catch_warnings_ignore(Warning):
                j = stats.norm.pdf(x_, m, np.sqrt(1 / p))
            twinx.plot(x_,
                       j,
                       color=c,
                       linestyle='-',
                       linewidth=linewidth,
                       label=r"$PDF_{\#%d}$" % idx)
            # mean, top of the bell
            twinx.scatter(x_[np.argmax(j)],
                          np.max(j),
                          s=88,
                          alpha=0.8,
                          linewidth=0,
                          color=c)
            ymax = max(ymax, np.max(j))
        twinx.set_ylabel("Probability Density", fontsize=fontsize)
        twinx.grid(False)
    # set the limit for twinx
    if twinx is not None:
        twinx.set_ylim(0.0, ymax * 1.05)
    # ====== show legend ====== #
    if twinx is not None:
        twinx.yaxis.label.set_color(colors[0])
        twinx.tick_params(axis='y', colors=colors[0])
    if legend:
        ax.legend(fontsize=fontsize)
        if twinx is not None:
            twinx.legend(fontsize=fontsize)
    return ax
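
A usage sketch for `plot_gaussian_mixture`, assuming odin-ai is installed (it supplies the `to_axis` and `plot_histogram` helpers used inside): fit a two-component 1-D GMM on a toy bimodal sample and overlay the component densities.

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(8)
x = np.concatenate([rng.randn(500) - 2.0, rng.randn(500) + 2.0])
gmm = GaussianMixture(n_components=2, random_state=8).fit(x[:, np.newaxis])
ax = plot_gaussian_mixture(x, gmm, show_pdf=True, show_components=True,
                           title='toy bimodal mixture')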
Code Example #26
# ===========================================================================
# ====== basic path ====== #
output_dataset_path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)

processor_log_path = os.path.join(EXP_DIR, 'processor_%s.log' % FEATURE_RECIPE)
if os.path.exists(processor_log_path):
  os.remove(processor_log_path)
print("Log path:", ctext(processor_log_path, 'cyan'))

ds_validation_path = os.path.join(EXP_DIR, 'validate_%s.pdf' % FEATURE_RECIPE)
if os.path.exists(ds_validation_path):
  os.remove(ds_validation_path)
print("Validation path:", ctext(ds_validation_path, 'cyan'))

# ====== running the processing ====== #
with catch_warnings_ignore(Warning):
  processor = pp.FeatureProcessor(
      jobs=ALL_FILES,
      path=output_dataset_path,
      extractor=recipe,
      n_cache=320,
      ncpu=NCPU,
      override=True,
      identifier='name',
      log_path=processor_log_path,
      stop_on_failure=False)
  processor.run()
# ===========================================================================
# Make some visualization
# ===========================================================================
validate_features_dataset(output_dataset_path, ds_validation_path)
Code Example #27
    def plot_correlation_marker_pairs(self, test=True, fontsize=8):
        start_time = time.time()

        from scipy.stats import pearsonr, spearmanr
        n_system = len(self)
        data_type = 'test' if test else 'train'

        # OrderedDict(name -> series)
        original_series = None
        imputed_series = []
        for pos in self:
            if test:
                v, x, y = pos.V_test, pos.X_test_org, pos.y_test
            else:
                v, x, y = pos.V_train, pos.X_train_org, pos.y_train
            if original_series is None:
                original_series = correlation_scores(X=x,
                                                     y=y,
                                                     gene_name=pos.gene_name,
                                                     protein_name=pos.labels,
                                                     return_series=True)
            imputed_series.append(
                correlation_scores(X=v,
                                   y=y,
                                   gene_name=pos.gene_name,
                                   protein_name=pos.labels,
                                   return_series=True))

        # ====== plotting ====== #
        n_pair = len(original_series)
        fig = plt.figure(figsize=(20, 5 * n_pair), constrained_layout=True)
        width = 4
        grids = fig.add_gridspec(n_pair, (n_system + 1) * width)

        for row_idx, prot_gene in enumerate(original_series.keys()):
            prot_name, gene_name = prot_gene.split('/')
            original_gene, prot = original_series[prot_gene]

            # gather all series
            gene = [original_gene]
            system_name = ["Original"]
            for s, posterior in zip(imputed_series, self.posteriors):
                i, j = s[prot_gene]
                assert np.all(prot == j)
                gene.append(i)
                system_name.append(posterior.short_id_lines)

            # plotting each series
            for col_idx, (name, g) in enumerate(zip(system_name, gene)):
                ax = fig.add_subplot(grids[row_idx, width *
                                           col_idx:(width * col_idx + width)])
                ax.scatter(prot, g, s=25, alpha=0.6, linewidths=0)
                plot_aspect('auto', 'box', ax)

                title = (data_type + ' - ' + prot_gene + ' - %s') if col_idx == 0 else "%s"
                title += '\nPearson:%.2f Spearman:%.2f'
                ax.set_title(title % (name, pearsonr(
                    g, prot)[0], spearmanr(g, prot).correlation),
                             fontsize=fontsize + (2 if col_idx == 0 else 0))
                if col_idx == 0:
                    ax.set_xlabel('[Protein] %s' % prot_name,
                                  fontsize=fontsize)
                    ax.set_ylabel('[Gene] %s' % gene_name, fontsize=fontsize)

                if np.mean(g) < 0.1:
                    for tick in ax.yaxis.get_major_ticks():
                        tick.label.set_fontsize(6)
                # ax = fig.add_subplot(
                #     grids[row_idx, (width * col_idx + width - 1): (width * col_idx + width)])
                ax = ax.twiny()
                ax.boxplot(g)
                ax.set_xticks(())
                # ax.set_xlabel(gene_name, fontsize=fontsize)

        with catch_warnings_ignore(UserWarning):
            plt.tight_layout()
        self.add_figure('correlation_%s' % data_type, fig)
        return self._log(
            'plot_correlation_marker_pairs[%s] %s(s)' %
            (data_type, ctext(time.time() - start_time, 'lightyellow')))
Code Example #28
    def plot_imputation_scatter(self,
                                test=True,
                                pca=False,
                                color_by_library=True):
        start_time = time.time()
        n_system = len(self) + 2  # add the original and the corrupted
        data_type = 'test' if test else 'train'

        if n_system <= 5:
            nrow = 1
            ncol = n_system
        else:
            nrow = 2
            ncol = int(np.ceil(n_system / 2))

        X_org = self.posteriors[0].X_test_org if test else self.posteriors[
            0].X_train_org
        X_crr = self.posteriors[0].X_test if test else self.posteriors[
            0].X_train
        y = self.posteriors[0].y_test if test else self.posteriors[0].y_train
        labels = self.posteriors[0].labels
        is_binary_classes = self.posteriors[0].is_binary_classes
        allV = [X_org, X_crr] + [
            pos.V_test if test else pos.V_train for pos in self.posteriors
        ]
        assert X_org.shape == X_crr.shape and all(v.shape == X_org.shape
                                                  for v in allV)
        all_names = ["[%s]Original" % data_type,
                     "[%s]Corrupted" % data_type
                     ] + [i.short_id_lines for i in self.posteriors]

        # subsample to at most 5000 cells to keep the plots fast
        if len(X_org) > 5000:
            np.random.seed(5218)
            ids = np.random.permutation(X_org.shape[0])[:5000]
            allV = [v[ids] for v in allV]
            y = y[ids]

        if is_binary_classes:
            y = np.argmax(y, axis=-1)
        else:
            y = ProbabilisticEmbedding().fit_transform(y)
            y = np.argmax(y, axis=-1)

        allV = [log_norm(v) for v in allV]

        fig = plt.figure(figsize=(min(20, 5 * ncol) + 2, nrow * 5))
        for idx, (name, v) in enumerate(zip(all_names, allV)):
            ax = plt.subplot(nrow, ncol, idx + 1)
            n = np.sum(v, axis=-1)
            v = fast_pca(v, n_components=2) if pca else fast_tsne(
                v, n_components=2)
            with catch_warnings_ignore(Warning):
                if color_by_library:
                    plot_scatter(x=v,
                                 val=n,
                                 ax=ax,
                                 size=8,
                                 legend_enable=False,
                                 grid=False,
                                 title=name)
                else:
                    plot_scatter(x=v,
                                 color=[labels[i] for i in y],
                                 marker=[labels[i] for i in y],
                                 ax=ax,
                                 size=8,
                                 legend_enable=True if idx == 0 else False,
                                 grid=False,
                                 title=name)

        with catch_warnings_ignore(Warning):
            plt.tight_layout()
        self.add_figure(
            'imputation_scatter_%s_%s' %
            ('lib' if color_by_library else 'cell', data_type), fig)
        return self._log(
            'plot_imputation_scatter[%s] %s(s)' %
            (data_type, ctext(time.time() - start_time, 'lightyellow')))
Code Example #29
File: evaluate.py Project: medical-projects/sisua
def main(model,
         ds1,
         ds2,
         batch_size,
         score_enable,
         plot_enable,
         override=False):
    print("Start evaluation:")
    print(f" - model     : {model}")
    print(f" - dataset1  : {ds1}")
    print(f" - dataset2  : {ds2}")
    print(f" - batch_size: {batch_size}")
    print(f" - override  : {override}")
    print(f" - plot:{plot_enable} score:{score_enable}")
    result_dir = SE.get_result_dir()
    if len(ds2) == 0:
        outpath = os.path.join(result_dir, f"{model}_{ds1}")
    else:
        outpath = os.path.join(result_dir, f"{model}_{ds1}_{ds2}")
    # overriding exist paths
    if override and os.path.exists(outpath):
        print(f"Override path '{outpath}'")
        shutil.rmtree(outpath)
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    ### Load the model and dataset
    hash1, cfg1, m1 = SE.get_models(f"dataset.name={ds1} model.name={model}",
                                    load_models=True,
                                    return_hash=True)[0]
    test1: SingleCellOMIC = m1.test
    vae1: SingleCellModel = m1.model
    is_semi = vae1.is_semi_supervised
    if len(ds2) > 0:
        hash2, cfg2, m2 = SE.get_models(
            f"dataset.name={ds2} model.name={model}",
            load_models=True,
            return_hash=True)[0]
        test2: SingleCellOMIC = m2.test
        vae2: SingleCellModel = m2.model
    else:
        test2 = None
        vae2 = None
        cfg2 = None
        hash2 = None
    # Create the posterior
    kw = dict(batch_size=batch_size, verbose=True)
    # cross-dataset setup: evaluate vae1 on test2 when a second dataset is given
    if vae2 is None:
        posterior = Posterior(vae1, test1, name=f"{model}_{ds1}", **kw)
    else:
        posterior = Posterior(vae1, test2, name=f"{model}_{ds1}_{ds2}", **kw)
    ### running the evaluation
    train_ds = ds1
    test_ds = ds2
    with catch_warnings_ignore(UserWarning):
        # calculating the scores
        if score_enable:
            robust_run("evaluate_scoring",
                       f"model:{model} train:{train_ds} test:{test_ds}",
                       scoring, posterior, outpath, train_ds, test_ds)
        # plotting the figures
        if plot_enable:
            robust_run("evaluate_plotting",
                       f"model:{model} train:{train_ds} test:{test_ds}",
                       plotting, posterior, outpath, train_ds, test_ds)
Code Example #30
File: latent_benchmarks.py Project: trungnt13/sisua
def streamline_classifier(Z_train,
                          y_train,
                          Z_test,
                          y_test,
                          labels_name,
                          mode='ovr',
                          title='',
                          plot_train_results=False,
                          show_plot=True,
                          return_figure=False):
    r"""
  Arguments:
    fig : Figure or tuple (`float`, `float`), optional (default=`None`)
      width, height in inches

  Returns:
    (results_train, results_test), (fig_train, fig_test)
      results is a dictionary of scores
      {
        F1micro=f1_micro * 100,
        F1macro=f1_macro * 100,
        F1weight=f1_weight * 100,
        F1_[classname]=...
      }
  """
    mode = mode.strip().lower()
    assert mode in ('ovr', 'ovo'), \
      "mode must be 'ovr' (one-vs-rest) or 'ovo' (one-vs-one), given: %s" % mode

    labels_name = [standardize_protein_name(i) for i in labels_name]

    results_train = {}
    results_test = {}
    labels_name = np.array(labels_name)

    with catch_warnings_ignore(FutureWarning):
        with catch_warnings_ignore(RuntimeWarning):
            n_classes = len(labels_name)
            # ====== preprocessing ====== #
            if y_train.ndim == 1 or y_train.shape[1] == 1:
                y_train = one_hot(y_train.ravel(), nb_classes=n_classes)
            if y_test.ndim == 1 or y_test.shape[1] == 1:
                y_test = one_hot(y_test.ravel(), nb_classes=n_classes)
            is_binary_classes = sorted(np.unique(
                y_train.astype('float32'))) == [0., 1.]
            # ====== not binary classes ====== #
            if not is_binary_classes:
                gmm = ProbabilisticEmbedding()
                gmm.fit(np.concatenate((y_train, y_test), axis=0))
                y_train = gmm.predict(y_train)
                y_test = gmm.predict(y_test)
            # kernel : 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
            if mode == 'ovr':
                classifier = OneVsRestClassifier(SVC(
                    kernel='linear', random_state=UNIVERSAL_RANDOM_SEED),
                                                 n_jobs=n_classes)
                classifier.fit(X=Z_train, y=y_train)
            else:
                # 'ovo' mode is not implemented; the SVC configuration below
                # is kept for reference but never reached
                raise NotImplementedError("mode='ovo' is not supported yet")
                classifier = SVC(kernel='linear',
                                 decision_function_shape='ovo',
                                 random_state=UNIVERSAL_RANDOM_SEED)
                classifier.fit(X=Z_train, y=y_train)
            # ====== return ====== #
            from sklearn.exceptions import UndefinedMetricWarning
            with catch_warnings_ignore(UndefinedMetricWarning):
                results_train = plot_evaluate_classifier(
                    y_pred=classifier.predict(Z_train),
                    y_true=y_train,
                    labels=labels_name,
                    title='[train]' + title,
                    show_plot=show_plot and plot_train_results,
                    return_figure=True)
                results_test = plot_evaluate_classifier(
                    y_pred=classifier.predict(Z_test),
                    y_true=y_test,
                    labels=labels_name,
                    title='[test]' + title,
                    show_plot=show_plot,
                    return_figure=True)

            if show_plot:
                if plot_train_results:
                    results_train, fig_train = results_train
                else:
                    fig_train = None
                results_test, fig_test = results_test
            results_train = OrderedDict(
                sorted(results_train.items(), key=lambda x: x[0]))
            results_test = OrderedDict(
                sorted(results_test.items(), key=lambda x: x[0]))
            results = (results_train, results_test)

            if show_plot and return_figure:
                return results, (fig_train, fig_test)
            return results
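A self-contained sketch exercising `streamline_classifier` on synthetic latent codes; every name and shape here is illustrative, not taken from the original project:

import numpy as np

n_samples, n_latent, n_classes = 200, 10, 3
Z_train = np.random.randn(n_samples, n_latent)        # e.g. VAE latent codes
Z_test = np.random.randn(80, n_latent)
y_train = np.random.randint(0, n_classes, n_samples)  # integer class labels
y_test = np.random.randint(0, n_classes, 80)
(res_train, res_test), (fig_train, fig_test) = streamline_classifier(
    Z_train, y_train, Z_test, y_test,
    labels_name=['CD4', 'CD8', 'NK'],  # placeholder class names
    mode='ovr', title='demo',
    show_plot=True, return_figure=True)
print(res_test)  # OrderedDict of F1 scores, in percent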
Code example #31
0
 def test_clustering(self):
     ds = get_dataset('8kmy')
     with catch_warnings_ignore(EfficiencyWarning):
         ds.clustering(algo='kmeans')
         ds.clustering(algo='knn')
Code example #32
0
File: analyze_data.py Project: imito/odin
             linestyle=linestyles[i % len(linestyles)],
             label=dsname)
  plt.legend()
  plt.suptitle("[%s]Mean" % title)

  V.plot_figure(nrow=6, ncol=20)
  for i, dsname in enumerate(all_dataset):
    _, std = _map[dsname]
    plt.plot(std,
             linewidth=1.,
             linestyle=linestyles[i % len(linestyles)],
             label=dsname)
  plt.legend()
  plt.suptitle("[%s]StandardDeviation" % title)

with catch_warnings_ignore(RuntimeWarning), catch_warnings_ignore(FutureWarning):
  data_map = {}
  stats_map = {}
  spk_map = {}
  for dsname, text, data, stats, spk_stats in mpi.MPI(jobs=all_dataset, func=dataset_statistics,
                            ncpu=None, batch=1):
    data_map[dsname] = data
    stats_map[dsname] = stats
    spk_map[dsname] = spk_stats
    print(text)

  for dsname in all_dataset:
    print("Plotting ...", ctext(dsname, 'cyan'))
    data = data_map[dsname]
    V.plot_figure(nrow=2, ncol=20)
    ax = plt.subplot(1, n_col, 1)
Code example #33
0
File: analyze_data.py Project: trungnt13/odin-ai
                 label=dsname)
    plt.legend()
    plt.suptitle("[%s]Mean" % title)

    V.plot_figure(nrow=6, ncol=20)
    for i, dsname in enumerate(all_dataset):
        _, std = _map[dsname]
        plt.plot(std,
                 linewidth=1.,
                 linestyle=linestyles[i % len(linestyles)],
                 label=dsname)
    plt.legend()
    plt.suptitle("[%s]StandardDeviation" % title)


with catch_warnings_ignore(RuntimeWarning), catch_warnings_ignore(
        FutureWarning):
    data_map = {}
    stats_map = {}
    spk_map = {}
    for dsname, text, data, stats, spk_stats in mpi.MPI(
            jobs=all_dataset, func=dataset_statistics, ncpu=None, batch=1):
        data_map[dsname] = data
        stats_map[dsname] = stats
        spk_map[dsname] = spk_stats
        print(text)

    for dsname in all_dataset:
        print("Plotting ...", ctext(dsname, 'cyan'))
        data = data_map[dsname]
        V.plot_figure(nrow=2, ncol=20)
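Both analyze_data snippets follow the same fan-out/collect pattern: a pool of workers maps `dataset_statistics` over `all_dataset` and the parent process collects each result tuple as it arrives. Below is a standard-library sketch of that pattern, with `multiprocessing.Pool` standing in for odin's `mpi.MPI` and a stub worker in place of the real `dataset_statistics`:

import multiprocessing as mp

def dataset_statistics_stub(dsname):
    # stand-in worker: mirrors the (name, text, data, stats, spk_stats) tuple
    # that the snippets above unpack from each result
    return dsname, f"processed: {dsname}", None, None, None

if __name__ == '__main__':
    all_dataset = ['ds1', 'ds2', 'ds3']  # placeholder dataset names
    data_map, stats_map, spk_map = {}, {}, {}
    with mp.Pool() as pool:
        for dsname, text, data, stats, spk_stats in pool.imap_unordered(
                dataset_statistics_stub, all_dataset):
            data_map[dsname] = data
            stats_map[dsname] = stats
            spk_map[dsname] = spk_stats
            print(text)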
Code example #34
0
 def plot_series(self,
                 omic1=OMIC.transcriptomic,
                 omic2=OMIC.proteomic,
                 var_names1='auto',
                 var_names2='auto',
                 log1=True,
                 log2=True,
                 fontsize=10,
                 title='',
                 return_figure=False):
     r""" Plot lines of 2 OMICs sorted in ascending order of `omic1` """
     import seaborn as sns
     ## prepare
     omic1 = OMIC.parse(omic1)
     omic2 = OMIC.parse(omic2)
     omic1_ids = self.get_var_indices(omic1)
     omic2_ids = self.get_var_indices(omic2)
     if isinstance(var_names1, string_types) and var_names1 == 'auto':
         var_names1 = omic1.markers
     if isinstance(var_names2, string_types) and var_names2 == 'auto':
         var_names2 = omic2.markers
     ## filtering variables
     ids1 = []
     ids2 = []
     for v1, v2 in zip(var_names1, var_names2):
         i1 = omic1_ids.get(v1, None)
         i2 = omic2_ids.get(v2, None)
         if i1 is not None and i2 is not None:
             ids1.append(i1)
             ids2.append(i2)
     assert len(ids1) > 0, \
       (f"No variables found for omic1={omic1} var1={var_names1} "
        f"and omic2={omic2} var2={var_names2}")
     x1 = self.get_omic(omic1)[:, ids1]
     x2 = self.get_omic(omic2)[:, ids2]
     if log1:
         x1 = np.log1p(x1)
     if log2:
         x2 = np.log1p(x2)
     names1 = self.get_var_names(omic1)[ids1]
     names2 = self.get_var_names(omic2)[ids2]
     n_series = len(names1)
     ### prepare the plot
     colors = sns.color_palette(n_colors=2)
     fig = plt.figure(figsize=(12, n_series * 4))
     for idx in range(n_series):
         y1 = x1[:, idx]
         y2 = x2[:, idx]
         order = np.argsort(y1)
         ax = plt.subplot(n_series, 1, idx + 1)
         ## the first series
         ax.plot(y1[order],
                 linewidth=1.8,
                 color=colors[0],
                 label=f"{omic1.name}-{names1[idx]}")
         ax.set_ylabel(
             f"{'log' if log1 else 'raw'}-{omic1.name}-{names1[idx]}",
             color=colors[0])
         ax.set_xlabel(f"Cell in ascending order of {omic1.name}")
         ax.tick_params(axis='y', colors=colors[0], labelcolor=colors[0])
         ax.grid(False)
         ## the second series
         ax = ax.twinx()
         ax.plot(y2[order],
                 linestyle='--',
                 alpha=0.88,
                 linewidth=1.2,
                 color=colors[1])
         ax.set_ylabel(
             f"{'log' if log2 else 'raw'}-{omic2.name}-{names2[idx]}",
             color=colors[1])
         ax.tick_params(axis='y', colors=colors[1], labelcolor=colors[1])
         ax.grid(False)
     ### finalize the figure style
     if len(title) > 0:
         plt.suptitle(title, fontsize=fontsize + 2)
     with catch_warnings_ignore(UserWarning):
         plt.tight_layout(rect=[0., 0.02, 1., 0.98])
     if return_figure:
         return fig
     return self.add_figure(f'series_{omic1.name}_{omic2.name}', fig)
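A hypothetical call on a CITE-seq dataset, which carries both transcriptomic and proteomic measurements; the dataset name is a placeholder from the list in example #2, and `get_dataset` is assumed here to return a `SingleCellOMIC` with both omics attached:

sco = get_dataset('pbmcciteseq')  # assumed: SingleCellOMIC with protein counts
sco.plot_series(omic1=OMIC.transcriptomic,
                omic2=OMIC.proteomic,
                var_names1='auto',  # use the built-in marker genes
                var_names2='auto',  # use the built-in marker proteins
                title='marker gene vs protein series')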