def test_normalization(self):
  ds = get_dataset('8kmy')
  # ignore overflow warning
  with catch_warnings_ignore(RuntimeWarning):
    ds1 = ds.expm1(omic=OMIC.transcriptomic, inplace=False)
    ds2 = ds.expm1(omic=OMIC.proteomic, inplace=False)
    self.assertTrue(np.all(np.expm1(ds.X) == ds1.X))
    self.assertTrue(
        np.all(np.expm1(ds.numpy(OMIC.proteomic)) == ds2.numpy(OMIC.proteomic)))
    ds1 = ds.normalize(OMIC.transcriptomic,
                       inplace=False,
                       log1p=True,
                       scale=False,
                       total=False)
    ds2 = ds.normalize(OMIC.proteomic,
                       inplace=False,
                       log1p=True,
                       scale=False,
                       total=False)
    self.assertTrue(np.all(ds1.numpy(OMIC.transcriptomic) == np.log1p(ds.X)))
    self.assertTrue(
        np.all(ds1.numpy(OMIC.proteomic) == ds.numpy(OMIC.proteomic)))
    self.assertTrue(
        np.all(ds2.numpy(OMIC.proteomic) == np.log1p(ds.numpy(OMIC.proteomic))))
    self.assertTrue(
        np.all(ds2.numpy(OMIC.transcriptomic) == ds.numpy(OMIC.transcriptomic)))
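# A minimal, self-contained sketch of the identity the assertions above rely
# on: expm1 and log1p are exact inverses, so normalize(log1p=True) followed by
# expm1 round-trips the data (pure NumPy, no SISUA objects assumed).
import numpy as np

x = np.random.poisson(lam=3.0, size=(4, 5)).astype('float32')
assert np.allclose(np.expm1(np.log1p(x)), x)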
def test_metrics(self):
  sco = get_dataset('8kmy')
  with catch_warnings_ignore(ConvergenceWarning):
    sco.rank_vars_groups(clustering='kmeans')
    sco.calculate_quality_metrics()
    with sco._swap_omic('prot'):
      sco.rank_vars_groups(clustering='kmeans')
      sco.calculate_quality_metrics()
    if _SCVI:
      sco = get_dataset('cortex')
      sco.rank_vars_groups(clustering='kmeans')
      sco.calculate_quality_metrics()
      with sco._swap_omic('cell'):
        sco.rank_vars_groups(clustering='kmeans')
        sco.calculate_quality_metrics()
def test_variational_model(self):
  sco = get_dataset(_DS)
  n_genes = sco.n_vars
  n_prots = sco.numpy(OMIC.proteomic).shape[1]
  vae = VariationalAutoEncoder(outputs=[
      RandomVariable(dim=n_genes, posterior='zinb', name=OMIC.transcriptomic),
      RandomVariable(dim=n_prots, posterior='nbd', name=OMIC.proteomic)
  ])
  vae.fit(sco, epochs=_EPOCHS, verbose=False)
  self._loss_not_rise(vae.train_history['loss'])
  self._loss_not_rise(vae.valid_history['val_loss'])
  X = sco.numpy()[:128]
  (pX, pY), qZ = vae.predict(X, sample_shape=2, verbose=False)
  self.assertTrue(isinstance(pX.distribution, bay.distributions.ZeroInflated))
  self.assertTrue(
      isinstance(pX.distribution.count_distribution,
                 bay.distributions.NegativeBinomial))
  self.assertTrue(
      isinstance(pY.distribution, bay.distributions.NegativeBinomialDisp))
  self.assertTrue(pX.batch_shape[0] == 2 and pX.batch_shape[1] == X.shape[0])
  self.assertTrue(pY.batch_shape[0] == 2 and pY.batch_shape[1] == X.shape[0])
  self.assertTrue(isinstance(qZ, bay.distributions.MultivariateNormalDiag))
  self.assertTrue(
      qZ.sample().shape == (X.shape[0], vae.latents[0].event_shape[0]))
def create_posterior(self,
                     test_sco: SingleCellOMIC = None,
                     dropout_rate=0.2,
                     retain_rate=0.2,
                     corrupt_distribution='binomial',
                     batch_size=8,
                     sample_shape=10,
                     reduce_latents=partial(tf.concat, axis=1),
                     verbose=True,
                     train_percent=0.8,
                     random_state=1) -> Posterior:
  r""" Create a `Posterior` object for evaluation """
  if not self.is_fitted:
    raise RuntimeError("fit() must be called before creating Posterior.")
  ### prepare the test set
  if isinstance(test_sco, SingleCellOMIC):
    test = test_sco
  elif self.dataset is None:
    raise ValueError(
        "Call SingleCellModel.set_metadata() to track the fitted dataset.")
  else:
    ds = get_dataset(self.dataset)
    _, test = ds.split(train_percent=train_percent, seed=random_state)
  ### create the posterior
  return Posterior(scm=self,
                   sco=test,
                   dropout_rate=dropout_rate,
                   retain_rate=retain_rate,
                   corrupt_distribution=corrupt_distribution,
                   batch_size=batch_size,
                   sample_shape=sample_shape,
                   reduce_latents=reduce_latents,
                   verbose=verbose,
                   name=f"{self.id}_{self.dataset}",
                   random_state=random_state)
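# Usage sketch for create_posterior (variable names are hypothetical; assumes
# a SingleCellModel subclass fitted as in the tests above):
#   model.fit(train, epochs=2, verbose=False)
#   post = model.create_posterior(test_sco=test, sample_shape=10,
#                                 verbose=False)
# If `test_sco` is omitted, the method re-splits the dataset recorded via
# set_metadata(), as implemented above.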
def test_visualization_celltype(self):
  sco = get_dataset('cortex')
  for X, var_names, rank_genes, clustering, dendrogram in itertools.product(
      ('cell', 'tran'), (None, 10), (0, 3), ('kmeans', 'louvain', None),
      (True, False)):
    if X == 'cell' and rank_genes > 0:
      continue
    # check louvain available
    if clustering == 'louvain':
      try:
        import louvain
      except ImportError:
        continue
    # plotting
    with catch_warnings_ignore(ignore_warnings):
      sco.plot_heatmap(X=X,
                       groupby=OMIC.celltype,
                       var_names=var_names,
                       clustering=clustering,
                       rank_genes=rank_genes)
      sco.plot_dotplot(X=X,
                       groupby=OMIC.celltype,
                       var_names=var_names,
                       clustering=clustering,
                       rank_genes=rank_genes)
      sco.plot_stacked_violins(X=X,
                               groupby=OMIC.celltype,
                               var_names=var_names,
                               clustering=clustering,
                               rank_genes=rank_genes)
  sco.save_figures('/tmp/tmp2.pdf')
def test_filters(self):
  ds = get_dataset('8kmy')
  ds1 = ds.filter_highly_variable_genes(inplace=False)
  ds2 = ds.filter_genes(inplace=False, min_counts=100)
  ds3 = ds.filter_cells(inplace=False, min_counts=1000)
  self.assertTrue(ds1.shape[1] == 999)
  # every gene/cell that survives the filter must have at least min_counts
  self.assertTrue(np.min(ds2.X.sum(0)) >= 100)
  self.assertTrue(np.min(ds3.X.sum(1)) >= 1000)
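# The same filters also work in place and can be chained, as done in the
# autotune script later in this section:
#   sco = get_dataset('8kmy')
#   sco.filter_cells(min_counts=1).filter_genes(min_counts=1)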
def on_load_data(self, cfg):
  ds = cfg.dataset
  sco = get_dataset(ds.name)
  if cfg.verbose:
    print(sco)
  train, test = sco.split(train_percent=ds.train_percent)
  self.sco = sco
  self.train = train
  self.test = test
def preprocess(self):
  ds, gene_ds, prot_ds = get_dataset(dataset_name="pbmc_citeseq",
                                     override=False)
  expression_data = gene_ds.X
  gene_symbols = gene_ds.X_col
  self.gene_symbols = gene_symbols
  self.cell_names = gene_ds.X_row
  self.adt_expression = prot_ds.X
  self.protein_markers = prot_ds.X_col
  assert np.all(gene_ds.X_row == prot_ds.X_row)
  return expression_data
def get_arguments():
  args = (ArgController()
          .add("input", "Name of the dataset or path to csv file")
          .add("-n", "number of GMM components", 2)
          .add("-idx", "index of the positive component", 1)
          .add("-norm", "method for normalizing: raw, log", 'log',
               ('log', 'raw'))
          .add("-outpath", "y_bin and y_prob will be saved to this path", '')
          .add("-figpath", "path for saving analysis figure", '/tmp/tmp.pdf')
          .add("--verbose", "Enable verbose and saving diagnosis", False)
          .parse())
  inp = str(args.input)
  if os.path.exists(inp):
    assert os.path.isfile(inp), "%s must be path to a file" % inp
    data = []
    with open(inp, 'r') as f:
      for line in f:
        data.append(line.strip().split(','))
    data = np.array(data)
    if all(is_number(i, string_number=True) for i in data[0]):
      y_prot = data.astype('float32')
      y_prot_names = np.array(['#%d' % i for i in range(y_prot.shape[1])])
    else:
      y_prot = data[1:].astype('float32')
      y_prot_names = data[0]
    outpath = args.outpath
  else:
    from sisua.data import get_dataset
    ds, gene_ds, prot_ds = get_dataset(inp, override=False)
    y_prot = ds['y']
    y_prot_names = np.array(ds['y_col'])
    outpath = ds.path if args.outpath == '' else args.outpath
  return {
      'y_prot': y_prot,
      'y_prot_names': y_prot_names,
      'n_components': int(args.n),
      'index': int(args.idx),
      'log_norm': args.norm == 'log',
      'outpath': outpath if len(outpath) > 0 else None,
      'figpath': args.figpath if len(args.figpath) > 0 else None,
      'verbose': bool(args.verbose)
  }
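# Example invocation matching the arguments defined above (the script and csv
# file names are hypothetical):
#   python gmm_threshold.py my_adt_counts.csv -n 2 -idx 1 -norm log \
#       -figpath /tmp/tmp.pdf --verbose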
def test_corruption(self):
  ds = get_dataset('8kmy')
  ds1 = ds.corrupt(dropout_rate=0.25, inplace=False)
  ds2 = ds.corrupt(dropout_rate=0.5, inplace=False)
  ds3 = ds.corrupt(dropout_rate=0.5, inplace=False, omic=OMIC.proteomic)
  ds4 = ds.corrupt(dropout_rate=0.5,
                   inplace=False,
                   omic=OMIC.proteomic,
                   distribution='uniform')
  self.assertTrue(ds.sparsity() < ds1.sparsity() < ds2.sparsity())
  om = OMIC.proteomic
  self.assertTrue(ds.sparsity(om) < ds3.sparsity(om) < ds4.sparsity(om))
  # multi-omic corruption
  ds1 = ds.corrupt(omic=OMIC.transcriptomic | OMIC.proteomic,
                   dropout_rate=0.5,
                   inplace=False)
  self.assertTrue(
      ds1.sparsity(OMIC.transcriptomic) > ds.sparsity(OMIC.transcriptomic) and
      ds1.sparsity(OMIC.proteomic) > ds.sparsity(OMIC.proteomic))
def test_basic_functionalities(self):
  ds = get_dataset('8kmy')
  # split
  train, test = ds.split()
  self.assertEqual(set(train.cell_id) | set(test.cell_id), set(ds.cell_id))
  # copy
  copy1 = ds.copy()
  # copy backed dataset
  copy2 = train.copy()
  # copy view dataset
  copy3 = ds.copy().apply_indices(test.indices)
  _equal(self, copy1, ds)
  _equal(self, copy2, train)
  _equal(self, copy3, test)
  # split again
  train1, test1 = ds.split()
  train.assert_matching_cells(train1)
  test.assert_matching_cells(test1)
  _equal(self, train, train1)
  _equal(self, test, test1)
def prepare(self):
  with catch_warnings_ignore(RuntimeWarning):
    sco = get_dataset('cortex')
    om1, om2 = sco.omics
    train, test = sco.split(train_percent=0.8, seed=1)
    n_gene = sco.numpy(om1).shape[1]
    n_prot = sco.numpy(om2).shape[1]
    rvs = [
        RandomVariable(n_gene, 'zinbd', om1.name),
        RandomVariable(n_prot, 'onehot', om2.name)
    ]
    all_models = [DeepCountAutoencoder, SCALE, SCVI, VariationalAutoEncoder]
    all_configs = [
        NetworkConfig(),
        NetworkConfig(pyramid=True),
        NetworkConfig(use_conv=True),
        NetworkConfig(pyramid=True, use_conv=True)
    ]
    return train, test, rvs, all_models, all_configs
def test_embedding(self):
  ds = get_dataset('8kmy')
  ds.probabilistic_embedding(OMIC.proteomic)
  prob = ds.probability()
  bina = ds.binary()
  self.assertTrue(np.all(np.logical_and(0. < prob, prob < 1.)))
  self.assertTrue(np.all(np.unique(bina) == np.unique([0., 1.])))
  for algo in ('pca', 'tsne'):
    n = ds.n_obs
    pca1 = ds.dimension_reduce(n_components=2, algo=algo)
    pca2 = ds.dimension_reduce(OMIC.proteomic, n_components=3, algo=algo)
    self.assertTrue(pca1.shape == (n, 2))
    self.assertTrue(pca2.shape == (n, 3) if algo == 'pca' else
                    pca2.shape == (n, 2))
    name1 = '%s_%s' % (OMIC.proteomic.name, algo)
    name2 = '%s_%s' % (OMIC.transcriptomic.name, algo)
    self.assertTrue(name1 in ds.obsm and name1 in ds.uns)
    self.assertTrue(name2 in ds.obsm and name2 in ds.uns)
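# The cached embeddings follow the '<omic>_<algo>' naming convention asserted
# above, so a reduction can be computed once and fetched back from `obsm`:
#   ds.dimension_reduce(OMIC.proteomic, n_components=2, algo='pca')
#   Z = ds.obsm['proteomic_pca']  # assumes OMIC.proteomic.name == 'proteomic'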
def test_scvi(self):
  sco = get_dataset(_DS)
  train, test = sco.split()
  scvi = SCVI(RandomVariable(sco.n_vars, posterior='zinbd', name='rna'))
  scvi.fit(train, epochs=_EPOCHS, verbose=False)
  pX, (qZ, qL) = scvi.predict(test, verbose=False)
  self._loss_not_rise(scvi.train_history['loss'])
  self._loss_not_rise(scvi.valid_history['val_loss'])
  self.assertTrue(isinstance(pX.distribution, bay.distributions.ZeroInflated))
  self.assertTrue(
      isinstance(pX.distribution.count_distribution,
                 bay.distributions.NegativeBinomialDisp))
  self.assertTrue(pX.batch_shape[0] == 1 and pX.batch_shape[1] == test.n_obs)
  self.assertTrue(isinstance(qZ, bay.distributions.MultivariateNormalDiag))
  self.assertTrue(
      qZ.sample(1).shape == (1, test.n_obs, scvi.latents[0].event_shape[0]))
  self.assertTrue(isinstance(qL.distribution, bay.distributions.Normal))
  self.assertTrue(qL.sample(1).shape == (1, test.n_obs, 1))
def test_unsupervised_fit_predict(self):
  sco = get_dataset(_DS)
  train, test = sco.split()
  self.assertTrue(sco.n_omics >= 2)
  dca = DeepCountAutoencoder(outputs=RandomVariable(dim=sco.n_vars,
                                                    posterior='mse'),
                             latent_dim=10)
  dca.fit(train, epochs=_EPOCHS, verbose=False)
  dca.fit(train.numpy(), epochs=_EPOCHS, verbose=False)
  self._loss_not_rise(dca.train_history['loss'])
  self._loss_not_rise(dca.valid_history['val_loss'])
  pX, qZ = dca.predict(test, sample_shape=2, verbose=False)
  self.assertTrue(isinstance(pX, bay.distributions.VectorDeterministic))
  self.assertTrue(pX.batch_shape[0] == 2 and pX.batch_shape[1] == test.n_obs)
  self.assertTrue(isinstance(qZ, bay.distributions.VectorDeterministic))
  X = sco.numpy()[:128]
  pX, qZ = dca.predict(X, sample_shape=2, verbose=False)
  self.assertTrue(isinstance(pX, bay.distributions.VectorDeterministic))
  self.assertTrue(pX.batch_shape[0] == 2 and pX.batch_shape[1] == X.shape[0])
  self.assertTrue(isinstance(qZ, bay.distributions.VectorDeterministic))
def test_semi_supervised(self):
  sco = get_dataset(_DS)
  n_genes = sco.n_vars
  n_prots = sco.numpy(OMIC.proteomic).shape[1]
  sisua = SISUA(rna_dim=n_genes, adt_dim=n_prots, alternative_nb=True)
  sisua.fit(sco, epochs=_EPOCHS, verbose=False)
  self._loss_not_rise(sisua.train_history['loss'])
  self._loss_not_rise(sisua.valid_history['val_loss'])
  X = sco.numpy()[:128]
  (pX, pY), qZ = sisua.predict(X, sample_shape=2, verbose=False)
  self.assertTrue(isinstance(pX.distribution, bay.distributions.ZeroInflated))
  self.assertTrue(
      isinstance(pX.distribution.count_distribution,
                 bay.distributions.NegativeBinomialDisp))
  self.assertTrue(
      isinstance(pY.distribution, bay.distributions.NegativeBinomialDisp))
  self.assertTrue(pX.batch_shape[0] == 2 and pX.batch_shape[1] == X.shape[0])
  self.assertTrue(pY.batch_shape[0] == 2 and pY.batch_shape[1] == X.shape[0])
  self.assertTrue(isinstance(qZ, bay.distributions.MultivariateNormalDiag))
  self.assertTrue(
      qZ.sample(1).shape == (1, X.shape[0], sisua.latents[0].event_shape[0]))
def extract_pca(p_train, p_test):
  # p_train, p_test : the output and latent distributions
  pca = [
      fast_pca(squeeze(train.mean()), squeeze(test.mean()), n_components=2)[-1]
      for train, test in zip(p_train, p_test)
      if train.event_shape[0] > 1
  ]
  return pca


# ===========================================================================
# Load data
# ===========================================================================
sco = get_dataset('cortex')
train, test = sco.split(train_percent=0.8, seed=1)
n_gene = sco.numpy(OMIC.transcriptomic).shape[1]
n_prot = sco.numpy(OMIC.celltype).shape[1]
gene_rv = RVmeta(n_gene, 'zinb', 'rna')
prot_rv = RVmeta(n_prot, 'nb', 'adt')
latent_dim = 10
all_models = [SCALE, SCVI, DeepCountAutoencoder, VariationalAutoEncoder]
all_configs = [
    NetConf(),
    NetConf(pyramid=True),
    NetConf(use_conv=True),
    NetConf(pyramid=True, use_conv=True)
]
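# Usage sketch for extract_pca (variables are hypothetical): given matching
# lists of predicted distributions on train and test, it returns one 2-D PCA
# projection per distribution whose event dimension is larger than one, e.g.:
#   p_train = [model.predict(train, verbose=False)[0] for model in models]
#   p_test = [model.predict(test, verbose=False)[0] for model in models]
#   projections = extract_pca(p_train, p_test)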
def train_and_evaluate(ds_name, exp_name):
  from sisua.inference import InferenceSCVAE, InferenceSCVI, InferenceSISUA
  from sisua.analysis import Posterior, ResultsSheet
  ds, gene, prot = get_dataset(ds_name)
  # make sure the gene expression stays the same
  assert np.all(gene.X_train == gene_eval.X_train) and \
      np.all(gene.X_test == gene_eval.X_test)
  print("\n======== Running experiment ========")
  print("Training %d-proteins:" % len(prot.col_name),
        ', '.join([standardize_protein_name(i) for i in prot.col_name]))
  print("Testing %d-proteins:" % len(prot_eval.col_name),
        ', '.join([standardize_protein_name(i) for i in prot_eval.col_name]))
  n_prots = prot.feat_dim
  # ====== main model training ====== #
  models = [
      InferenceSCVAE(gene_dim=n_genes),
      InferenceSCVI(gene_dim=n_genes),
      InferenceSISUA(gene_dim=n_genes, prot_dim=n_prots),
  ]
  for m in models:
    m.fit(X=gene.X_train,
          y=prot.X_train if m.is_semi_supervised else None,
          corruption_rate=corruption_rate,
          corruption_dist=corruption_dist,
          n_epoch=n_epoch,
          batch_size=batch_size,
          detail_logging=False)
  # ====== evaluation ====== #
  pos = [Posterior(m, ds=eval_ds) for m in models]
  res = ResultsSheet(pos, verbose=True)
  res.plot_learning_curves().save_plots(
      os.path.join(FIGURE_PATH, 'learning_curves_%s.pdf' % exp_name))
  res.plot_correlation_marker_pairs().save_plots(
      os.path.join(FIGURE_PATH, 'correlation8k_%s.pdf' % exp_name))
  res.plot_latents_binary_scatter(test=False).plot_latents_binary_scatter(
      test=True).save_plots(
          os.path.join(FIGURE_PATH, 'latent8k_%s.pdf' % exp_name))
  res.plot_scores(score_type='classifier').save_plots(
      os.path.join(FIGURE_PATH, 'classifier8k_%s.pdf' % exp_name))
  # ====== cross dataset ====== #
  pos = [Posterior(m, ds=cross_ds) for m in models]
  res = ResultsSheet(pos, verbose=True)
  res.plot_correlation_marker_pairs().save_plots(
      os.path.join(FIGURE_PATH, 'correlationECC_%s.pdf' % exp_name))
  res.plot_latents_binary_scatter(test=False).plot_latents_binary_scatter(
      test=True).save_plots(
          os.path.join(FIGURE_PATH, 'latentECC_%s.pdf' % exp_name))
  res.plot_scores(score_type='classifier').save_plots(
      os.path.join(FIGURE_PATH, 'classifierECC_%s.pdf' % exp_name))
from odin.utils import ArgController, stdio
from odin.utils.mpi import MPI
from sisua.analysis import Posterior
from sisua.data import get_dataset
from sisua.models.autoencoder import DeepCountAutoencoder
from sisua.models.scvi_models import SCVI
from sisua.models.semi_supervised import MultitaskAutoEncoder, multitaskVAE
from sisua.models.variational_autoencoder import VariationalAutoEncoder

# turn off TF logging and set a reproducible random seed
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.random.set_seed(8)
np.random.seed(8)

x, y = get_dataset('pbmc8kly')
x_train, x_test = x.split()
y_train, y_test = y.split()
x_train.assert_matching_cells(y_train)
x_test.assert_matching_cells(y_test)

flags = (ArgController()
         .add('--no-train', 'Stop training', False)
         .add('--no-score', 'Stop scoring', False)
         .add('--analyze', "Analyzing", False)
         .parse())
no_train = flags.no_train
no_score = flags.no_score
analyze = flags.analyze
# assume the scores are already computed when analysis is enabled
if analyze:
  no_train = True
# ====== save path for all trained models ====== #
path = '/tmp/pbmc8k_cellvdj'
if os.path.isfile(path):
  raise ValueError("'%s' must be a folder path" % path)
if os.path.exists(path) and override:
  print("Overriding path: %s" % path)
  shutil.rmtree(path)
if not os.path.exists(path):
  os.mkdir(path)

# ===========================================================================
# Train on PBMC8k-ly
# ===========================================================================
x, y = get_dataset('pbmc8kly')
x_train, x_test = x.split()
y_train, y_test = y.split()
gene_name1 = x.var['geneid']
n_genes = x.shape[1]
n_prot = y.shape[1]
all_models = [
    DeepCountAutoencoder(units=n_genes),
    SCVI(units=n_genes),
    VariationalAutoEncoder(units=n_genes),
    MultitaskVAE(units=[n_genes, n_prot]),
    MultitaskVI(units=[n_genes, n_prot]),
]
max_evals = 80
algorithm = 'bayes'
freq = 1000  # i.e. the callback only runs at on_train_end
path = '/tmp/autotune'
if os.path.exists(path):
  shutil.rmtree(path)
os.mkdir(path)
# sc_metrics are more robust to NaN values
# TODO: accept a list of loss_name
stdio(os.path.join(path, 'fit_hyper.txt'))

# ===========================================================================
# Cortex
# ===========================================================================
x, y = get_dataset('cortex')
x.filter_cells(min_counts=1).filter_genes(min_counts=1)
gene = x.shape[1]
prot = y.shape[1]
SCVI.fit_hyper(x,
               loss_name='nllk0',
               model_kwargs=dict(units=gene, xdist='zinbd'),
               fit_kwargs=dict(epochs=epochs,
                               batch_size=batch_size,
                               callbacks=[NegativeLogLikelihood(freq=freq)]),
               max_evals=max_evals,
               save_path=os.path.join(path, 'scvi_cortex'),
               algorithm=algorithm,
               verbose=True)
FIGURE_PATH = '/tmp/cross_datasets'
corruption_rate = 0.25
corruption_dist = 'binomial'
n_epoch = 1
batch_size = 128
if not os.path.exists(FIGURE_PATH):
  os.mkdir(FIGURE_PATH)

# ===========================================================================
# Load dataset
# ===========================================================================
all_datasets = {
    '8k': get_dataset('cross8k_ly'),
    'ecc': get_dataset('crossecc_ly')
}
# ====== check gene expression is matching ====== #
genes_name = None
all_proteins = None
for name, (ds, gene, prot) in all_datasets.items():
  if genes_name is None:
    genes_name = gene.col_name
  else:
    assert np.all(gene.col_name == genes_name), \
        "Set of training genes mis-match"
  prots_name = set([standardize_protein_name(i) for i in prot.col_name])
def filtering_experiment_path(ds_name,
                              incl_keywords,
                              excl_keywords,
                              fn_filter=None,
                              return_dataset=False,
                              print_log=False,
                              exp_path=''):
  r""" Filter saved experiments by dataset name and keywords.

  Parameters
  ----------
  ds_name : string
      direct path to experiments folder or name of the dataset
  incl_keywords : string
      list of keywords for including the experiments (connected by ',')
  excl_keywords : string
      list of keywords for excluding the experiments (connected by ',')
  exp_path : string
      optional, if not given, use SISUA_EXP

  Return
  ------
  list of absolute paths to all satisfied experiments

  Note
  ----
  Only finished experiments are selected, i.e. the experiment folder
  contains the two files 'config.pkl' and 'model.pkl'
  """
  from sisua.data import EXP_DIR, get_dataset
  ds_name = str(ds_name)
  if exp_path is None:
    exp_path = ''
  exp_path = str(exp_path)
  if len(exp_path) == 0:
    exp_path = EXP_DIR
  assert os.path.isdir(exp_path), exp_path
  # ====== check the keywords ====== #
  if incl_keywords is None:
    incl_keywords = []
  if excl_keywords is None:
    excl_keywords = []
  if fn_filter is None:
    fn_filter = lambda keywords: True
  # ====== get the exp path ====== #
  if ds_name is None or return_dataset:
    (ds, gene_ds, prot_ds) = get_dataset(ds_name)
    ds_name = ds.name
  exp_path = os.path.join(exp_path, ds_name)
  assert os.path.exists(exp_path), \
      "Experiment path '%s' must exist" % exp_path
  # ====== extract all experiments ====== #
  all_exp = []
  for name in os.listdir(exp_path):
    path = os.path.join(exp_path, name)
    # check if the experiment is finished
    if os.path.exists(os.path.join(path, 'model.pkl')):
      all_exp.append(path)
  all_exp = sorted(all_exp)
  # ====== start filtering ====== #
  if isinstance(incl_keywords, string_types):
    incl_keywords = [i for i in str(incl_keywords).split(',') if len(i) > 0]
  elif isinstance(incl_keywords, (tuple, list)):
    incl_keywords = as_tuple(incl_keywords, t=str)
  else:
    raise ValueError("No support for incl_keywords type: %s" %
                     str(type(incl_keywords)))
  if isinstance(excl_keywords, string_types):
    excl_keywords = [i for i in str(excl_keywords).split(',') if len(i) > 0]
  elif isinstance(excl_keywords, (tuple, list)):
    excl_keywords = as_tuple(excl_keywords, t=str)
  else:
    raise ValueError("No support for excl_keywords type: %s" %
                     str(type(excl_keywords)))
  all_exp = [
      i for i in all_exp
      if all(
          any(j in keyword for keyword in os.path.basename(i).split('_'))
          for j in incl_keywords)
  ]
  all_exp = [
      i for i in all_exp
      if all(
          all(j not in keyword for keyword in os.path.basename(i).split('_'))
          for j in excl_keywords)
  ]
  # filter function
  all_exp = [i for i in all_exp if fn_filter(os.path.basename(i).split('_'))]
  # ====== logging ====== #
  if bool(print_log):
    print(ctext("Found following experiments:", 'lightyellow'))
    for path in all_exp:
      print(' ', os.path.basename(path))
  if return_dataset:
    return all_exp, ds, gene_ds, prot_ds
  return all_exp
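# Usage sketch (the keywords are hypothetical): select finished experiments
# of the 'cortex' dataset whose folder names contain 'vae' but not 'scvi':
#   exps = filtering_experiment_path('cortex',
#                                    incl_keywords='vae',
#                                    excl_keywords='scvi',
#                                    print_log=True)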
def cross_analyze(datasets, outpath, models, nprocess=1, verbose=False):
  from sisua.data import get_dataset
  from sisua.data.path import EXP_DIR
  from sisua.data.utils import standardize_protein_name
  assert nprocess > 0, "Number of processes must be greater than 0"
  datasets = as_tuple(datasets, t=string_types)
  assert len(datasets) > 1, \
      "Require more than one dataset for cross analysis"
  if not os.path.exists(outpath):
    os.mkdir(outpath)
  models = as_tuple(models, t=string_types)
  assert len(models) > 0, \
      "At least one model must be given"
  # ====== load datasets ====== #
  global all_datasets
  all_datasets = {name: get_dataset(name)[0] for name in datasets}
  all_datasets = [
      (name,
       dict(
           X=ds['X'][:],
           X_col=ds['X_col'],
           X_row=ds['X_row'],
           y=ds['y'],
           y_col=np.array(
               [standardize_protein_name(i) for i in ds['y_col']]),
       )) for name, ds in all_datasets.items()
  ]
  # ====== check gene expression is matching ====== #
  genes = all_datasets[0][1]['X_col']
  for name, ds in all_datasets:
    assert np.all(ds['X_col'] == genes), "Set of training genes mis-match"
  # ====== get the list of all overlapping proteins ====== #
  all_proteins = set(all_datasets[0][1]['y_col'])
  for name, ds in all_datasets:
    all_proteins &= set(ds['y_col'])
  all_proteins = sorted(all_proteins)
  # ====== only select certain proteins ====== #
  if verbose:
    print("Datasets       :", ctext(', '.join(datasets), 'yellow'))
    print("Models         :", ctext(', '.join(models), 'yellow'))
    print("Shared proteins:", ctext(', '.join(all_proteins), 'yellow'))
    for name, ds in all_datasets:
      print("  ", ctext(name, 'cyan'))
      print("    X    :", ds['X'].shape)
      print("    X_col:", ds['X_col'])
      print("    y    :", ds['y'].shape)
      print("    y_col:", ', '.join(ds['y_col']))
  # ====== load all the models ====== #
  all_models = []
  for ds_name in datasets:
    if verbose:
      print("Search model for dataset '%s' ..." % ctext(ds_name, 'yellow'))
    exp_path = os.path.join(EXP_DIR, ds_name)
    for model_name in os.listdir(exp_path):
      if model_name.split('_')[0] in models:
        path = os.path.join(exp_path, model_name, 'model.pkl')
        if os.path.exists(path):
          all_models.append(path)
          if verbose:
            print("  ", ctext(model_name, 'cyan'))
  if verbose:
    print("%s datasets and %s models => %s experiments" % (
        ctext(len(all_datasets), 'yellow'),
        ctext(len(all_models), 'yellow'),
        ctext(len(all_datasets) * len(all_models), 'yellow'),
    ))
  # ====== create all necessary dirs in advance ====== #
  all_data_name = [i[0] for i in all_datasets]
  all_model_name = [i.split('/')[-3] for i in all_models]
  for name1, name2 in product(all_data_name, all_model_name):
    path = os.path.join(
        outpath, 'data%s_model%s' % (name1.replace('_', '').upper(),
                                     name2.replace('_', '').upper()))
    if not os.path.exists(path):
      os.mkdir(path)
      if verbose:
        print("Create output folder:", ctext(path, 'yellow'))
  # ====== start generating the analysis ====== #
  processes = []
  for ds_name, ds in all_datasets:
    y_true = {
        i: j for i, j in zip(ds['y_col'], ds['y'].T) if i in all_proteins
    }
    # preserve the same order as all_proteins
    y_true = np.hstack([y_true[i][:, np.newaxis] for i in all_proteins])
    for model_path in all_models:
      processes.append(
          Process(target=_analyze,
                  args=(ds_name, model_path, outpath, y_true, all_proteins,
                        verbose)))
      if len(processes) >= nprocess:
        [p.start() for p in processes]
        [p.join() for p in processes]
        processes = []
  # finish the remaining processes
  if len(processes) > 0:
    [p.start() for p in processes]
    [p.join() for p in processes]
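# Usage sketch (the model keywords are hypothetical; the dataset names follow
# the cross-dataset scripts in this section):
#   cross_analyze(datasets=['cross8k_ly', 'crossecc_ly'],
#                 outpath='/tmp/cross_analysis',
#                 models=['vae', 'scvi'],
#                 nprocess=2,
#                 verbose=True)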
network = NetworkConfig(use_conv=True, pyramid=True, conv_proj=128)
kl = interpolation.const(vmax=1)
# kl = interpolation.linear(vmin=0,
#                           vmax=10,
#                           norm=20,
#                           cyclical=True,
#                           delayOut=5,
#                           delayIn=5)
# maximum number of data points for testing (visualization)
n_samples_visualization = 300
DS_NAME = 'pbmc8kly'

# ===========================================================================
# Load data
# ===========================================================================
gene, prot = get_dataset(DS_NAME)
X_train, X_test = gene.split()
y_train, y_test = prot.split()
print("Labels:", prot.var)
gene_rv = RandomVariable(gene.n_vars, posterior='zinbd', name='rna')
prot_rv = RandomVariable(prot.n_vars, posterior='nb', name='adt')
# ====== prepare the labels ====== #
labels_name = standardize_protein_name(prot.var.iloc[:, 0].to_numpy())
if not y_test.is_binary:
  y_test.probabilistic_embedding()
  labels = np.argmax(y_test.obsm['X_prob'], axis=-1)
else:
  labels = np.argmax(y_test.X, axis=-1)
labels = np.array([labels_name[i] for i in labels])
def test_clustering(self):
  ds = get_dataset('8kmy')
  with catch_warnings_ignore(EfficiencyWarning):
    ds.clustering(algo='kmeans')
    ds.clustering(algo='knn')
FIGURE_PATH = '/tmp/missing_protein'
corruption_rate = 0.25
corruption_dist = 'binomial'
n_epoch = 200
batch_size = 128
if not os.path.exists(FIGURE_PATH):
  os.mkdir(FIGURE_PATH)

# ===========================================================================
# Load dataset
# ===========================================================================
# for evaluating
ds_eval, gene_eval, prot_eval = get_dataset('cross8k_ly')
# for evaluating cross-dataset
ds_cross, gene_cross, prot_cross = get_dataset('crossecc_ly')
n_genes = gene_eval.feat_dim
eval_ds = dict(X_train=gene_eval.X_train,
               X_test=gene_eval.X_test,
               X_col=gene_eval.col_name,
               y_train=prot_eval.X_train,
               y_test=prot_eval.X_test,
               y_col=prot_eval.col_name)
cross_ds = dict(X_train=gene_cross.X_train,
                X_test=gene_cross.X_test,
from sisua.analysis import Posterior
from sisua.data import OMIC, get_dataset, standardize_protein_name
from sisua.models import (SCVI, SISUA, DeepCountAutoencoder, NetworkConfig,
                          RandomVariable, VariationalAutoEncoder)

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
tf.random.set_seed(8)
np.random.seed(8)
# TODO: update this tutorial

# ===========================================================================
# Loading Data
# ===========================================================================
sco = get_dataset('8kly')
print(sco)
train, test = sco.split(train_percent=0.9)
n_genes = sco.numpy(OMIC.transcriptomic).shape[1]
n_prots = sco.numpy(OMIC.proteomic).shape[1]
gene_omic = RandomVariable(n_genes, posterior='zinb', name='rna')
prot_omic = RandomVariable(n_prots, posterior='nb', name='adt')
network = NetworkConfig(nlayers=1,
                        hidden_dim=64,
                        pyramid=True,
                        use_conv=False,
                        input_dropout=0.)
latent_dim = 12
epochs = 3
analytic = False
from __future__ import print_function, division, absolute_import

from odin.stats import describe
from sisua.data import get_dataset
from sisua.label_threshold import ProbabilisticEmbedding

# ===========================================================================
# Load dataset
# ===========================================================================
FIGURE_PATH = '/tmp/tmp.pdf'
ds, _, _ = get_dataset('pbmc_citeseq')
protein = ds['y']
protein_name = ds['y_col']
print(protein.shape)
print(protein_name)

# ===========================================================================
# Probabilistic Embedding
# ===========================================================================
pb = ProbabilisticEmbedding(n_components_per_class=2,
                            positive_component=1,
                            log_norm=True,
                            clip_quartile=0.,
                            remove_zeros=True,
                            ci_threshold=-0.68,
                            random_state=5218,
                            verbose=True)
pb.fit(protein)
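# A fitted ProbabilisticEmbedding is typically used to turn the continuous
# protein counts into per-class probabilities and binary labels (cf. the
# y_prob / y_bin outputs mentioned in get_arguments above). The accessors are
# not shown in this excerpt; a hypothetical follow-up could be:
#   y_prob = pb.predict_proba(protein)  # hypothetical method name
#   y_bin = y_prob > 0.5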