def from_scvi_model(cls, scvi_model: SCVI, adata: Optional[AnnData] = None):
    """
    Instantiate a SOLO model from an scvi model.

    Parameters
    ----------
    scvi_model
        Pre-trained model of :class:`~scvi.model.SCVI`. This model should have
        been trained on data comprising one lane. The adata object used to
        initialize this model should have only been setup with count data,
        i.e., no `batch_key`, `labels_key`, etc.
    adata
        Optional anndata to use that is compatible with scvi_model.

    Returns
    -------
    SOLO model
    """
    _validate_scvi_model(scvi_model)

    # Simulated doublets drawn from the data the scvi model was trained on.
    doublet_adata = cls.create_doublets(scvi_model.adata)

    # When the module uses the observed library size, the "latent" library is
    # just the observed value, so no posterior mean is requested.
    sample_lib_mean = not scvi_model.module.use_observed_lib_size

    # Classifier input for real cells: latent representation + library size.
    singlet_rep = scvi_model.get_latent_representation()
    singlet_lib = scvi_model.get_latent_library_size(give_mean=sample_lib_mean)
    latent_adata = AnnData(np.concatenate([singlet_rep, singlet_lib], axis=1))
    latent_adata.obs[LABELS_KEY] = "singlet"

    logger.info("Creating doublets, preparing SOLO model.")
    captured = io.StringIO()
    with redirect_stdout(captured):  # silence setup_anndata console output
        setup_anndata(doublet_adata)
        doublet_rep = scvi_model.get_latent_representation(doublet_adata)
        doublet_lib = scvi_model.get_latent_library_size(
            doublet_adata, give_mean=sample_lib_mean
        )
        doublet_adata = AnnData(
            np.concatenate([doublet_rep, doublet_lib], axis=1)
        )
        doublet_adata.obs[LABELS_KEY] = "doublet"

        # Combine singlets and doublets into the classifier's training data.
        full_adata = latent_adata.concatenate(doublet_adata)
        setup_anndata(full_adata, labels_key=LABELS_KEY)
    return cls(full_adata)
def test_backed_anndata_scvi(save_path):
    """SCVI trains and runs inference on a disk-backed AnnData."""
    # Round-trip the synthetic data through disk so it is backed, not in-memory.
    synthetic = scvi.data.synthetic_iid()
    backed_path = os.path.join(save_path, "test_data.h5ad")
    synthetic.write_h5ad(backed_path)
    backed = anndata.read_h5ad(backed_path, backed="r+")

    setup_anndata(backed, batch_key="batch")
    scvi_model = SCVI(backed, n_latent=5)
    scvi_model.train(1, train_size=0.5)
    assert scvi_model.is_trained is True

    latent = scvi_model.get_latent_representation()
    assert latent.shape == (backed.shape[0], 5)
    scvi_model.get_elbo()
def test_scvi_sparse(save_path):
    """SCVI handles sparse (CSR) count matrices end to end."""
    n_latent = 5
    adata = synthetic_iid(run_setup_anndata=False)
    # Sparsify the counts before registration to exercise the sparse code path.
    adata.X = csr_matrix(adata.X)
    setup_anndata(adata)

    model = SCVI(adata, n_latent=n_latent)
    model.train(1, train_size=0.5)
    assert model.is_trained is True

    latent = model.get_latent_representation()
    assert latent.shape == (adata.shape[0], n_latent)

    # Smoke-test the main inference entry points on sparse input.
    model.get_elbo()
    model.get_marginal_ll(n_mc_samples=3)
    model.get_reconstruction_error()
    model.get_normalized_expression()
    model.differential_expression(groupby="labels", group1="label_1")
def test_scvi(save_path):
    """End-to-end smoke test of the SCVI model: training, inference methods,
    anndata setup transfer, label-mapping validation, and train callbacks."""
    n_latent = 5
    adata = synthetic_iid()
    model = SCVI(adata, n_latent=n_latent)
    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    # Retrain with a custom variance activation to cover that code path.
    model = SCVI(adata, n_latent=n_latent, var_activation=Softplus())
    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    # tests __repr__
    print(model)
    assert model.is_trained is True
    z = model.get_latent_representation()
    assert z.shape == (adata.shape[0], n_latent)
    assert len(model.history["elbo_train"]) == 1
    model.get_elbo()
    model.get_marginal_ll(n_mc_samples=3)
    model.get_reconstruction_error()
    model.get_normalized_expression(transform_batch="batch_1")

    # Same inference entry points on a second, freshly-setup dataset.
    adata2 = synthetic_iid()
    model.get_elbo(adata2)
    model.get_marginal_ll(adata2, n_mc_samples=3)
    model.get_reconstruction_error(adata2)
    latent = model.get_latent_representation(adata2, indices=[1, 2, 3])
    assert latent.shape == (3, n_latent)
    denoised = model.get_normalized_expression(adata2)
    assert denoised.shape == adata.shape
    denoised = model.get_normalized_expression(
        adata2, indices=[1, 2, 3], transform_batch="batch_1"
    )
    denoised = model.get_normalized_expression(
        adata2, indices=[1, 2, 3], transform_batch=["batch_0", "batch_1"]
    )
    assert denoised.shape == (3, adata2.n_vars)

    # posterior_predictive_sample: full data, subset, and multi-sample shapes.
    sample = model.posterior_predictive_sample(adata2)
    assert sample.shape == adata2.shape
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"]
    )
    assert sample.shape == (3, 2)
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"], n_samples=3
    )
    assert sample.shape == (3, 2, 3)

    # Feature correlation with both correlation types and batch transforms.
    model.get_feature_correlation_matrix(correlation_type="pearson")
    model.get_feature_correlation_matrix(
        adata2,
        indices=[1, 2, 3],
        correlation_type="spearman",
        rna_size_factor=500,
        n_samples=5,
    )
    model.get_feature_correlation_matrix(
        adata2,
        indices=[1, 2, 3],
        correlation_type="spearman",
        rna_size_factor=500,
        n_samples=5,
        transform_batch=["batch_0", "batch_1"],
    )

    # Likelihood parameter shapes must agree across mean/dispersion/dropout.
    params = model.get_likelihood_parameters()
    assert params["mean"].shape == adata.shape
    assert (
        params["mean"].shape == params["dispersions"].shape == params["dropout"].shape
    )
    params = model.get_likelihood_parameters(adata2, indices=[1, 2, 3])
    assert params["mean"].shape == (3, adata.n_vars)
    params = model.get_likelihood_parameters(
        adata2, indices=[1, 2, 3], n_samples=3, give_mean=True
    )
    assert params["mean"].shape == (3, adata.n_vars)
    model.get_latent_library_size()
    model.get_latent_library_size(adata2, indices=[1, 2, 3])

    # test transfer_anndata_setup
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    model.get_elbo(adata2)

    # test automatic transfer_anndata_setup + on a view
    adata = synthetic_iid()
    model = SCVI(adata)
    adata2 = synthetic_iid(run_setup_anndata=False)
    model.get_elbo(adata2[:10])

    # test that we catch incorrect mappings
    adata = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    # "label_4" is not in the reference mapping, so validation must fail.
    adata2.uns["_scvi"]["categorical_mappings"]["_scvi_labels"]["mapping"] = np.array(
        ["label_4", "label_0", "label_2"]
    )
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test that same mapping different order doesn't raise error
    adata = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    adata2.uns["_scvi"]["categorical_mappings"]["_scvi_labels"]["mapping"] = np.array(
        ["label_1", "label_0", "label_2"]
    )
    model.get_elbo(adata2)  # should automatically transfer setup

    # test mismatched categories raises ValueError
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs.labels.cat.rename_categories(["a", "b", "c"], inplace=True)
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test differential expression
    model.differential_expression(groupby="labels", group1="label_1")
    model.differential_expression(
        groupby="labels", group1="label_1", group2="label_2", mode="change"
    )
    model.differential_expression(groupby="labels")
    model.differential_expression(idx1=[0, 1, 2], idx2=[3, 4, 5])
    model.differential_expression(idx1=[0, 1, 2])

    # transform batch works with all different types
    a = synthetic_iid(run_setup_anndata=False)
    batch = np.zeros(a.n_obs)
    batch[:64] += 1  # two batches: first 64 cells are batch 1, rest batch 0
    a.obs["batch"] = batch
    setup_anndata(a, batch_key="batch")
    m = SCVI(a)
    m.train(1, train_size=0.5)
    m.get_normalized_expression(transform_batch=1)
    m.get_normalized_expression(transform_batch=[0, 1])

    # test get_likelihood_parameters() when dispersion=='gene-cell'
    model = SCVI(adata, dispersion="gene-cell")
    model.get_likelihood_parameters()

    # test train callbacks work
    a = synthetic_iid()
    m = scvi.model.SCVI(a)
    lr_monitor = LearningRateMonitor()
    m.train(
        callbacks=[lr_monitor],
        max_epochs=10,
        log_every_n_steps=1,
        plan_kwargs={"reduce_lr_on_plateau": True},
    )
    # LR monitor should have logged the Adam learning rate into history.
    assert "lr-Adam" in m.history.keys()
def test_scvi():
    """Smoke test of SCVI (legacy `frequency=` training API): training,
    inference methods, setup transfer, and mapping validation."""
    n_latent = 5
    adata = synthetic_iid()
    model = SCVI(adata, n_latent=n_latent)
    model.train(1, frequency=1, train_size=0.5)
    assert model.is_trained is True
    z = model.get_latent_representation()
    assert z.shape == (adata.shape[0], n_latent)
    # len of history should be 2 since metrics is also run once at the very end after training
    assert len(model.history["elbo_train_set"]) == 2
    model.get_elbo()
    model.get_marginal_ll()
    model.get_reconstruction_error()
    model.get_normalized_expression(transform_batch="batch_1")

    # Same inference entry points on a second, freshly-setup dataset.
    adata2 = synthetic_iid()
    model.get_elbo(adata2)
    model.get_marginal_ll(adata2)
    model.get_reconstruction_error(adata2)
    latent = model.get_latent_representation(adata2, indices=[1, 2, 3])
    assert latent.shape == (3, n_latent)
    denoised = model.get_normalized_expression(adata2)
    assert denoised.shape == adata.shape
    denoised = model.get_normalized_expression(
        adata2, indices=[1, 2, 3], transform_batch="batch_1"
    )
    denoised = model.get_normalized_expression(
        adata2, indices=[1, 2, 3], transform_batch=["batch_0", "batch_1"]
    )
    assert denoised.shape == (3, adata2.n_vars)

    # posterior_predictive_sample: full data, subset, and multi-sample shapes.
    sample = model.posterior_predictive_sample(adata2)
    assert sample.shape == adata2.shape
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"]
    )
    assert sample.shape == (3, 2)
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"], n_samples=3
    )
    assert sample.shape == (3, 2, 3)

    # Feature correlation with both correlation types and batch transforms.
    model.get_feature_correlation_matrix(correlation_type="pearson")
    model.get_feature_correlation_matrix(
        adata2,
        indices=[1, 2, 3],
        correlation_type="spearman",
        rna_size_factor=500,
        n_samples=5,
    )
    model.get_feature_correlation_matrix(
        adata2,
        indices=[1, 2, 3],
        correlation_type="spearman",
        rna_size_factor=500,
        n_samples=5,
        transform_batch=["batch_0", "batch_1"],
    )

    # Likelihood parameter shapes must agree across mean/dispersion/dropout.
    params = model.get_likelihood_parameters()
    assert params["mean"].shape == adata.shape
    assert (
        params["mean"].shape == params["dispersions"].shape == params["dropout"].shape
    )
    params = model.get_likelihood_parameters(adata2, indices=[1, 2, 3])
    assert params["mean"].shape == (3, adata.n_vars)
    params = model.get_likelihood_parameters(
        adata2, indices=[1, 2, 3], n_samples=3, give_mean=True
    )
    assert params["mean"].shape == (3, adata.n_vars)
    model.get_latent_library_size()
    model.get_latent_library_size(adata2, indices=[1, 2, 3])

    # test transfer_anndata_setup
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    model.get_elbo(adata2)

    # test automatic transfer_anndata_setup + on a view
    adata = synthetic_iid()
    model = SCVI(adata)
    adata2 = synthetic_iid(run_setup_anndata=False)
    model.get_elbo(adata2[:10])

    # test that we catch incorrect mappings
    adata = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    adata2.uns["_scvi"]["categorical_mappings"]["_scvi_labels"][
        "mapping"] = np.array(["label_1", "label_0", "label_2"])
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test mismatched categories raises ValueError
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs.labels.cat.rename_categories(["a", "b", "c"], inplace=True)
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test differential expression
    model.differential_expression(groupby="labels", group1="label_1")
    model.differential_expression(
        groupby="labels", group1="label_1", group2="label_2", mode="change"
    )
    model.differential_expression(groupby="labels")
    model.differential_expression(idx1=[0, 1, 2], idx2=[3, 4, 5])
    model.differential_expression(idx1=[0, 1, 2])

    # transform batch works with all different types
    a = synthetic_iid(run_setup_anndata=False)
    batch = np.zeros(a.n_obs)
    batch[:64] += 1  # two batches: first 64 cells are batch 1, rest batch 0
    a.obs["batch"] = batch
    setup_anndata(a, batch_key="batch")
    m = SCVI(a)
    m.train(1, train_size=0.5)
    m.get_normalized_expression(transform_batch=1)
    m.get_normalized_expression(transform_batch=[0, 1])
def from_scvi_model(
    cls,
    scvi_model: SCVI,
    adata: Optional[AnnData] = None,
    restrict_to_batch: Optional[str] = None,
    doublet_ratio: int = 2,
    **classifier_kwargs,
):
    """
    Instantiate a SOLO model from an scvi model.

    Parameters
    ----------
    scvi_model
        Pre-trained model of :class:`~scvi.model.SCVI`. The adata object
        used to initialize this model should have only been setup with
        count data, and optionally a `batch_key`; i.e., no extra
        covariates or labels, etc.
    adata
        Optional anndata to use that is compatible with scvi_model.
    restrict_to_batch
        Batch category in `batch_key` used to setup adata for scvi_model
        to restrict Solo model to. This allows to train a Solo model on
        one batch of a scvi_model that was trained on multiple batches.
    doublet_ratio
        Ratio of generated doublets to produce relative to number of
        cells in adata or length of indices, if not `None`.
    **classifier_kwargs
        Keyword args for :class:`~scvi.module.Classifier`

    Returns
    -------
    SOLO model
    """
    _validate_scvi_model(scvi_model, restrict_to_batch=restrict_to_batch)
    orig_adata_manager = scvi_model.adata_manager
    # Batch column name as registered when the scvi model was set up.
    orig_batch_key = orig_adata_manager.get_state_registry(
        REGISTRY_KEYS.BATCH_KEY
    ).original_key

    if adata is not None:
        # Register the user-supplied adata under the scvi model's setup.
        adata_manager = orig_adata_manager.transfer_setup(adata)
        cls.register_manager(adata_manager)
    else:
        adata_manager = orig_adata_manager
        adata = adata_manager.adata

    if restrict_to_batch is not None:
        batch_mask = adata.obs[orig_batch_key] == restrict_to_batch
        if np.sum(batch_mask) == 0:
            raise ValueError(
                "Batch category given to restrict_to_batch not found.\n"
                + "Available categories: {}".format(
                    adata.obs[orig_batch_key].astype("category").cat.categories
                )
            )
        # indices in adata with restrict_to_batch category
        batch_indices = np.where(batch_mask)[0]
    else:
        # use all indices
        batch_indices = None

    # anndata with only generated doublets
    doublet_adata = cls.create_doublets(
        adata_manager, indices=batch_indices, doublet_ratio=doublet_ratio
    )
    # if scvi wasn't trained with batch correction having the
    # zeros here does nothing.
    doublet_adata.obs[orig_batch_key] = (
        restrict_to_batch if restrict_to_batch is not None else 0
    )

    # if model is using observed lib size, needs to get lib sample
    # which is just observed lib size on log scale
    give_mean_lib = not scvi_model.module.use_observed_lib_size

    # get latent representations and make input anndata
    latent_rep = scvi_model.get_latent_representation(
        adata, indices=batch_indices
    )
    # Library size is log-transformed so singlet/doublet features match scale.
    lib_size = scvi_model.get_latent_library_size(
        adata, indices=batch_indices, give_mean=give_mean_lib
    )
    latent_adata = AnnData(
        np.concatenate([latent_rep, np.log(lib_size)], axis=1)
    )
    latent_adata.obs[LABELS_KEY] = "singlet"
    # Keep the original cell names (restricted to the batch, if any) so the
    # SOLO predictions can be mapped back to cells.
    orig_obs_names = adata.obs_names
    latent_adata.obs_names = (
        orig_obs_names[batch_indices] if batch_indices is not None else orig_obs_names
    )

    logger.info("Creating doublets, preparing SOLO model.")
    f = io.StringIO()
    with redirect_stdout(f):  # suppress setup_anndata console output
        scvi_model.setup_anndata(doublet_adata, batch_key=orig_batch_key)
        doublet_latent_rep = scvi_model.get_latent_representation(doublet_adata)
        doublet_lib_size = scvi_model.get_latent_library_size(
            doublet_adata, give_mean=give_mean_lib
        )
        doublet_adata = AnnData(
            np.concatenate([doublet_latent_rep, np.log(doublet_lib_size)], axis=1)
        )
        doublet_adata.obs[LABELS_KEY] = "doublet"

        # Combined singlet + doublet data becomes the classifier training set.
        full_adata = latent_adata.concatenate(doublet_adata)
        cls.setup_anndata(full_adata, labels_key=LABELS_KEY)
    return cls(full_adata, **classifier_kwargs)
def test_scvi():
    """Smoke test of SCVI (version with `undefined_*` label categories):
    training, inference methods, setup transfer, and mapping validation."""
    n_latent = 5
    adata = synthetic_iid()
    model = SCVI(adata, n_latent=n_latent)
    model.train(1)
    assert model.is_trained is True
    z = model.get_latent_representation()
    assert z.shape == (adata.shape[0], n_latent)
    model.get_elbo()
    model.get_marginal_ll()
    model.get_reconstruction_error()
    model.get_normalized_expression()

    # Same inference entry points on a second, freshly-setup dataset.
    adata2 = synthetic_iid()
    model.get_elbo(adata2)
    model.get_marginal_ll(adata2)
    model.get_reconstruction_error(adata2)
    latent = model.get_latent_representation(adata2, indices=[1, 2, 3])
    assert latent.shape == (3, n_latent)
    denoised = model.get_normalized_expression(adata2)
    assert denoised.shape == adata.shape
    denoised = model.get_normalized_expression(
        adata2, indices=[1, 2, 3], transform_batch=1
    )
    assert denoised.shape == (3, adata2.n_vars)

    # posterior_predictive_sample: full data, subset, and multi-sample shapes.
    sample = model.posterior_predictive_sample(adata2)
    assert sample.shape == adata2.shape
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"]
    )
    assert sample.shape == (3, 2)
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"], n_samples=3
    )
    assert sample.shape == (3, 2, 3)

    model.get_feature_correlation_matrix(correlation_type="pearson")
    model.get_feature_correlation_matrix(
        adata2,
        indices=[1, 2, 3],
        correlation_type="spearman",
        rna_size_factor=500,
        n_samples=5,
    )

    # Likelihood parameter shapes must agree across mean/dispersion/dropout.
    params = model.get_likelihood_parameters()
    assert params["mean"].shape == adata.shape
    assert (
        params["mean"].shape == params["dispersions"].shape == params["dropout"].shape
    )
    params = model.get_likelihood_parameters(adata2, indices=[1, 2, 3])
    assert params["mean"].shape == (3, adata.n_vars)
    params = model.get_likelihood_parameters(
        adata2, indices=[1, 2, 3], n_samples=3, give_mean=True
    )
    assert params["mean"].shape == (3, adata.n_vars)
    model.get_latent_library_size()
    model.get_latent_library_size(adata2, indices=[1, 2, 3])

    # test transfer_anndata_setup
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    model.get_elbo(adata2)

    # test automatic transfer_anndata_setup + on a view
    adata = synthetic_iid()
    model = SCVI(adata)
    adata2 = synthetic_iid(run_setup_anndata=False)
    model.get_elbo(adata2[:10])

    # test that we catch incorrect mappings
    adata = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    adata2.uns["_scvi"]["categorical_mappings"]["_scvi_labels"][
        "mapping"] = pd.Index(
            data=["undefined_1", "undefined_0", "undefined_2"])
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test mismatched categories raises ValueError
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs.labels.cat.rename_categories(["a", "b", "c"], inplace=True)
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test differential expression
    model.differential_expression(groupby="labels", group1="undefined_1")
    model.differential_expression(
        groupby="labels", group1="undefined_1", group2="undefined_2", mode="change"
    )
    model.differential_expression(groupby="labels")
    model.differential_expression(idx1=[0, 1, 2], idx2=[3, 4, 5])
    model.differential_expression(idx1=[0, 1, 2])
# Script fragment: trains an SCVI model and writes latent / normalized /
# imputed outputs to disk. `args`, `adata`, `loom_dataset`, `n_hidden`, and
# `n_epochs` are defined upstream of this chunk (presumably args = sys.argv).
n_latent = int(args[6])
n_layers = int(args[7])

vae = SCVI(
    adata,
    n_hidden=n_hidden,
    n_latent=n_latent,
    n_layers=n_layers,
    dispersion='gene-batch',
    use_cuda=True,
)
vae  # bare expression: displays the model in a notebook, no effect in a script
vae.train(n_epochs=n_epochs, n_epochs_kl_warmup=1)

# extract
latent = vae.get_latent_representation()
normalized_values = vae.get_normalized_expression()

# Latent representation is always written, regardless of args[8].
with open(args[1] + ".csv", 'wb') as f:
    np.savetxt(f, latent, delimiter=",")

if args[8] == 'IMPUTE':
    # NOTE(review): `imputed_values` is never assigned anywhere in this chunk —
    # this branch raises NameError unless it is defined upstream; confirm.
    out = pd.DataFrame(imputed_values)
    out.columns = loom_dataset.gene_names
    out.index = loom_dataset.CellID
    out.to_hdf(args[1] + '.impute.hdf', key='index')

if args[8] == 'NORM':
    out = pd.DataFrame(normalized_values)
    out.columns = loom_dataset.gene_names
    out.index = loom_dataset.CellID
    out.to_hdf(args[1] + '.norm.hdf', key='index')