def test_concatenate_mixed():
    X1 = sparse.csr_matrix(np.array([[1, 2, 0], [4, 0, 6], [0, 0, 9]]))
    X2 = sparse.csr_matrix(np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]]))
    X3 = sparse.csr_matrix(np.array([[1, 0, 3], [0, 0, 6], [0, 8, 0]]))
    X4 = np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]])
    adata1 = AnnData(
        X1,
        dict(obs_names=["s1", "s2", "s3"], anno1=["c1", "c2", "c3"]),
        dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]),
        layers=dict(counts=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=["s4", "s5", "s6"], anno1=["c3", "c4", "c5"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]),
        layers=dict(counts=X4),  # sic
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=["s7", "s8", "s9"], anno2=["d3", "d4", "d5"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 2, 3], annoB=[0, 1, 2]),
        layers=dict(counts=X3),
    )
    adata4 = AnnData(
        X4,
        dict(obs_names=["s4", "s5", "s6"], anno1=["c3", "c4", "c5"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]),
        layers=dict(counts=X2),  # sic
    )
    adata_all = AnnData.concatenate(adata1, adata2, adata3, adata4)
    assert isinstance(adata_all.X, sparse.csr_matrix)
    assert isinstance(adata_all.layers["counts"], sparse.csr_matrix)
def test_scvi_linear():
    n_samples = 4
    n_genes = 7
    batch1 = np.random.randint(1, 5, size=(n_samples, n_genes))
    batch2 = np.random.randint(1, 5, size=(n_samples, n_genes))
    ad1 = AnnData(batch1)
    ad2 = AnnData(batch2)
    adata = ad1.concatenate(ad2, batch_categories=['test1', 'test2'])
    n_latent = 30
    gene_subset = ['1', '4', '6']
    sce.pp.scvi(
        adata,
        use_cuda=False,
        n_epochs=1,
        n_latent=n_latent,
        return_posterior=True,
        batch_key='batch',
        linear_decoder=True,
        subset_genes=gene_subset,
    )
    assert adata.obsm['X_scvi'].shape == (n_samples * 2, n_latent)
    assert adata.obsm['X_scvi_denoised'].shape == (n_samples * 2, len(gene_subset))
    assert adata.obsm['X_scvi_sample_rate'].shape == (n_samples * 2, len(gene_subset))
    assert adata.uns['ldvae_loadings'].shape == (len(gene_subset), n_latent)
    assert len(adata.uns['ldvae_loadings'].index) == len(gene_subset)
    assert set(adata.uns['ldvae_loadings'].index) == set(gene_subset)
def test_concatenate_mixed():
    X1 = csr_matrix(np.array([[1, 2, 0], [4, 0, 6], [0, 0, 9]]))
    X2 = csr_matrix(np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]]))
    X3 = csr_matrix(np.array([[1, 0, 3], [0, 0, 6], [0, 8, 0]]))
    X4 = np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]])
    adata1 = AnnData(
        X1,
        dict(obs_names=['s1', 's2', 's3'], anno1=['c1', 'c2', 'c3']),
        dict(var_names=['a', 'b', 'c'], annoA=[0, 1, 2]),
        layers=dict(counts=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=['s4', 's5', 's6'], anno1=['c3', 'c4', 'c5']),
        dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]),
        layers=dict(counts=X4),  # sic
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=['s7', 's8', 's9'], anno2=['d3', 'd4', 'd5']),
        dict(var_names=['d', 'c', 'b'], annoA=[0, 2, 3], annoB=[0, 1, 2]),
        layers=dict(counts=X3),
    )
    adata4 = AnnData(
        X4,
        dict(obs_names=['s4', 's5', 's6'], anno1=['c3', 'c4', 'c5']),
        dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]),
        layers=dict(counts=X2),  # sic
    )
    adata_all = AnnData.concatenate(adata1, adata2, adata3, adata4)
    assert isspmatrix_csr(adata_all.X)
    assert isspmatrix_csr(adata_all.layers['counts'])
def load_normalized_data(file_path, log1p=True):
    """Load normalized data.

    1. Load filtered data for both FACS and droplet
    2. Size-factor normalization to counts per 10 thousand
    3. log(x+1) transform
    4. Combine the data

    Args:
        file_path (str): File path. Should contain both the FACS data
            facs_filtered.h5ad and the droplet data droplet_filtered.h5ad.

    Returns:
        adata_combine (AnnData): Combined data for FACS and droplet.
    """
    # Load filtered data
    adata_facs = read_h5ad(f'{file_path}/facs_filtered.h5ad')
    adata_droplet = read_h5ad(f'{file_path}/droplet_filtered.h5ad')

    # Size factor normalization
    sc.pp.normalize_per_cell(adata_facs, counts_per_cell_after=1e4)
    sc.pp.normalize_per_cell(adata_droplet, counts_per_cell_after=1e4)

    # log(x+1) transform
    if log1p:
        sc.pp.log1p(adata_facs)
        sc.pp.log1p(adata_droplet)

    # Combine the data
    ind_select = adata_facs.obs['age'].isin(['3m', '18m', '24m'])
    adata_facs = adata_facs[ind_select, :]
    adata_combine = AnnData.concatenate(adata_facs, adata_droplet,
                                        batch_key='b_method',
                                        batch_categories=['facs', 'droplet'])
    return adata_combine
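# Usage sketch (not from the original source): assuming a directory containing
# the two expected h5ad files, the loader returns one AnnData whose obs carries
# a 'b_method' column separating FACS from droplet cells. The path below is
# hypothetical.
#
#   adata = load_normalized_data('/data/tabula_muris_senis')
#   print(adata.obs['b_method'].value_counts())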
def concat_data(data_list, batch_categories=None, join='inner',
                batch_key='batch', index_unique=None, save=None):
    """
    Concat multiple datasets
    """
    # Check for a single path first: calling len() on a one-character string
    # would otherwise be caught by the single-element branch below.
    if isinstance(data_list, str):
        return load_files(data_list)
    elif len(data_list) == 1:
        return load_files(data_list[0])
    adata_list = []
    for root in data_list:
        adata = load_files(root)
        adata_list.append(adata)

    if batch_categories is None:
        batch_categories = list(map(str, range(len(adata_list))))
    else:
        assert len(adata_list) == len(batch_categories)
    for adata, b in zip(adata_list, batch_categories):
        print(b, adata.shape)

    concat = AnnData.concatenate(
        *adata_list, join=join, batch_key=batch_key,
        batch_categories=batch_categories, index_unique=index_unique)
    if save:
        concat.write(save, compression='gzip')
    return concat
def map_query_data(cls, corrected_reference: AnnData, query: AnnData,
                   reference_model: Union[str, 'scgen'],
                   batch_key: str = 'study',
                   return_latent: bool = True):
    """
    Removes the batch effect between reference and query data.
    Additional training on query data is not needed.

    Parameters
    ----------
    corrected_reference: `~anndata.AnnData`
        Already corrected reference anndata object.
    query: `~anndata.AnnData`
        Query anndata object.
    reference_model: `str` or `scgen`
        Path to a saved reference model, or the reference model object itself.
    batch_key: `str`
        Batch label key in query.obs.
    return_latent: `bool`
        If `True`, returns the corrected latent representation.

    Returns
    -------
    integrated: `~anndata.AnnData`
        Returns an integrated query.
    """
    query_batches_labels = query.obs[batch_key].unique().tolist()
    query_adata_by_batches = [
        query[query.obs[batch_key].isin([batch])].copy()
        for batch in query_batches_labels
    ]
    # Unpack the per-batch query AnnDatas so each is a separate positional
    # argument; passing the list itself would hand concatenate a list object.
    reference_query_adata = AnnData.concatenate(
        corrected_reference,
        *query_adata_by_batches,
        batch_key="reference_map",
        batch_categories=['reference'] + query_batches_labels,
        index_unique=None)
    reference_query_adata.obs['original_batch'] = \
        reference_query_adata.obs[batch_key].tolist()

    # model passed as a file path
    if isinstance(reference_model, str):
        attr_dict, model_state_dict, var_names = cls._load_params(reference_model)
        _validate_var_names(query, var_names)
        init_params = cls._get_init_params_from_dict(attr_dict)
        new_model = cls(reference_query_adata, **init_params)
        new_model.model.load_state_dict(model_state_dict)
        integrated_query = new_model.batch_removal(
            reference_query_adata,
            batch_key="reference_map",
            cell_label_key="cell_type",
            return_latent=return_latent)
        return integrated_query
    # model passed as a model object
    else:
        # when corrected_reference is already in the passed model
        if np.all(reference_model._get_user_attributes()[0][1].X == corrected_reference.X):
            integrated_query = reference_model.batch_removal(
                reference_query_adata,
                batch_key="reference_map",
                cell_label_key="cell_type",
                return_latent=return_latent)
        else:
            attr_dict = reference_model._get_public_attributes()
            model_state_dict = reference_model.model.state_dict()
            init_params = cls._get_init_params_from_dict(attr_dict)
            new_model = cls(reference_query_adata, **init_params)
            new_model.model.load_state_dict(model_state_dict)
            integrated_query = new_model.batch_removal(
                reference_query_adata,
                batch_key="reference_map",
                cell_label_key="cell_type",
                return_latent=return_latent)
        return integrated_query
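# Usage sketch (illustrative, assuming an scgen-like class exposing
# map_query_data as a classmethod; the class name and model path are
# hypothetical, not a pinned API):
#
#   integrated = SCGEN.map_query_data(
#       corrected_reference=reference_corrected,
#       query=query_adata,
#       reference_model='saved_models/reference_scgen',
#       batch_key='study',
#   )
#   integrated.obsm['latent']   # corrected latent space from batch_removal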
def project(
    new_dir_name: str,
    adata_ref: AnnData,
    scv: bool = False,
    batch_categories: Optional[list] = None,
) -> AnnData:
    """
    Project adata onto adata_ref using the ingest function in scanpy. The
    procedure follows the scanpy tutorial on integrating data using ingest.
    Both adata and adata_ref should have been preprocessed by running the
    preprocess function with need_scale=False.

    :param new_dir_name: the data directory to be projected
    :param adata_ref: the reference data
    :param scv: True for scVelo-based RNA velocity analysis
    :param batch_categories: the name for the reference should be the first element
    :return: a merged AnnData object with the two original data sets copied and merged.
    """
    # Check that leiden clustering has been conducted on the reference.
    if 'leiden' not in adata_ref.obs:
        raise ValueError("run cluster first for the reference data.")
    adata = None
    if scv:
        adata = scv_open(new_dir_name)
        scv_preprocess(adata)
    else:
        adata = open_10_genomics_data(new_dir_name)
        # Make sure we do the same thing as in the original data, but we don't
        # want to keep the original data.
        adata = preprocess(adata, copy=False, need_scale=False)
    # Check if we need to apply imputation
    if imputation_uns_key_name in adata_ref.uns_keys():
        # support magic only
        sc.external.pp.magic(
            adata, solver=adata_ref.uns[imputation_uns_key_name]['solver'])
        if 'min_value' in adata_ref.uns[imputation_uns_key_name].keys():
            # Shift up as in the original data
            adata.X -= adata_ref.uns[imputation_uns_key_name]['min_value']
    # Make sure both of them have the same set of variables
    shared_var_names = adata_ref.var_names.intersection(adata.var_names)
    sc.logging.info("shared_var_names: {}".format(len(shared_var_names)))
    # slicing the data to make copies
    adata = adata[:, shared_var_names]
    if not scv:
        # Call regress here so that we select almost the same set of genes as
        # adata_ref (aka highly variable genes)
        regressout_key = None
        if regressout_uns_key_name in adata.uns_keys():
            regressout_key = adata.uns[regressout_uns_key_name]
            sc.logging.info(
                "Found regressout_keys for projecting: {}".format(regressout_key))
        regress(adata, keys=regressout_key)
    adata_ref = adata_ref[:, shared_var_names]
    # ingest based on the leiden clustering
    sc.tl.ingest(adata, adata_ref, obs='leiden')
    # merge the two objects
    if not batch_categories:
        batch_categories = ['ref', 'new']
    adata_merged = adata_ref.concatenate(adata, batch_categories=batch_categories)
    return adata_merged
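# Usage sketch (illustrative; the directory and helper names are taken from the
# surrounding code, not a verified public API):
#
#   adata_ref = preprocess(open_10_genomics_data('data/reference'), need_scale=False)
#   sc.tl.leiden(adata_ref)   # 'leiden' must exist in adata_ref.obs before projecting
#   merged = project('data/new_sample', adata_ref, batch_categories=['ref', 'new'])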
def test_concatenate_sparse():
    # sparse data
    from scipy.sparse import csr_matrix

    X1 = csr_matrix([[0, 2, 3], [0, 5, 6]])
    X2 = csr_matrix([[0, 2, 3], [0, 5, 6]])
    X3 = csr_matrix([[1, 2, 0], [0, 5, 6]])
    adata1 = AnnData(
        X1,
        dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']),
        dict(var_names=['a', 'b', 'c']),
        layers=dict(Xs=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']),
        dict(var_names=['d', 'c', 'b']),
        layers=dict(Xs=X2),
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=['s5', 's6'], anno2=['d3', 'd4']),
        dict(var_names=['d', 'c', 'b']),
        layers=dict(Xs=X3),
    )

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]]
    assert adata.X.toarray().astype(int).tolist() == X_combined
    assert adata.layers['Xs'].toarray().astype(int).tolist() == X_combined

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    assert adata.X.toarray().tolist() == [
        [0.0, 2.0, 3.0, 0.0],
        [0.0, 5.0, 6.0, 0.0],
        [0.0, 3.0, 2.0, 0.0],
        [0.0, 6.0, 5.0, 0.0],
        [0.0, 0.0, 2.0, 1.0],
        [0.0, 6.0, 5.0, 0.0],
    ]
def test_concatenate_layers_outer(array_type, fill_val):
    # Testing that issue #368 is fixed
    a = AnnData(
        X=np.ones((10, 20)),
        layers={"a": array_type(sparse.random(10, 20, format="csr"))},
    )
    b = AnnData(X=np.ones((10, 20)))

    c = a.concatenate(b, join="outer", fill_value=fill_val, batch_categories=["a", "b"])

    np.testing.assert_array_equal(
        asarray(c[c.obs["batch"] == "b"].layers["a"]), fill_val
    )
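# One plausible pytest parametrization for the fixtures used above (an
# assumption for illustration, not copied from the anndata test suite):
#
#   @pytest.fixture(params=[np.asarray, sparse.csr_matrix])
#   def array_type(request):
#       return request.param
#
#   @pytest.fixture(params=[0, np.nan])
#   def fill_val(request):
#       return request.param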
def load_files(root):
    """
    Load single cell dataset from files
    """
    if root.split('/')[-1] == '*':
        adata = []
        for root in sorted(glob(root)):
            adata.append(load_file(root))
        return AnnData.concatenate(*adata, batch_key='sub_batch', index_unique=None)
    else:
        return load_file(root)
def concat_data(data_list, batch_categories=None, join='inner',
                batch_key='batch', index_unique=None, save=None):
    """
    Concatenate multiple datasets along the observations axis with name ``batch_key``.

    Parameters
    ----------
    data_list
        A path list of AnnData matrices to concatenate with. Each matrix is
        referred to as a "batch".
    batch_categories
        Categories for the batch annotation. By default, use increasing numbers.
    join
        Use intersection ('inner') or union ('outer') of variables of
        different batches. Default: 'inner'.
    batch_key
        Add the batch annotation to obs using this key. Default: 'batch'.
    index_unique
        Make the index unique by joining the existing index names with the
        batch category, using index_unique='-', for instance. Provide None to
        keep existing indices.
    save
        Path to save the new merged AnnData. Default: None.

    Returns
    -------
    New merged AnnData.
    """
    # Check for a single path first: calling len() on a one-character string
    # would otherwise be caught by the single-element branch below.
    if isinstance(data_list, str):
        return load_files(data_list)
    elif len(data_list) == 1:
        return load_files(data_list[0])
    adata_list = []
    for root in data_list:
        adata = load_files(root)
        adata_list.append(adata)

    if batch_categories is None:
        batch_categories = list(map(str, range(len(adata_list))))
    else:
        assert len(adata_list) == len(batch_categories)
    for adata, b in zip(adata_list, batch_categories):
        print(b, adata.shape)

    concat = AnnData.concatenate(
        *adata_list, join=join, batch_key=batch_key,
        batch_categories=batch_categories, index_unique=index_unique)
    if save:
        concat.write(save, compression='gzip')
    return concat
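# Usage sketch (paths are hypothetical): each entry is loaded via load_files,
# then merged along obs with a 'batch' column taking the given categories.
#
#   adata = concat_data(
#       ['data/batch1.h5ad', 'data/batch2.h5ad'],
#       batch_categories=['b1', 'b2'],
#       save='merged.h5ad',
#   )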
def test_concatenate_dense_duplicates():
    X1 = np.array([[1, 2, 3], [4, 5, 6]])
    X2 = np.array([[1, 2, 3], [4, 5, 6]])
    X3 = np.array([[1, 2, 3], [4, 5, 6]])

    # inner join duplicates
    adata1 = AnnData(
        X1,
        dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']),
        dict(
            var_names=['a', 'b', 'c'],
            annoA=[0, 1, 2],
            annoB=[1.1, 1.0, 2.0],
            annoC=[1.1, 1.0, 2.0],
            annoD=[2.1, 2.0, 3.0],
        ),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']),
        dict(
            var_names=['a', 'b', 'c'],
            annoA=[0, 1, 2],
            annoB=[1.1, 1.0, 2.0],
            annoC=[1.1, 1.0, 2.0],
            annoD=[2.1, 2.0, 3.0],
        ),
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']),
        dict(
            var_names=['a', 'b', 'c'],
            annoA=[0, 1, 2],
            annoB=[1.1, 1.0, 2.0],
            annoD=[2.1, 2.0, 3.1],
        ),
    )

    adata = adata1.concatenate(adata2, adata3)
    assert adata.var_keys() == [
        'annoA',
        'annoB',
        'annoC-0',
        'annoD-0',
        'annoC-1',
        'annoD-1',
        'annoD-2',
    ]
def from_scvi_model(cls, scvi_model: SCVI, adata: Optional[AnnData] = None):
    """
    Instantiate a SOLO model from an scvi model.

    Parameters
    ----------
    scvi_model
        Pre-trained model of :class:`~scvi.model.SCVI`. This model should have
        been trained on data comprising one lane. The adata object used to
        initialize this model should have only been setup with count data,
        i.e., no `batch_key`, `labels_key`, etc.
    adata
        Optional anndata to use that is compatible with scvi_model.

    Returns
    -------
    SOLO model
    """
    _validate_scvi_model(scvi_model)
    doublet_adata = cls.create_doublets(scvi_model.adata)

    # if model is using observed lib size, needs to get lib sample
    # which is just observed lib size on log scale
    give_mean_lib = not scvi_model.module.use_observed_lib_size

    # get latent representations and make input anndata
    latent_rep = scvi_model.get_latent_representation()
    lib_size = scvi_model.get_latent_library_size(give_mean=give_mean_lib)
    latent_adata = AnnData(np.concatenate([latent_rep, lib_size], axis=1))
    latent_adata.obs[LABELS_KEY] = "singlet"

    logger.info("Creating doublets, preparing SOLO model.")
    f = io.StringIO()
    with redirect_stdout(f):
        setup_anndata(doublet_adata)
        doublet_latent_rep = scvi_model.get_latent_representation(doublet_adata)
        doublet_lib_size = scvi_model.get_latent_library_size(
            doublet_adata, give_mean=give_mean_lib)
        doublet_adata = AnnData(
            np.concatenate([doublet_latent_rep, doublet_lib_size], axis=1))
        doublet_adata.obs[LABELS_KEY] = "doublet"

    full_adata = latent_adata.concatenate(doublet_adata)
    setup_anndata(full_adata, labels_key=LABELS_KEY)
    return cls(full_adata)
def train(self, adata_list, adj_list, l_list,
          num_pcs=50,
          lr=0.005,
          max_epochs=2000,
          weight_decay=0,
          opt="admin",
          init_spa=True,
          init="louvain",  # louvain or kmeans
          n_neighbors=10,  # for louvain
          n_clusters=None,  # for kmeans
          res=0.4,  # for louvain
          tol=1e-3):
    self.num_pcs = num_pcs
    self.res = res
    self.lr = lr
    self.max_epochs = max_epochs
    self.weight_decay = weight_decay
    self.opt = opt
    self.init_spa = init_spa
    self.init = init
    self.n_neighbors = n_neighbors
    self.n_clusters = n_clusters
    self.res = res
    self.tol = tol

    num_spots = 0
    for i in adata_list:
        num_spots += i.shape[0]
    # Use zeros, not empty: only the block diagonal is filled below, so the
    # off-diagonal entries must be 0 rather than uninitialized memory.
    adj_exp_all = np.zeros((num_spots, num_spots))
    start = 0
    for i in range(len(l_list)):
        l = l_list[i]
        adj = adj_list[i]
        adj_exp = np.exp(-1 * (adj ** 2) / (2 * (l ** 2)))
        adj_exp_all[start:start + adj_exp.shape[0],
                    start:start + adj_exp.shape[0]] = adj_exp
        # advance by the block size only, not start + block size
        start += adj_exp.shape[0]

    # one batch category per dataset, instead of a hard-coded pair
    self.adata_all = AnnData.concatenate(
        *adata_list, join='inner', batch_key="dataset_batch",
        batch_categories=[str(i) for i in range(len(adata_list))])

    pca = PCA(n_components=self.num_pcs)
    if issparse(self.adata_all.X):
        pca.fit(self.adata_all.X.A)
        embed = pca.transform(self.adata_all.X.A)
    else:
        pca.fit(self.adata_all.X)
        embed = pca.transform(self.adata_all.X)

    # ----------Train model----------
    self.model = simple_GC_DEC(embed.shape[1], embed.shape[1])
    self.model.fit(embed, adj_exp_all, lr=self.lr, max_epochs=self.max_epochs,
                   weight_decay=self.weight_decay, opt=self.opt,
                   init_spa=self.init_spa, init=self.init,
                   n_neighbors=self.n_neighbors, n_clusters=self.n_clusters,
                   res=self.res, tol=self.tol)
    self.embed = embed
    self.adj_exp = adj_exp_all
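# Usage sketch (illustrative, SpaGCN-style multi-section training; the class
# name and variable names are assumptions): adj_list holds one spot-spot
# distance matrix per section and l_list the matching length-scale parameters.
#
#   clf = MultiSectionSpaGCN()   # hypothetical class owning train()
#   clf.train([adata1, adata2], [adj1, adj2], [l1, l2], num_pcs=50)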
def test_load_timepoints_from_anndata_list():
    adata_ref = sc.datasets.pbmc3k()
    start = [596, 615, 1682, 1663, 1409, 1432]
    adata = AnnData.concatenate(
        *(adata_ref[i : i + 1000] for i in start),
        join="outer",
        batch_key="sample",
        batch_categories=[f"sa{i}_Rep{j}" for i, j in product((1, 2, 3), (1, 2))],
    )
    adata.obs["time_points"] = adata.obs["sample"].str.split("_", expand=True)[0]
    adata.obs["time_points"] = adata.obs["time_points"].astype("category")
    sc.pp.normalize_total(adata, target_sum=10000)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=1000, subset=True)
    sce.tl.harmony_timeseries(adata=adata, tp="time_points", n_components=None)
    assert all(
        [adata.obsp['harmony_aff'].shape[0], adata.obsp['harmony_aff_aug'].shape[0]]
    ), "harmony_timeseries augmented affinity matrix Error!"
def load_files(root):
    """
    Load single cell dataset from files

    Parameters
    ----------
    root
        The root path storing the single-cell data files; each file
        represents one dataset.

    Returns
    -------
    AnnData
    """
    if root.split('/')[-1] == '*':
        adata = []
        for root in sorted(glob(root)):
            adata.append(load_file(root))
        return AnnData.concatenate(*adata, batch_key='sub_batch', index_unique=None)
    else:
        return load_file(root)
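# Usage sketch: a trailing '*' triggers a glob over the directory, and each
# matched file becomes one 'sub_batch' in the concatenated result. The path is
# hypothetical.
#
#   adata = load_files('data/batches/*')
#   adata.obs['sub_batch'].value_counts()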
def test_scvi():
    n_samples = 4
    n_genes = 7
    batch1 = np.random.randint(1, 5, size=(n_samples, n_genes))
    batch2 = np.random.randint(1, 5, size=(n_samples, n_genes))
    ad1 = AnnData(batch1)
    ad2 = AnnData(batch2)
    adata = ad1.concatenate(ad2, batch_categories=['test1', 'test2'])
    n_latent = 30
    sce.pp.scvi(
        adata,
        use_cuda=False,
        n_epochs=1,
        n_latent=n_latent,
        return_posterior=True,
        batch_key='batch',
        model_kwargs={'reconstruction_loss': 'nb'},
    )
    assert adata.obsm['X_scvi'].shape == (n_samples * 2, n_latent)
    assert adata.obsm['X_scvi_denoised'].shape == adata.shape
    assert adata.obsm['X_scvi_sample_rate'].shape == adata.shape
def batch_removal(self, adata: Optional[AnnData] = None) -> AnnData:
    """
    Removes batch effects.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`,
        defaults to the AnnData object used to initialize the model. Must have
        been setup with `batch_key` and `labels_key`, corresponding to batch and
        cell type metadata, respectively.

    Returns
    -------
    corrected: `~anndata.AnnData`
        AnnData of corrected gene expression in adata.X and corrected latent
        space in adata.obsm["latent"]. A reference to the original AnnData is
        kept in `corrected.raw` if the input adata had a `raw` attribute.
    """
    adata = self._validate_anndata(adata)
    latent_all = self.get_latent_representation(adata)

    # use keys registered from `setup_anndata()`
    cell_label_key = self.scvi_setup_dict_["categorical_mappings"]["_scvi_labels"][
        "original_key"
    ]
    batch_key = self.scvi_setup_dict_["categorical_mappings"]["_scvi_batch"][
        "original_key"
    ]

    adata_latent = AnnData(latent_all)
    adata_latent.obs = adata.obs.copy(deep=True)
    unique_cell_types = np.unique(adata_latent.obs[cell_label_key])
    shared_ct = []
    not_shared_ct = []
    for cell_type in unique_cell_types:
        temp_cell = adata_latent[
            adata_latent.obs[cell_label_key] == cell_type
        ].copy()
        if len(np.unique(temp_cell.obs[batch_key])) < 2:
            cell_type_ann = adata_latent[
                adata_latent.obs[cell_label_key] == cell_type
            ]
            not_shared_ct.append(cell_type_ann)
            continue
        temp_cell = adata_latent[
            adata_latent.obs[cell_label_key] == cell_type
        ].copy()
        batch_list = {}
        batch_ind = {}
        max_batch = 0
        max_batch_ind = ""
        batches = np.unique(temp_cell.obs[batch_key])
        for i in batches:
            temp = temp_cell[temp_cell.obs[batch_key] == i]
            temp_ind = temp_cell.obs[batch_key] == i
            if max_batch < len(temp):
                max_batch = len(temp)
                max_batch_ind = i
            batch_list[i] = temp
            batch_ind[i] = temp_ind
        max_batch_ann = batch_list[max_batch_ind]
        for study in batch_list:
            delta = np.average(max_batch_ann.X, axis=0) - np.average(
                batch_list[study].X, axis=0
            )
            batch_list[study].X = delta + batch_list[study].X
            temp_cell[batch_ind[study]].X = batch_list[study].X
        shared_ct.append(temp_cell)

    all_shared_ann = AnnData.concatenate(
        *shared_ct, batch_key="concat_batch", index_unique=None
    )
    if "concat_batch" in all_shared_ann.obs.columns:
        del all_shared_ann.obs["concat_batch"]
    if len(not_shared_ct) < 1:
        corrected = AnnData(
            self.module.generative(torch.Tensor(all_shared_ann.X))["px"]
            .cpu()
            .numpy(),
            obs=all_shared_ann.obs,
        )
        corrected.var_names = adata.var_names.tolist()
        corrected = corrected[adata.obs_names]
        if adata.raw is not None:
            adata_raw = AnnData(X=adata.raw.X, var=adata.raw.var)
            adata_raw.obs_names = adata.obs_names
            corrected.raw = adata_raw
        corrected.obsm["latent"] = all_shared_ann.X
        corrected.obsm["corrected_latent"] = self.get_latent_representation(
            corrected
        )
        return corrected
    else:
        all_not_shared_ann = AnnData.concatenate(
            *not_shared_ct, batch_key="concat_batch", index_unique=None
        )
        all_corrected_data = AnnData.concatenate(
            all_shared_ann,
            all_not_shared_ann,
            batch_key="concat_batch",
            index_unique=None,
        )
        # check the merged object, not the shared-only one, before deleting
        if "concat_batch" in all_corrected_data.obs.columns:
            del all_corrected_data.obs["concat_batch"]
        corrected = AnnData(
            self.module.generative(torch.Tensor(all_corrected_data.X))["px"]
            .cpu()
            .numpy(),
            obs=all_corrected_data.obs,
        )
        corrected.var_names = adata.var_names.tolist()
        corrected = corrected[adata.obs_names]
        if adata.raw is not None:
            adata_raw = AnnData(X=adata.raw.X, var=adata.raw.var)
            adata_raw.obs_names = adata.obs_names
            corrected.raw = adata_raw
        corrected.obsm["latent"] = all_corrected_data.X
        corrected.obsm["corrected_latent"] = self.get_latent_representation(
            corrected
        )
        return corrected
def test_concatenate_with_raw():
    # dense data
    X1 = np.array([[1, 2, 3], [4, 5, 6]])
    X2 = np.array([[1, 2, 3], [4, 5, 6]])
    X3 = np.array([[1, 2, 3], [4, 5, 6]])
    X4 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

    adata1 = AnnData(
        X1,
        dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
        dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]),
        layers=dict(Xs=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]),
        layers=dict(Xs=X2),
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]),
        dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]),
        layers=dict(Xs=X3),
    )
    adata4 = AnnData(
        X4,
        dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
        dict(var_names=["a", "b", "c", "z"], annoA=[0, 1, 2, 3]),
        layers=dict(Xs=X4),
    )

    adata1.raw = adata1
    adata2.raw = adata2
    adata3.raw = adata3

    adata_all = AnnData.concatenate(adata1, adata2, adata3)
    assert isinstance(adata_all.raw, Raw)
    assert set(adata_all.raw.var_names) == {"b", "c"}
    assert_equal(adata_all.raw.to_adata().obs, adata_all.obs)
    assert np.array_equal(adata_all.raw.X, adata_all.X)

    adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer")
    assert isinstance(adata_all.raw, Raw)
    assert set(adata_all.raw.var_names) == set("abcd")
    assert_equal(adata_all.raw.to_adata().obs, adata_all.obs)
    assert np.array_equal(np.nan_to_num(adata_all.raw.X), np.nan_to_num(adata_all.X))

    adata3.raw = adata4
    adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer")
    assert isinstance(adata_all.raw, Raw)
    assert set(adata_all.raw.var_names) == set("abcdz")
    assert set(adata_all.var_names) == set("abcd")
    assert not np.array_equal(
        np.nan_to_num(adata_all.raw.X), np.nan_to_num(adata_all.X)
    )

    del adata3.raw
    with pytest.warns(
        UserWarning,
        match=(
            "Only some AnnData objects have `.raw` attribute, "
            "not concatenating `.raw` attributes."
        ),
    ):
        adata_all = AnnData.concatenate(adata1, adata2, adata3)
    assert adata_all.raw is None

    del adata1.raw
    del adata2.raw
    assert all(_adata.raw is None for _adata in (adata1, adata2, adata3))
    adata_all = AnnData.concatenate(adata1, adata2, adata3)
    assert adata_all.raw is None
def TCR_activation_10X():
    ans = list()
    for sample, activation in zip(samples1[1:3], ["unstimulated", "stimulated"]):
        print(sample)
        output_prefix = pjoin("results", run, sample, sample)
        b = sc.read(output_prefix + ".filtered.h5ad")
        b.obs = b.obs.assign(activation=activation)
        ans.append(b)
    output_prefix = (run + "/PD2XX1_10xscRNA_Human_Tcells_2S3Qmixed.samples_joined")

    a = AnnData.concatenate(*ans)
    assert (np.asarray(a.var["external_gene_name-0"]) ==
            np.asarray(a.var["external_gene_name-1"])).all()
    a.var["external_gene_name"] = a.var["external_gene_name-0"]
    # shuffle cells
    a = a[np.random.choice(a.obs.index.tolist(), a.obs.shape[0], replace=False), :]

    tech_attributes = [
        "log_counts",
        "log_genes",
        "efficiency_ratio",
        "percent_mito",
        "percent_ribo",
        "percent_malat1",
    ]
    attributes = ["activation"]

    params = Params()
    params.plot_raw = False

    sc.pp.scale(a)
    sc.pp.pca(a)
    sc.pp.neighbors(a)
    sc.tl.umap(a)

    # Plot
    # marker genes (tailored for a PBMC sample)
    mark1 = [
        "MALAT1",  # sc
        "CD34",  # HSC
        "CD3D", "CD3G", "CD247",  # T-cell
        "CD4", "FOXP3", "CCR7", "CTLA4",  # CD4
        "CD8A", "NKG7", "IL2RA",  # CD8
        "NCAM1", "GNLY",  # NK
        "CD14", "CST3",  # Monocytes
        "CD79A", "CD19", "IGHG1",  # B cells (also MS4A1)
        "FCER1G", "CLEC10A",  # dendritic cells
        "SELE", "CD93", "PECAM1", "KDR",  # endothelial cells
        "DCN", "COL6A2",  # fibroblasts
        "GZMB", "CD68", "CD27", "MS4A1", "CD24", "NCR1", "CD274",
    ]
    # Immune
    mark2 = [
        "IL32", "IFNG", "IFNGR1", "IL4R", "IL4", "JUN", "JUNB", "JUND",
        "JAK1", "JAK2", "GATA1", "JARID2", "KRAS", "MYC",
    ]
    mark3 = ["BTK", "LCK", "E2F4", "CXCR4", "ITGA4", "HBA1", "PTPRC"]
    red_mark = ["CST3", "TCL1A", "GZMB", "NKG7", "CD3D"]
    marker_genes = mark1 + mark2 + mark3

    sc.pl.pca_variance_ratio(a, log=True, show=False)
    plt.gca().figure.savefig(
        output_prefix + ".single_cell.pca_variance_ratio.svg",
        dpi=300,
        bbox_inches="tight",
    )

    # fig = sc.pl.pca(a, color=tech_attributes + attributes,
    #                 components=['1,2', '2,3', '3,4', '4,5'], return_fig=True)
    # for ax in fig.axes:
    #     ax.get_children()[0].set_rasterized(True)
    # fig.savefig(output_prefix + ".single_cell.pca.svg", dpi=300, bbox_inches="tight")

    fig = sc.pl.umap(a, color=tech_attributes + attributes, return_fig=True)
    for ax in fig.axes:
        ax.get_children()[0].set_rasterized(True)
    fig.savefig(output_prefix + ".single_cell.umap.svg", dpi=300, bbox_inches="tight")

    sc.pp.combat(a, "activation")
    sc.pp.scale(a)
    sc.pp.pca(a)
    sc.pp.neighbors(a)
    sc.tl.umap(a)

    fig = sc.pl.umap(a, color=tech_attributes + attributes, return_fig=True)
    for ax in fig.axes:
        ax.get_children()[0].set_rasterized(True)
    fig.savefig(
        output_prefix + ".single_cell.post_combat.umap.svg",
        dpi=300,
        bbox_inches="tight",
    )

    sc.pp.regress_out(a, keys=["log_counts"])
    sc.pp.scale(a)
    sc.pp.pca(a)
    sc.pp.neighbors(a)
    sc.tl.umap(a)

    fig = sc.pl.umap(a, color=tech_attributes + attributes, return_fig=True)
    for ax in fig.axes:
        ax.get_children()[0].set_rasterized(True)
    fig.savefig(
        output_prefix + ".single_cell.post_combat.post_regressout_counts.umap.svg",
        dpi=300,
        bbox_inches="tight",
    )
    # sc.write(output_prefix + ".regress_out_counts.h5ad", a)

    # Now regress out also efficiency_ratio
    sc.pp.regress_out(a, keys=["efficiency_ratio"])
    sc.pp.scale(a)
    sc.pp.pca(a)
    sc.pp.neighbors(a)
    sc.tl.umap(a)

    fig = sc.pl.umap(a, color=tech_attributes + attributes, return_fig=True)
    for ax in fig.axes:
        ax.get_children()[0].set_rasterized(True)
    fig.savefig(
        output_prefix
        + ".single_cell.post_combat.post_regressout_counts.post_regressout_efficiency.umap.svg",
        dpi=300,
        bbox_inches="tight",
    )
    # sc.write(output_prefix + ".regress_out_counts.regress_out_efficiency_ratio.h5ad", a)

    if params.plot_raw:
        g = [
            y for x in marker_genes
            for y in a.raw.var.loc[a.raw.var["external_gene_name"] == x].index.tolist()
        ]
    else:
        g = [x for x in marker_genes if x in a.var["external_gene_name"].tolist()]
    color = tech_attributes + attributes + g
    kwargs = dict(
        hspace=0.1,
        wspace=0,
        return_fig=True,
        use_raw=params.plot_raw,
        gene_symbols="external_gene_name" if not params.plot_raw else None,
    )
    fig = sc.pl.umap(a, color=color, **kwargs)
    for ax in fig.axes:
        ax.get_children()[0].set_rasterized(True)
    if params.plot_raw:
        for ax in fig.axes:
            try:
                ax.set_title(a.raw.var.loc[ax.get_title(), "external_gene_name"])
            except KeyError:
                pass
    fig.savefig(
        output_prefix
        + ".single_cell.post_combat.post_regressout_counts.post_regressout_efficiency.markers_extended.umap.svg",
        dpi=300,
        bbox_inches="tight",
    )

    # differential expression
    max_genes = 500
    attribute = "activation"
    attributes = [attribute]
    a.X += abs(a.X.min())

    diff = differential_expression(a, attribute, n_genes=max_genes)
    diff.to_csv(
        output_prefix + f".{attribute}.cluster_comparison.top_values.csv",
        index=True,
    )
    diff = pd.read_csv(
        output_prefix + f".{attribute}.cluster_comparison.top_values.csv",
        index_col=0,
    )

    fig = plot_differential_expression(diff)
    fig.savefig(
        output_prefix + f".{attribute}.differential_expression.ma_plot.svg",
        dpi=300,
        bbox_inches="tight",
    )

    # differential enrichment
    groups = [x for x in diff["group"].unique() if x not in ["-1", -1]]
    enrichments = differential_enrichment(
        diff,
        groups,
        attribute,
        alpha=0.05,
        alpha_col="pvals_adj",
        max_n=max_genes,
        sort_by="scores",
    )
    enrichments.to_csv(
        output_prefix + f"{attribute}.differential_enrichment.csv", index=False)
    enrichments = pd.read_csv(
        output_prefix + f"{attribute}.differential_enrichment.csv", index_col=0)

    g = (enrichments.set_index("description")
         .groupby(["group"])["combined_score"].nlargest(5))
    print("combined_score:\n", g)
    g = (enrichments.set_index("description")
         .groupby(["group"])["p_value"].nsmallest(5))
    print("p_value:\n", g)

    plot_differential_enrichment(enrichments, output_prefix, ntop_terms=20)
def mnn_correct(*datas, var_index=None, var_subset=None, batch_key='batch',
                index_unique='-', batch_categories=None, k=20, sigma=1.,
                cos_norm_in=True, cos_norm_out=True, svd_dim=None,
                var_adj=True, compute_angle=False, mnn_order=None,
                svd_mode='rsvd', do_concatenate=True, save_raw=False,
                n_jobs=None, **kwargs):
    """
    Apply MNN correction to input data matrices or AnnData objects. Depending
    on do_concatenate, returns matrices or AnnData objects in the original
    order containing corrected expression values, or a concatenated matrix or
    AnnData object.

    :param datas: `numpy.ndarray` or class:`anndata.AnnData`
        Expression matrices or AnnData objects. Matrices should be shaped like
        n_obs * n_vars (n_cell * n_gene) and have a consistent number of
        columns. AnnData objects should have the same number of vars.
    :param var_index: `list` or `None`, optional (default: None)
        The index (list of str) of vars (genes). Necessary when using only a
        subset of vars to perform MNN correction, and should be supplied with
        var_subset. When datas are AnnData objects, var_index is ignored.
    :param var_subset: `list` or `None`, optional (default: None)
        The subset of vars (list of str) to be used when performing MNN
        correction. Typically, a list of highly variable genes (HVGs). When
        set to None, uses all vars.
    :param batch_key: `str`, optional (default: 'batch')
        The batch_key for AnnData.concatenate. Only valid when do_concatenate
        and supplying AnnData objects.
    :param index_unique: `str`, optional (default: '-')
        The index_unique for AnnData.concatenate. Only valid when
        do_concatenate and supplying AnnData objects.
    :param batch_categories: `list` or `None`, optional (default: None)
        The batch_categories for AnnData.concatenate. Only valid when
        do_concatenate and supplying AnnData objects.
    :param k: `int`, optional (default: 20)
        Number of mutual nearest neighbors.
    :param sigma: `float`, optional (default: 1)
        The bandwidth of the Gaussian smoothing kernel used to compute the
        correction vectors.
    :param cos_norm_in: `bool`, optional (default: True)
        Whether cosine normalization should be performed on the input data
        prior to calculating distances between cells.
    :param cos_norm_out: `bool`, optional (default: True)
        Whether cosine normalization should be performed prior to computing
        corrected expression values.
    :param svd_dim: `int` or `None`, optional (default: None)
        The number of dimensions to use for summarizing biological
        substructure within each batch. If set to None, biological components
        will not be removed from the correction vectors.
    :param var_adj: `bool`, optional (default: True)
        Whether to adjust the variance of the correction vectors. Note this
        step takes most of the computing time.
    :param compute_angle: `bool`, optional (default: False)
        Whether to compute the angle between each cell's correction vector and
        the biological subspace of the reference batch.
    :param mnn_order: `list` or `None`, optional (default: None)
        The order in which batches are to be corrected. When set to None,
        datas are corrected sequentially.
    :param svd_mode: `str`, optional (default: 'rsvd')
        One of 'svd', 'rsvd', and 'irlb'. 'svd' computes SVD using a
        non-randomized SVD-via-ID algorithm, while 'rsvd' uses a randomized
        version. 'irlb' performs truncated SVD by implicitly restarted Lanczos
        bidiagonalization (forked from https://github.com/airysen/irlbpy).
    :param do_concatenate: `bool`, optional (default: True)
        Whether to concatenate the corrected matrices or AnnData objects.
    :param save_raw: `bool`, optional (default: False)
        Whether to save the original expression data in the .raw attribute of
        AnnData objects.
    :param n_jobs: `int` or `None`, optional (default: None)
        The number of jobs. When set to None, automatically uses the number of
        cores.
    :param kwargs: `dict` or `None`, optional (default: None)
        Optional keyword arguments for irlb.
    :return:
        datas: `numpy.ndarray` or class:`anndata.AnnData`
            Corrected matrix/matrices or AnnData object/objects, depending on
            the input type and do_concatenate.
        mnn_list_: `list`
            A list containing MNN pairing information as DataFrames in each
            iteration step.
        angle_list_: `list`
            A list containing angles of each batch.
    """
    if len(datas) < 2:
        return datas
    n_batch = len(datas)
    if mnn_order is not None:
        if sorted(mnn_order) != list(range(n_batch)):
            raise ValueError(
                f'The argument mnn_order should contain a permutation of '
                f'0..{n_batch - 1}.')
    if isinstance(datas[0], AnnData):
        if var_index is not None:
            print('Inputs are AnnData objects, var_index ignored.')
        adata_vars = datas[0].var.index
        for i in range(1, n_batch):
            if (datas[i].var.index != adata_vars).any():
                raise ValueError('The AnnData objects have inconsistent vars.')
        if var_subset is not None and set(adata_vars) == set(var_subset):
            var_subset = None
        corrected = mnn_correct(
            *(adata.X for adata in datas), var_index=adata_vars,
            var_subset=var_subset, k=k, sigma=sigma, cos_norm_in=cos_norm_in,
            cos_norm_out=cos_norm_out, svd_dim=svd_dim, var_adj=var_adj,
            compute_angle=compute_angle, mnn_order=mnn_order,
            svd_mode=svd_mode, do_concatenate=do_concatenate, **kwargs)
        print('Packing AnnData object...')
        if do_concatenate:
            adata = AnnData.concatenate(
                *datas, batch_key=batch_key,
                batch_categories=batch_categories, index_unique=index_unique)
            if save_raw:
                adata.raw = adata.copy()
            adata.X = corrected[0]
            print('Done.')
            return adata, corrected[1], corrected[2]
        else:
            for adata, new_matrix in zip(datas, corrected[0]):
                if save_raw:
                    adata.raw = adata.copy()
                adata.X = new_matrix
            print('Done.')
            return datas, corrected[1], corrected[2]
    # ------------------------------------------------------------
    if n_jobs is None:
        n_jobs = cpu_count()
    n_cols = datas[0].shape[1]
    if len(var_index) != n_cols:
        raise ValueError(
            'The number of vars is not equal to the length of var_index.')
    for i in range(1, n_batch):
        if datas[i].shape[1] != n_cols:
            raise ValueError(
                'The input matrices have inconsistent numbers of columns.')
    # ------------------------------------------------------------
    print('Performing cosine normalization...')
    in_batches, out_batches, var_subset, same_set = transform_input_data(
        datas, cos_norm_in, cos_norm_out, var_index, var_subset, n_jobs)
    if mnn_order is None:
        mnn_order = list(range(n_batch))
    ref = mnn_order[0]
    ref_batch_in = in_batches[ref]
    if not same_set:
        ref_batch_out = out_batches[ref]
    res_container = [out_batches[ref]]
    mnn_container = [0]
    angle_container = [0]
    original_batch = [ref] * ref_batch_in.shape[0]
    print('Starting MNN correct iteration. Reference batch: ' + str(ref))
    # ------------------------------------------------------------
    # loop through batches
    for step in range(1, n_batch):
        target = mnn_order[step]
        print('Step ' + str(step) + ' of ' + str(n_batch - 1) +
              ': processing batch ' + str(target))
        new_batch_in = in_batches[target]
        if not same_set:
            new_batch_out = out_batches[target]
        print('  Looking for MNNs...')
        mnn_ref, mnn_new = find_mutual_nn(
            data1=ref_batch_in, data2=new_batch_in, k1=k, k2=k, n_jobs=n_jobs)
        print('  Computing correction vectors...')
        correction_in = compute_correction(
            ref_batch_in, new_batch_in, mnn_ref, mnn_new, new_batch_in, sigma)
        if not same_set:
            correction_out = compute_correction(
                ref_batch_out, new_batch_out, mnn_ref, mnn_new, new_batch_in,
                sigma)
        if compute_angle:
            print('  Computing angle...')
            ref_centred = ref_batch_in - np.mean(ref_batch_in, axis=0)
            ref_basis = svd_internal(ref_centred.T, nu=2, svd_mode=svd_mode,
                                     **kwargs)
            find_subspace_job = partial(find_shared_subspace, mat1=ref_basis,
                                        mat2_vec=True)
            with Pool(n_jobs) as p_n:
                angle_out = p_n.map(find_subspace_job, correction_in)
            angle_container.append(angle_out)
        # ------------------------
        if svd_dim is not None and svd_dim != 0:
            print('  Removing components...')
            mnn_ref_u = np.unique(mnn_ref)
            mnn_new_u = np.unique(mnn_new)
            in_span_ref = get_bio_span(ref_batch_in[mnn_ref_u, :],
                                       ndim=svd_dim, svd_mode=svd_mode,
                                       **kwargs)
            in_span_new = get_bio_span(new_batch_in[mnn_new_u, :],
                                       ndim=svd_dim, svd_mode=svd_mode,
                                       **kwargs)
            correction_in = subtract_bio(in_span_ref, in_span_new,
                                         correction=correction_in)
            if not same_set:
                out_span_ref = get_bio_span(ref_batch_out[mnn_ref_u, :],
                                            ndim=svd_dim, svd_mode=svd_mode,
                                            var_subset=var_subset, **kwargs)
                out_span_new = get_bio_span(new_batch_out[mnn_new_u, :],
                                            ndim=svd_dim, svd_mode=svd_mode,
                                            var_subset=var_subset, **kwargs)
                correction_out = subtract_bio(out_span_ref, out_span_new,
                                              correction=correction_out,
                                              var_subset=var_subset)
        # ------------------------
        if var_adj:
            print('  Adjusting variance...')
            correction_in = adjust_shift_variance(
                ref_batch_in, new_batch_in, correction_in, sigma, n_jobs)
            if not same_set:
                correction_out = adjust_shift_variance(
                    ref_batch_out, new_batch_out, correction_out, sigma,
                    n_jobs, var_subset)
        # ------------------------
        print('  Applying correction...')
        new_batch_in = new_batch_in + correction_in
        ref_batch_in = np.concatenate((ref_batch_in, new_batch_in))
        if same_set:
            res_container.append(new_batch_in)
        else:
            new_batch_out = new_batch_out + correction_out
            ref_batch_out = np.concatenate((ref_batch_out, new_batch_out))
            res_container.append(new_batch_out)
        mnn_container.append(
            DataFrame({
                'new cell': mnn_new,
                'ref cell': mnn_ref,
                'original batch': [original_batch[mnn] for mnn in mnn_ref],
            }))
        original_batch += [target] * new_batch_in.shape[0]
    print('MNN correction complete. Gathering output...')
    reflow_order = [0] * n_batch
    for i in range(n_batch):
        reflow_order[mnn_order[i]] = i
    results_ = [np.array(res_container[i]) for i in reflow_order]
    mnn_list_ = [mnn_container[i] for i in reflow_order]
    angle_list_ = ([angle_container[i] for i in reflow_order]
                   if compute_angle else None)
    if do_concatenate:
        results_ = np.concatenate(tuple(results_))
    return results_, mnn_list_, angle_list_
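# Usage sketch (mirrors the mnnpy-style entry point defined above; hvg_list and
# the two AnnData inputs, which are assumed to share identical var indices, are
# hypothetical):
#
#   corrected, mnn_list, angle_list = mnn_correct(
#       adata1, adata2, var_subset=hvg_list, k=20, do_concatenate=True)
#   corrected.obs['batch']   # batch labels added by AnnData.concatenate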
def label_transfer(adata: AnnData,
                   adata_ref: Optional[AnnData] = None,
                   obs: Optional[str] = None,
                   label_unk: Optional[str] = 'unknown',
                   use_best: Optional[bool] = False,
                   neighbors_key: Optional[str] = 'neighbors',
                   adjacency: Optional[sparse.spmatrix] = None,
                   directed: bool = False,
                   use_weights: bool = False,
                   pca_args: Optional[dict] = {},
                   use_rep: Optional[str] = None,
                   harmony_args: Optional[dict] = {},
                   copy: bool = False) -> Optional[AnnData]:
    """\
    Transfer annotation from one dataset to another using cell affinities.
    If two datasets are given, harmony is used to perform integration and the
    kNN graph is then built. If no reference is given, it is assumed that
    `adata` already contains the proper kNN graph and that the labels to be
    reassigned have a specified value.

    Parameters
    ----------
    adata:
        The AnnData object.
    adata_ref
        The optional reference dataset. If None, then all the needed
        information should be included in `adata` (i.e. the kNN graph and the
        labels).
    obs
        The label that needs to be transferred. Should be in `adata_ref.obs`
        or in `adata.obs` if no `adata_ref` is given.
    label_unk
        The label for unassigned cells. If no `adata_ref` is given, this label
        identifies cells to be assigned in `adata`. If `adata_ref` is given,
        this label will be given to all cells that cannot be assigned.
    use_best
        When assigning labels, some cells may not have enough evidence and are
        therefore left `unknown`. If this parameter is set to `True`, all
        cells will be assigned to the best possible label, even if it may not
        be optimal.
    neighbors_key
        Use neighbors connectivities as adjacency. If not specified, leiden
        looks in .obsp['connectivities'] for connectivities (default storage
        place for pp.neighbors). If specified, leiden looks in
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors
        connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    pca_args
        Parameters to be passed to `sc.tl.pca` before harmony is issued.
    use_rep
        If specified, use this embedding and do not calculate a PCA. Note that
        the embedding must be present in both datasets, with the same number
        of dimensions.
    harmony_args
        Parameters to be passed to `sc.external.pp.harmony_integrate`.
    copy:
        Return a new object or do everything in place.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with added labels in
    adata.obs[f'{label_ref}']
    """
    adata = adata.copy() if copy else adata
    if adata_ref:
        from scanpy.tools import pca
        from scanpy.preprocessing import neighbors
        from scanpy.external.pp import harmony_integrate
        # we have to create a merged dataset and integrate; before that, check
        # that the labels are not in the recipient, in which case drop them
        if obs in adata.obs_keys():
            logg.warning(f'{obs} was found in dataset 1, it will be wiped')
            adata.obs.drop(obs, inplace=True, axis='columns')
        if not obs in adata_ref.obs_keys():
            raise ValueError(
                f'Annotation {obs} is not present in reference dataset.')
        if use_rep:
            revert_to_pca = False
            if not use_rep in adata.obsm.keys():
                logg.warning(
                    f'{use_rep} was not found in dataset 1, reverting to PCA')
                revert_to_pca = True
            elif not use_rep in adata_ref.obsm.keys():
                logg.warning(
                    f'{use_rep} was not found in dataset 2, reverting to PCA')
                revert_to_pca = True
            elif adata.obsm[use_rep].shape[1] != adata_ref.obsm[use_rep].shape[1]:
                logg.warning(
                    f'{use_rep} is inconsistent in the two datasets, reverting to PCA')
                revert_to_pca = True
            if revert_to_pca:
                use_rep = None
        # now do the merge, so that the empty category is created
        adata_merge = adata.concatenate(adata_ref,
                                        batch_categories=['_unk', '_ref'],
                                        batch_key='_label_transfer')
        if adata_merge.obs[obs].dtype.name != 'category':
            adata_merge.obs[obs] = pd.Categorical(adata_merge.obs[obs])
        adata_merge.obs[obs] = adata_merge.obs[obs].cat.add_categories(
            label_unk).fillna(label_unk)
        # perform integration using harmony
        if not use_rep:
            pca(adata_merge, **pca_args)
            use_rep = 'X_pca'
        h_rep = f'{use_rep}_harmony'
        harmony_integrate(adata_merge, key='_label_transfer', basis=use_rep,
                          adjusted_basis=h_rep, **harmony_args)
        # now calculate the kNN graph
        n_neighbors = int(np.sqrt(adata_merge.shape[0]) / 2)
        key_added = neighbors_key
        if key_added == 'neighbors':
            key_added = None
        neighbors(adata_merge, use_rep=h_rep, n_neighbors=n_neighbors,
                  key_added=key_added)
    else:
        adata_merge = adata  # .copy()
        if not obs in adata_merge.obs_keys():
            raise ValueError(f'Annotation {obs} is not present in dataset.')
        if not label_unk in adata_merge.obs[obs].cat.categories:
            raise ValueError(f'Label {label_unk} is not present in {obs}.')
    # calculate affinity
    calculate_affinity(adata_merge, group_by=obs, neighbors_key=neighbors_key)
    # now work on affinity, rank it to get the new labels
    categories = adata_merge.obs[obs].cat.categories
    affinity = pd.DataFrame(adata_merge.obsm[f'CA_{obs}'],
                            index=adata_merge.obs_names,
                            columns=categories)
    # if use_best we need to remove the unknown label from the matrix so it
    # does not get scored
    if use_best:
        affinity.drop(label_unk, axis='columns', inplace=True)
    rank_affinity = affinity.rank(axis=1, ascending=False)
    adata_merge.obs[f'_{obs}_tmp'] = adata_merge.obs[obs].values
    for c in rank_affinity.columns:
        # pretty sure there's a way to do it without a for loop :-/
        # I really need a course on pandas
        cells = rank_affinity[rank_affinity[c] == 1].index
        adata_merge.obs.loc[cells, f'_{obs}_tmp'] = c
    # do the actual transfer to dataset 1; here we assume that concatenation
    # does not change the order of cells, only cell names
    labels = adata_merge.obs[f'_{obs}_tmp'].cat.categories
    if adata_ref:
        # transfer has been done between two files
        adata.obs[obs] = adata_merge.obs.query(
            '_label_transfer == "_unk"')[f'_{obs}_tmp'].values
    else:
        # transfer is within dataset
        adata_merge.obs[obs] = adata_merge.obs[f'_{obs}_tmp'].values
        adata_merge.obs.drop(f'_{obs}_tmp', axis='columns', inplace=True)
        adata = adata_merge
    # ensure that it is categorical with proper order
    adata.obs[obs] = pd.Categorical(adata.obs[obs], categories=labels)
    # transfer colors if any
    if adata_ref and f'{obs}_colors' in adata_ref.uns:
        colors = list(adata_ref.uns[f'{obs}_colors'])
        if not use_best:
            # add gray for unknown
            colors.append('#aabbcc')
        adata.uns[f'{obs}_colors'] = colors
    # remove unused categories if "use_best", hence no "unknown"; assign the
    # result, since remove_unused_categories does not operate in place
    if use_best:
        adata.obs[obs] = adata.obs[obs].cat.remove_unused_categories()
    return adata if copy else None
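# Usage sketch (illustrative): transfer 'cell_type' labels from an annotated
# reference onto a new dataset, integrating the two with harmony first.
#
#   label_transfer(adata_new, adata_ref=adata_annotated, obs='cell_type',
#                  use_rep='X_pca')
#   adata_new.obs['cell_type']   # transferred annotation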
def test_concatenate_dense():
    # dense data
    X1 = np.array([[1, 2, 3], [4, 5, 6]])
    X2 = np.array([[1, 2, 3], [4, 5, 6]])
    X3 = np.array([[1, 2, 3], [4, 5, 6]])

    adata1 = AnnData(
        X1,
        dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
        dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]),
        obsm=dict(X_1=X1, X_2=X2, X_3=X3),
        layers=dict(Xs=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]),
        obsm=dict(X_1=X1, X_2=X2, X_3=X3),
        layers={"Xs": X2},
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]),
        dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]),
        obsm=dict(X_1=X1, X_2=X2),
        layers=dict(Xs=X3),
    )

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]
    assert adata.X.astype(int).tolist() == X_combined
    assert adata.layers["Xs"].astype(int).tolist() == X_combined
    assert adata.obs_keys() == ["anno1", "anno2", "batch"]
    assert adata.var_keys() == ["annoA-0", "annoA-1", "annoB-2"]
    assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]
    assert adata.obsm_keys() == ["X_1", "X_2"]
    assert adata.obsm["X_1"].tolist() == np.concatenate([X1, X1, X1]).tolist()

    # with batch_key and batch_categories
    adata = adata1.concatenate(adata2, adata3, batch_key="batch1")
    assert adata.obs_keys() == ["anno1", "anno2", "batch1"]
    adata = adata1.concatenate(adata2, adata3, batch_categories=["a1", "a2", "a3"])
    assert adata.obs["batch"].cat.categories.tolist() == ["a1", "a2", "a3"]
    assert adata.var_names.tolist() == ["b", "c"]

    # outer join
    adata = adata1.concatenate(adata2, adata3, join="outer")
    X_ref = np.array([
        [1.0, 2.0, 3.0, np.nan],
        [4.0, 5.0, 6.0, np.nan],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
    ])
    np.testing.assert_equal(adata.X, X_ref)
    var_ma = ma.masked_invalid(adata.var.values.tolist())
    var_ma_ref = ma.masked_invalid(np.array([
        [0.0, np.nan, np.nan],
        [1.0, 2.0, 2.0],
        [2.0, 1.0, 1.0],
        [np.nan, 0.0, 0.0],
    ]))
    assert np.array_equal(var_ma.mask, var_ma_ref.mask)
    assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())
def test_concatenate_dense():
    # dense data
    X1 = np.array([[1, 2, 3], [4, 5, 6]])
    X2 = np.array([[1, 2, 3], [4, 5, 6]])
    X3 = np.array([[1, 2, 3], [4, 5, 6]])

    adata1 = AnnData(
        X1,
        dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']),
        dict(var_names=['a', 'b', 'c'], annoA=[0, 1, 2]),
        layers=dict(Xs=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']),
        dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]),
        layers={'Xs': X2},
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']),
        dict(var_names=['d', 'c', 'b'], annoB=[0, 1, 2]),
        layers=dict(Xs=X3),
    )

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]
    assert adata.X.astype(int).tolist() == X_combined
    assert adata.layers['Xs'].astype(int).tolist() == X_combined
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch']
    assert adata.var_keys() == ['annoA-0', 'annoA-1', 'annoB-2']
    assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]
    adata = adata1.concatenate(adata2, adata3, batch_key='batch1')
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch1']
    adata = adata1.concatenate(adata2, adata3, batch_categories=['a1', 'a2', 'a3'])
    assert adata.obs['batch'].cat.categories.tolist() == ['a1', 'a2', 'a3']
    assert adata.var_names.tolist() == ['b', 'c']

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    from numpy import ma
    Xma = ma.masked_invalid(adata.X)
    Xma_ref = ma.masked_invalid(np.array([
        [1.0, 2.0, 3.0, np.nan],
        [4.0, 5.0, 6.0, np.nan],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
    ]))
    assert np.array_equal(Xma.mask, Xma_ref.mask)
    assert np.allclose(Xma.compressed(), Xma_ref.compressed())
    var_ma = ma.masked_invalid(adata.var.values.tolist())
    var_ma_ref = ma.masked_invalid(np.array([
        [0.0, np.nan, np.nan],
        [1.0, 2.0, 2.0],
        [2.0, 1.0, 1.0],
        [np.nan, 0.0, 0.0],
    ]))
    assert np.array_equal(var_ma.mask, var_ma_ref.mask)
    assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())
def test_concatenate():
    # dense data
    adata1 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s1', 's2'], 'anno1': ['c1', 'c2']},
                     {'var_names': ['a', 'b', 'c'], 'annoA': [0, 1, 2]})
    adata2 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s3', 's4'], 'anno1': ['c3', 'c4']},
                     {'var_names': ['d', 'c', 'b'], 'annoA': [0, 1, 2]})
    adata3 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s1', 's2'], 'anno2': ['d3', 'd4']},
                     {'var_names': ['d', 'c', 'b'], 'annoB': [0, 1, 2]})

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.astype(int).tolist() == [
        [2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch']
    assert adata.var_keys() == ['annoA-0', 'annoA-1', 'annoB-2']
    assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]
    adata = adata1.concatenate(adata2, adata3, batch_key='batch1')
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch1']
    adata = adata1.concatenate(adata2, adata3, batch_categories=['a1', 'a2', 'a3'])
    assert adata.obs['batch'].cat.categories.tolist() == ['a1', 'a2', 'a3']
    assert adata.var_names.tolist() == ['b', 'c']

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    from numpy import ma
    Xma = ma.masked_invalid(adata.X)
    Xma_ref = ma.masked_invalid(np.array([
        [1.0, 2.0, 3.0, np.nan],
        [4.0, 5.0, 6.0, np.nan],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0]]))
    assert np.array_equal(Xma.mask, Xma_ref.mask)
    assert np.allclose(Xma.compressed(), Xma_ref.compressed())
    var_ma = ma.masked_invalid(adata.var.values.tolist())
    var_ma_ref = ma.masked_invalid(np.array([
        [0.0, np.nan, np.nan],
        [1.0, 2.0, 2.0],
        [2.0, 1.0, 1.0],
        [np.nan, 0.0, 0.0]]))
    assert np.array_equal(var_ma.mask, var_ma_ref.mask)
    assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())

    # sparse data
    from scipy.sparse import csr_matrix
    adata1 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]),
                     {'obs_names': ['s1', 's2'], 'anno1': ['c1', 'c2']},
                     {'var_names': ['a', 'b', 'c']})
    adata2 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]),
                     {'obs_names': ['s3', 's4'], 'anno1': ['c3', 'c4']},
                     {'var_names': ['d', 'c', 'b']})
    adata3 = AnnData(csr_matrix([[1, 2, 0], [0, 5, 6]]),
                     {'obs_names': ['s5', 's6'], 'anno2': ['d3', 'd4']},
                     {'var_names': ['d', 'c', 'b']})

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.toarray().astype(int).tolist() == [
        [2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]]

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    assert adata.X.toarray().tolist() == [
        [0.0, 2.0, 3.0, 0.0],
        [0.0, 5.0, 6.0, 0.0],
        [0.0, 3.0, 2.0, 0.0],
        [0.0, 6.0, 5.0, 0.0],
        [0.0, 0.0, 2.0, 1.0],
        [0.0, 6.0, 5.0, 0.0]]
def from_scvi_model(
    cls,
    scvi_model: SCVI,
    adata: Optional[AnnData] = None,
    restrict_to_batch: Optional[str] = None,
    doublet_ratio: int = 2,
    **classifier_kwargs,
):
    """
    Instantiate a SOLO model from an scvi model.

    Parameters
    ----------
    scvi_model
        Pre-trained model of :class:`~scvi.model.SCVI`. The adata object used
        to initialize this model should have only been setup with count data,
        and optionally a `batch_key`; i.e., no extra covariates or labels, etc.
    adata
        Optional anndata to use that is compatible with scvi_model.
    restrict_to_batch
        Batch category in `batch_key` used to setup adata for scvi_model to
        restrict the Solo model to. This allows training a Solo model on one
        batch of a scvi_model that was trained on multiple batches.
    doublet_ratio
        Ratio of generated doublets to produce relative to number of cells in
        adata or length of indices, if not `None`.
    **classifier_kwargs
        Keyword args for :class:`~scvi.module.Classifier`

    Returns
    -------
    SOLO model
    """
    _validate_scvi_model(scvi_model, restrict_to_batch=restrict_to_batch)
    orig_adata_manager = scvi_model.adata_manager
    orig_batch_key = orig_adata_manager.get_state_registry(
        REGISTRY_KEYS.BATCH_KEY).original_key

    if adata is not None:
        adata_manager = orig_adata_manager.transfer_setup(adata)
        cls.register_manager(adata_manager)
    else:
        adata_manager = orig_adata_manager
    adata = adata_manager.adata

    if restrict_to_batch is not None:
        batch_mask = adata.obs[orig_batch_key] == restrict_to_batch
        if np.sum(batch_mask) == 0:
            raise ValueError(
                "Batch category given to restrict_to_batch not found.\n"
                + "Available categories: {}".format(
                    adata.obs[orig_batch_key].astype("category").cat.categories))
        # indices in adata with restrict_to_batch category
        batch_indices = np.where(batch_mask)[0]
    else:
        # use all indices
        batch_indices = None

    # anndata with only generated doublets
    doublet_adata = cls.create_doublets(adata_manager, indices=batch_indices,
                                        doublet_ratio=doublet_ratio)
    # if scvi wasn't trained with batch correction, having the
    # zeros here does nothing.
    doublet_adata.obs[orig_batch_key] = (
        restrict_to_batch if restrict_to_batch is not None else 0)

    # if model is using observed lib size, needs to get lib sample
    # which is just observed lib size on log scale
    give_mean_lib = not scvi_model.module.use_observed_lib_size

    # get latent representations and make input anndata
    latent_rep = scvi_model.get_latent_representation(adata, indices=batch_indices)
    lib_size = scvi_model.get_latent_library_size(adata, indices=batch_indices,
                                                  give_mean=give_mean_lib)
    latent_adata = AnnData(np.concatenate([latent_rep, np.log(lib_size)], axis=1))
    latent_adata.obs[LABELS_KEY] = "singlet"
    orig_obs_names = adata.obs_names
    latent_adata.obs_names = (orig_obs_names[batch_indices]
                              if batch_indices is not None else orig_obs_names)

    logger.info("Creating doublets, preparing SOLO model.")
    f = io.StringIO()
    with redirect_stdout(f):
        scvi_model.setup_anndata(doublet_adata, batch_key=orig_batch_key)
        doublet_latent_rep = scvi_model.get_latent_representation(doublet_adata)
        doublet_lib_size = scvi_model.get_latent_library_size(
            doublet_adata, give_mean=give_mean_lib)
        doublet_adata = AnnData(
            np.concatenate([doublet_latent_rep, np.log(doublet_lib_size)], axis=1))
        doublet_adata.obs[LABELS_KEY] = "doublet"

    full_adata = latent_adata.concatenate(doublet_adata)
    cls.setup_anndata(full_adata, labels_key=LABELS_KEY)
    return cls(full_adata, **classifier_kwargs)
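# Usage sketch (scvi-tools-style workflow; treat the exact call sequence as an
# assumption rather than a pinned API):
#
#   SCVI.setup_anndata(adata)      # counts, optionally with a batch_key
#   scvi_model = SCVI(adata)
#   scvi_model.train()
#   solo = SOLO.from_scvi_model(scvi_model, restrict_to_batch='lane1')
#   solo.train()
#   doublet_scores = solo.predict()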
def fit(
        self,
        source_data,  # AnnData
        target_data,  # AnnData
        batch_size=256,
        maxiter=1000,
        pretrain_epochs=300,
        epochs_fit=5,
        tol=[0.001],
        alpha=[1.0],
        resolution=[0.2, 0.4, 0.8, 1.2, 1.6],
        n_neighbors=20,
        softmax=False,
        init="glorot_uniform",
        save_atr="isy_trans_True"):
    '''
    Fit the transfer-learning model on the provided data. This function
    includes the preprocessing steps.

    Input: source_data (AnnData), target_data (AnnData). Source and target
    data can be in any form (UMI, TPM, or FPKM).
    Return: no return value; results are stored on the instance.
    '''
    self.batch_size = batch_size
    self.maxiter = maxiter
    self.pretrain_epochs = pretrain_epochs
    self.epochs_fit = epochs_fit
    self.tol = tol
    self.alpha = alpha
    self.source_data = source_data
    self.target_data = target_data
    self.resolution = resolution
    self.n_neighbors = n_neighbors
    self.softmax = softmax
    self.init = init
    self.save_atr = save_atr
    dictionary = {"alpha": alpha, "tol": tol, "resolution": resolution}
    df_expand = expand_grid(dictionary)

    # begin preprocessing
    adata_tmp = []

    # Source data
    source_data.var_names_make_unique(join="-")
    source_data.obs_names_make_unique(join="-")
    # 1. pre-filter cells
    prefilter_cells(source_data, min_genes=100)
    # 2. pre-filter genes (avoid all-zero genes)
    prefilter_genes(source_data, min_cells=10)
    # 3. pre-filter special genes: MT and ERCC
    prefilter_specialgenes(source_data)
    # 4. normalization, log1p, scaling
    sc.pp.normalize_per_cell(source_data)
    sc.pp.log1p(source_data)
    sc.pp.scale(source_data, zero_center=True, max_value=6)
    # upper-case gene names to avoid case mismatches between datasets
    source_data.var_names = [i.upper() for i in list(source_data.var_names)]
    adata_tmp.append(source_data)

    # Target data
    target_data.var_names_make_unique(join="-")
    target_data.obs_names_make_unique(join="-")
    # 1. pre-filter cells
    prefilter_cells(target_data, min_genes=100)
    # 2. pre-filter genes (avoid all-zero genes)
    prefilter_genes(target_data, min_cells=10)
    # 3. pre-filter special genes: MT and ERCC
    prefilter_specialgenes(target_data)
    # 4. normalization, highly variable genes, log1p, scaling
    sc.pp.normalize_per_cell(target_data)
    # select top genes, scaling their number with dataset size
    if target_data.X.shape[0] <= 1500:
        ng = 500
    elif 1500 < target_data.X.shape[0] <= 3000:
        ng = 1000
    else:
        ng = 2000
    sc.pp.filter_genes_dispersion(target_data, n_top_genes=ng)
    sc.pp.log1p(target_data)
    sc.pp.scale(target_data, zero_center=True, max_value=6)
    # upper-case gene names to avoid case mismatches between datasets
    target_data.var_names = [i.upper() for i in list(target_data.var_names)]
    adata_tmp.append(target_data)

    # concatenate the AnnData objects (inner join on shared genes)
    full_adata = AnnData.concatenate(
        *adata_tmp,
        join='inner',
        batch_key="dataset_batch",
        batch_categories=["source", "target"])
    del adata_tmp
    del target_data
    del source_data

    ref_id = full_adata.obs["dataset_batch"] == "source"
    adata_test = full_adata[~ref_id, :].copy()
    adata_train = full_adata[ref_id, :].copy()
    if issparse(adata_train.X):
        x_train = adata_train.X.toarray()
    else:
        x_train = adata_train.X
    y_train = pd.Series(adata_train.obs["celltype"], dtype="category")
    y_train = y_train.cat.rename_categories(
        range(len(y_train.cat.categories)))
    print("The number of training cell types is:", len(set(y_train)))
    if issparse(adata_test.X):
        x_test = adata_test.X.toarray()
    else:
        x_test = adata_test.X

    # train the source (DEC) network
    print("Training the source network")
    dims = getdims(x_train.shape)  # e.g. dims = [x_train.shape[1], 128, 64]
    print("The layer sizes are " + str(dims[1:]))
    print(":".join(["The shape of x_train is",
                    str(x_train.shape[0]), str(x_train.shape[1])]))
    print(":".join(["The shape of x_test is",
                    str(x_test.shape[0]), str(x_test.shape[1])]))
    assert x_train.shape[1] == x_test.shape[1]

    dec = DEC(dims=dims,
              y=y_train,
              x=x_train,
              alpha=alpha,
              init=self.init,
              pretrain_epochs=self.pretrain_epochs,
              actinlayer1="tanh",
              softmax=softmax)
    dec.compile(optimizer=SGD(lr=0.01, momentum=0.9))
    Embeded_z, q_pred = dec.fit_supervise(
        x=x_train, y=y_train, epochs=2e3, batch_size=self.batch_size)

    # fine-tune on the target data
    weights = [i0.get_weights() for i0 in dec.model.layers]
    features = dec.encoder.predict(x_test)
    q = dec.model.predict(x_test, verbose=0)
    print("Training the source model finished! Starting to fit the target network!")
    val_y_pre = dec.model.predict(x_train, verbose=0)
    val_y_pre = [np.argmax(i) for i in val_y_pre]
    val_ari = metrics.adjusted_rand_score(val_y_pre, y_train.tolist())

    t0 = time()
    dec2 = DEC(dims=dims,
               x=x_test,
               alpha=alpha,
               init=self.init,
               pretrain_epochs=self.pretrain_epochs,
               actinlayer1="tanh",
               softmax=softmax,
               transfer_feature=features,
               model_weights=weights,
               y_trans=q.argmax(axis=1))
    dec2.compile(optimizer=SGD(0.01, 0.9))
    trajectory_z, trajectory_l, Embeded_z, q_pred = dec2.fit_trajectory(
        x=x_test,
        tol=tol,
        epochs_fit=self.epochs_fit,
        batch_size=self.batch_size)  # fine-tuning
    print("Number of trajectories:", len(trajectory_z))
    for i in range(len(trajectory_z)):
        adata_test.obsm["trajectory_Embeded_z_" + str(i)] = trajectory_z[i]
        adata_test.obs["trajectory_" + str(i)] = trajectory_l[i]

    y_pred = np.asarray(np.argmax(q_pred, axis=1), dtype=int)
    labels = y_pred.astype('U')
    labels = pd.Categorical(
        values=labels,
        categories=natsorted(np.unique(y_pred).astype('U')))
    adata_test.obsm["X_Embeded_z" + str(self.save_atr)] = Embeded_z
    adata_test.obs["dec" + str(self.save_atr)] = labels
    adata_test.obs["maxprob" + str(self.save_atr)] = q_pred.max(1)
    adata_test.obsm["prob_matrix" + str(self.save_atr)] = q_pred
    adata_test.obsm["X_pcaZ" + str(self.save_atr)] = sc.tl.pca(Embeded_z)
    self.adata_train = adata_train
    self.adata_test = adata_test
    self.dec2 = dec2
    self.labels = labels
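# A hedged usage sketch for fit() above. `TransferClassifier` is a stand-in
# for the enclosing class, whose name is not shown in this snippet; per the
# docstring, both inputs are AnnData objects of raw expression, and the
# source data must carry the "celltype" obs column that fit() reads.
from anndata import read_h5ad

source = read_h5ad("source_raw_counts.h5ad")  # annotated reference (hypothetical path)
target = read_h5ad("target_raw_counts.h5ad")  # unannotated query (hypothetical path)
clf = TransferClassifier()                    # hypothetical constructor
clf.fit(source, target, maxiter=1000, pretrain_epochs=300)
labels = clf.adata_test.obs["dec" + clf.save_atr]  # transferred cluster labels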
[z_mean, z_log_var, z] = net.encoder.predict(adata.X)
redadata1 = AnnData(X=z_mean, obs=adata.obs)
path1 = path + "reference_"
prep.nn_embedding(redadata1, path=path1)
prep.plotEmbedding(redadata1, path=path1, ncol=10)
prep.plotEmbedding(redadata1, path=path1, color_col="cell_type_code", ncol=10)

adata, n_batches, input_size = load_data(filename, batch_size, ref=False)
[z_mean, z_log_var, z] = net.encoder.predict(adata.X)
redadata2 = AnnData(X=z_mean, obs=adata.obs)
path2 = path + "test_"
prep.nn_embedding(redadata2, path=path2)
prep.plotEmbedding(redadata2, path=path2, ncol=10)
prep.plotEmbedding(redadata2, path=path2, color_col="cell_type_code", ncol=10)

redadata3 = redadata1.concatenate(redadata2)
path3 = path + "combined_"
prep.nn_embedding(redadata3, path=path3)
prep.plotEmbedding(redadata3, path=path3, ncol=10)
prep.plotEmbedding(redadata3, path=path3, color_col="cell_type_code", ncol=10)
prep.plotEmbedding(redadata3, path=path3, color_col="batch", ncol=10)
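# A minimal sketch of why `color_col="batch"` works on redadata3 above:
# concatenate adds a categorical obs column (default key "batch") recording
# each cell's source object. The `prep` helpers are not shown in this
# snippet; an equivalent scanpy-based visualization could look like this,
# assuming `import scanpy as sc` as in the surrounding code.
combined = redadata1.concatenate(redadata2,
                                 batch_categories=["reference", "test"])
sc.pp.neighbors(combined)  # build the kNN graph on the latent coordinates
sc.tl.umap(combined)
sc.pl.umap(combined, color=["batch", "cell_type_code"])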
def test_concatenate(adata_sparse):
    # `adata_sparse` is a pytest fixture (a reconstruction is sketched below)
    # dense data
    adata1 = AnnData(
        np.array([[1, 2, 3], [4, 5, 6]]),
        {'obs_names': ['s1', 's2'], 'anno1': ['c1', 'c2']},
        {'var_names': ['a', 'b', 'c'], 'annoA': [0, 1, 2]},
    )
    adata2 = AnnData(
        np.array([[1, 2, 3], [4, 5, 6]]),
        {'obs_names': ['s3', 's4'], 'anno1': ['c3', 'c4']},
        {'var_names': ['d', 'c', 'b'], 'annoA': [0, 1, 2]},
    )
    adata3 = AnnData(
        np.array([[1, 2, 3], [4, 5, 6]]),
        {'obs_names': ['s1', 's2'], 'anno2': ['d3', 'd4']},
        {'var_names': ['d', 'c', 'b'], 'annoB': [0, 1, 2]},
    )

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.astype(int).tolist() == [
        [2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch']
    assert adata.var_keys() == ['annoA-0', 'annoA-1', 'annoB-2']
    assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]
    adata = adata1.concatenate(adata2, adata3, batch_key='batch1')
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch1']
    adata = adata1.concatenate(adata2, adata3,
                               batch_categories=['a1', 'a2', 'a3'])
    assert adata.obs['batch'].cat.categories.tolist() == ['a1', 'a2', 'a3']
    assert adata.var_names.tolist() == ['b', 'c']

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    from numpy import ma
    Xma = ma.masked_invalid(adata.X)
    Xma_ref = ma.masked_invalid(np.array([
        [1.0, 2.0, 3.0, np.nan],
        [4.0, 5.0, 6.0, np.nan],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
    ]))
    assert np.array_equal(Xma.mask, Xma_ref.mask)
    assert np.allclose(Xma.compressed(), Xma_ref.compressed())
    var_ma = ma.masked_invalid(adata.var.values.tolist())
    var_ma_ref = ma.masked_invalid(np.array([
        [0.0, np.nan, np.nan],
        [1.0, 2.0, 2.0],
        [2.0, 1.0, 1.0],
        [np.nan, 0.0, 0.0],
    ]))
    assert np.array_equal(var_ma.mask, var_ma_ref.mask)
    assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())

    # sparse data
    from scipy.sparse import csr_matrix
    adata1 = adata_sparse
    adata2 = AnnData(
        csr_matrix([[0, 2, 3], [0, 5, 6]]),
        {'obs_names': ['s3', 's4'], 'anno1': ['c3', 'c4']},
        {'var_names': ['d', 'c', 'b']},
    )
    adata3 = AnnData(
        csr_matrix([[1, 2, 0], [0, 5, 6]]),
        {'obs_names': ['s5', 's6'], 'anno2': ['d3', 'd4']},
        {'var_names': ['d', 'c', 'b']},
    )

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.toarray().astype(int).tolist() == [
        [2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]]

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    assert adata.X.toarray().tolist() == [
        [0.0, 2.0, 3.0, 0.0],
        [0.0, 5.0, 6.0, 0.0],
        [0.0, 3.0, 2.0, 0.0],
        [0.0, 6.0, 5.0, 0.0],
        [0.0, 0.0, 2.0, 1.0],
        [0.0, 6.0, 5.0, 0.0],
    ]
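# A hedged reconstruction of the `adata_sparse` pytest fixture used above; it
# is not shown in this snippet, but the inner-join assertion only passes if
# it matches the sparse adata1 built inline in the earlier copy of this test.
import pytest
from scipy.sparse import csr_matrix


@pytest.fixture
def adata_sparse():
    return AnnData(
        csr_matrix([[0, 2, 3], [0, 5, 6]]),
        {'obs_names': ['s1', 's2'], 'anno1': ['c1', 'c2']},
        {'var_names': ['a', 'b', 'c']},
    )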