def reconstruct():
    train_path = "../data/train_pbmc.h5ad"
    data = sc.read(train_path)
    ctrl_key = "control"
    stim_key = "stimulated"
    all_data = anndata.AnnData()
    print(data.obs["cell_type"].unique().tolist())
    for idx, cell_type in enumerate(data.obs["cell_type"].unique().tolist()):
        pca = PCA(n_components=100)
        # hold out the stimulated cells of the current cell type before fitting the PCA
        train = data[~((data.obs["condition"] == stim_key) &
                       (data.obs["cell_type"] == cell_type))]
        pca.fit(train.X.A)
        print(cell_type, end="\t")
        train_real_stimulated = data[data.obs["condition"] == stim_key, :]
        train_real_stimulated = train_real_stimulated[
            train_real_stimulated.obs["cell_type"] != cell_type]
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)
        train_real_stimulated_PCA = pca.transform(train_real_stimulated.X)
        train_real_cd = data[data.obs["condition"] == ctrl_key, :]
        train_real_cd = scgen.util.balancer(train_real_cd)
        train_real_cd_PCA = pca.transform(train_real_cd.X)
        cell_type_adata = data[data.obs["cell_type"] == cell_type]
        cell_type_ctrl = cell_type_adata[cell_type_adata.obs["condition"] == ctrl_key]
        cell_type_stim = cell_type_adata[cell_type_adata.obs["condition"] == stim_key]
        if sparse.issparse(cell_type_ctrl.X):
            cell_type_ctrl_PCA = pca.transform(cell_type_ctrl.X.A)
        else:
            cell_type_ctrl_PCA = pca.transform(cell_type_ctrl.X)
        predicted_cells = predict(pca, train_real_cd_PCA,
                                  train_real_stimulated_PCA, cell_type_ctrl_PCA)
        # all_Data holds the per-cell-type result; all_data accumulates across cell types
        if sparse.issparse(cell_type_ctrl.X):
            all_Data = sc.AnnData(np.concatenate(
                [cell_type_ctrl.X.A, cell_type_stim.X.A, predicted_cells]))
        else:
            all_Data = sc.AnnData(np.concatenate(
                [cell_type_ctrl.X, cell_type_stim.X, predicted_cells]))
        all_Data.obs["condition"] = \
            [f"{cell_type}_ctrl"] * cell_type_ctrl.shape[0] + \
            [f"{cell_type}_real_stim"] * cell_type_stim.shape[0] + \
            [f"{cell_type}_pred_stim"] * len(predicted_cells)
        all_Data.obs["cell_type"] = [f"{cell_type}"] * (
            cell_type_ctrl.shape[0] + cell_type_stim.shape[0] + len(predicted_cells))
        all_Data.var_names = cell_type_adata.var_names
        if idx == 0:
            all_data = all_Data
        else:
            all_data = all_data.concatenate(all_Data)
        print(cell_type)
    sc.write("../data/reconstructed/PCAVecArithm/PCA_pbmc.h5ad", all_data)
def vector_batch_removal():
    # project data to the latent space
    latent_all = give_me_latent(data.X)
    latent_ann = sc.AnnData(latent_all)
    latent_ann.obs["cell_type"] = data.obs["cell_type"].tolist()
    latent_ann.obs["batch"] = data.obs["batch"].tolist()
    latent_ann.obs["sample"] = data.obs["sample"].tolist()
    unique_cell_types = np.unique(latent_ann.obs["cell_type"])
    shared_anns = []
    not_shared_ann = []
    for cell_type in unique_cell_types:
        temp_cell = latent_ann[latent_ann.obs["cell_type"] == cell_type]
        if len(np.unique(temp_cell.obs["batch"])) < 2:
            # cell type present in only one batch: nothing to correct
            not_shared_ann.append(temp_cell)
            continue
        print(cell_type)
        batch_list = {}
        max_batch = 0
        max_batch_ind = ""
        batches = np.unique(temp_cell.obs["batch"])
        for i in batches:
            temp = temp_cell[temp_cell.obs["batch"] == i]
            if max_batch < len(temp):
                max_batch = len(temp)
                max_batch_ind = i
            batch_list[i] = temp
        max_batch_ann = batch_list[max_batch_ind]
        # shift every batch towards the largest batch by the difference of the means
        for study in batch_list:
            delta = np.average(max_batch_ann.X, axis=0) - np.average(batch_list[study].X, axis=0)
            batch_list[study].X = delta + batch_list[study].X
        corrected = sc.AnnData.concatenate(*list(batch_list.values()))
        shared_anns.append(corrected)
    all_shared_ann = sc.AnnData.concatenate(*shared_anns)
    all_not_shared_ann = sc.AnnData.concatenate(*not_shared_ann)
    all_corrected_data = sc.AnnData.concatenate(all_shared_ann, all_not_shared_ann)
    # reconstruct data back to gene expression space
    corrected = sc.AnnData(reconstruct(all_corrected_data.X, use_data=True))
    corrected.obs["cell_type"] = all_shared_ann.obs["cell_type"].tolist() + \
        all_not_shared_ann.obs["cell_type"].tolist()
    corrected.obs["study"] = all_shared_ann.obs["sample"].tolist() + \
        all_not_shared_ann.obs["sample"].tolist()
    corrected.var_names = data.var_names.tolist()
    # shared cell types only
    corrected_shared = sc.AnnData(reconstruct(all_shared_ann.X, use_data=True))
    corrected_shared.obs["cell_type"] = all_shared_ann.obs["cell_type"].tolist()
    corrected_shared.obs["study"] = all_shared_ann.obs["sample"].tolist()
    corrected_shared.var_names = data.var_names.tolist()
    return corrected, corrected_shared
def test_qc_metrics_format():
    a = np.random.binomial(100, .005, (1000, 1000))
    init_var = pd.DataFrame({
        "mito": np.concatenate((np.ones(100, dtype=bool), np.zeros(900, dtype=bool)))
    })
    adata_dense = sc.AnnData(X=a, var=init_var.copy())
    sc.pp.calculate_qc_metrics(adata_dense, qc_vars=["mito"], inplace=True)
    for fmt in [sparse.csr_matrix, sparse.csc_matrix, sparse.coo_matrix]:
        adata = sc.AnnData(X=fmt(a), var=init_var.copy())
        sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True)
        assert np.allclose(adata.obs, adata_dense.obs)
        for col in adata.var:  # np.allclose doesn't like a mix of types
            assert np.allclose(adata.var[col], adata_dense.var[col])
def merge_matrix(ad, obskeys=None, use_raw=False, keep_only_mutual=False):
    '''Merge the matrices stored in ad.

    ad: dictionary of AnnData objects to merge
    obskeys: list of obs columns to merge across the AnnData objects
    use_raw: if True, merge from .raw.X
    keep_only_mutual: if True, keep only obs columns present in every sample'''
    from collections import Counter, defaultdict
    from scipy.sparse import vstack

    smp_list = list(ad.keys())
    obs_dict = defaultdict(list)
    obs_names = []
    for smp in smp_list:
        ad[smp].obs['name'] = smp
    if not obskeys:
        obskey_list = []
        obskeys = []
        for sample in smp_list:
            obskey_list.extend(list(ad[sample].obs.columns))
        for obskey, number in Counter(obskey_list).items():
            if number == len(smp_list):
                obskeys.append(obskey)
            elif not keep_only_mutual:
                # fill missing columns with a placeholder so all samples share the key
                for sample in smp_list:
                    if obskey not in ad[sample].obs.columns:
                        ad[sample].obs[obskey] = 'n/a'
                obskeys.append(obskey)
    for sample in smp_list:
        obs_names.extend(list(ad[sample].obs_names))
        for key in obskeys:
            obs_dict[key].extend(list(ad[sample].obs[key]))
    if use_raw:
        stack = vstack([ad[x].raw.X for x in smp_list])  # stack data
        adata = sc.AnnData(stack, var=ad[smp_list[0]].raw.var)
    else:
        stack = vstack([ad[x].X for x in smp_list])  # stack data
        adata = sc.AnnData(stack, var=ad[smp_list[0]].var)
    adata.obs_names = obs_names
    print(len(adata))
    for obs_col in obs_dict:
        print(obs_col)
        adata.obs[obs_col] = obs_dict[obs_col]
    return adata
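# A minimal usage sketch for merge_matrix, assuming two toy AnnData objects;
# the sample names "s1"/"s2" and the obs column "donor" are hypothetical.
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.sparse import csr_matrix

def example_merge_matrix():
    ad = {
        "s1": sc.AnnData(csr_matrix(np.eye(3)), obs=pd.DataFrame({"donor": ["a"] * 3})),
        "s2": sc.AnnData(csr_matrix(np.ones((2, 3)))),
    }
    # "donor" is missing from s2, so it is filled with 'n/a' unless keep_only_mutual=True
    merged = merge_matrix(ad)
    print(merged.obs["name"].tolist())   # ['s1', 's1', 's1', 's2', 's2']
    print(merged.obs["donor"].tolist())  # ['a', 'a', 'a', 'n/a', 'n/a']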
def poi_data_gen(p, x_grid, Nc=10000, Nr=5, G=2, require_X=False, sigma=0.2):
    # draw the true expression level of each gene from x_grid with probability p
    X = np.zeros([Nc, G], dtype=float)
    for i in range(G):
        X[:, i] = np.random.choice(x_grid, Nc, p=p, replace=True)
    # X[:, -1] = 1
    # X = (X.T / np.sum(X, axis=1)).T  # normalize to be a probability distribution
    new_Nr = Nr * Nc / X.sum()
    ## sample the size factor
    size_factor = np.random.randn(Nc) * sigma + 1
    size_factor = size_factor.clip(min=0.5)
    ## generating the reads
    Y = np.random.poisson((X.T * size_factor).T * new_Nr)
    Y = sp.sparse.csr_matrix(Y)
    ## assign some fake gene names
    gene_name = ['gene %d' % i for i in range(G)]
    var = pd.DataFrame(index=gene_name)
    data = sc.AnnData(Y, var=var)
    if require_X:
        return data, size_factor, X
    else:
        return data, size_factor
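# A minimal sketch of drawing a toy dataset from poi_data_gen; the grid and
# sampling probabilities below are arbitrary illustration values.
import numpy as np

def example_poi_data_gen():
    x_grid = np.array([1.0, 2.0, 4.0])
    p = np.array([0.5, 0.3, 0.2])  # sampling probabilities over x_grid
    data, size_factor = poi_data_gen(p, x_grid, Nc=100, Nr=5, G=2)
    print(data)             # AnnData with 100 cells x 2 genes, sparse counts
    print(size_factor[:5])  # per-cell size factors, clipped at 0.5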
def poi_data_gen_nd(p, val, Nc=10000, Nr=5, sigma=0.2, random_seed=0):
    """Add Poisson noise to the data."""
    np.random.seed(random_seed)
    val_size, G = val.shape
    rand_ind = np.random.choice(np.arange(val_size), Nc, p=p, replace=True)
    X = val[rand_ind, :]
    new_Nr = Nr * Nc / X.sum()
    ## sample the size factor
    size_factor = np.random.randn(Nc) * sigma + 1
    # size_factor = np.random.randn(Nc)*0 + 1
    size_factor = size_factor.clip(min=0.5)
    ## generating the reads
    Y = np.random.poisson((X.T * size_factor).T * new_Nr)
    Y = sp.sparse.csr_matrix(Y)
    ## assign some fake gene names
    gene_name = ['gene %d' % i for i in range(G)]
    var = pd.DataFrame(index=gene_name)
    data = sc.AnnData(Y, var=var)
    return data, size_factor
def creatadata(datadir=None, exprmatrix=None, expermatrix_filename="matrix.mtx", is_mtx=True,
               cell_info=None, cell_info_filename="barcodes.tsv", gene_info=None,
               gene_info_filename="genes.tsv", project_name=None):
    """Construct an AnnData object.

    Construct an AnnData object from data in memory or from files on disk.
    If datadir is a directory, it must contain at least "matrix.mtx" or data.txt
    (without any column names or row names, sep="\t").
    """
    if datadir is None and exprmatrix is None and expermatrix_filename is None:
        raise ValueError("Please provide either the expression matrix or the full path to the expression matrix!")
    if datadir is not None:
        cell_and_gene_file = [f for f in os.listdir(datadir) if os.path.isfile(os.path.join(datadir, f))]
        if os.path.isdir(datadir) and is_mtx:  # sparse
            print("Start to read expression data (matrix.mtx)")
            x = sc.read_mtx(os.path.join(datadir, expermatrix_filename)).X.T
        else:  # non-sparse: matrix without row names or column names
            x = pd.read_csv(os.path.join(datadir, expermatrix_filename), sep="\t", header=None)
        if cell_info_filename in cell_and_gene_file:
            cell_info = pd.read_csv(os.path.join(datadir, cell_info_filename), sep="\t", header=0, na_filter=False)
        if gene_info_filename in cell_and_gene_file:
            gene_info = pd.read_csv(os.path.join(datadir, gene_info_filename), sep="\t", header=0, na_filter=False)
    else:
        x = exprmatrix  # n*p matrix, cell * gene
    # fall back to generic names when no annotation was provided
    if cell_info is None:
        cell_info = pd.DataFrame(["cell_" + str(i) for i in range(1, x.shape[0] + 1)], columns=["cellname"])
    if gene_info is None:
        gene_info = pd.DataFrame(["gene_" + str(i) for i in range(1, x.shape[1] + 1)], columns=["genename"])
    adata = sc.AnnData(x, obs=cell_info, var=gene_info)
    adata.obs_names = adata.obs["cellname"] if "cellname" in adata.obs.keys() else adata.obs.index
    adata.var_names = adata.var["genename"] if "genename" in adata.var.keys() else adata.var.index
    adata.obs_names_make_unique(join="-")
    adata.var_names_make_unique(join="-")
    adata.uns["ProjectName"] = "DEC_clust_algorithm" if project_name is None else project_name
    return adata
def preprocess(X, nb_genes=500):
    """
    Preprocessing phase as proposed in the scanpy package.
    Keeps only the nb_genes most variable genes and normalizes
    the data to 0 mean and 1 std.
    Args:
        X ([type]): [description]
        nb_genes (int, optional): [description]. Defaults to 500.
    Returns:
        [type]: [description]
    """
    X = np.ceil(X).astype(int)  # np.int is deprecated; use the builtin int
    count_X = X
    print(X.shape, count_X.shape, f"keeping {nb_genes} genes")
    orig_X = X.copy()
    adata = sc.AnnData(X)
    adata = utils.normalize(adata,
                            copy=True,
                            highly_genes=nb_genes,
                            size_factors=True,
                            normalize_input=True,
                            logtrans_input=True)
    X = adata.X.astype(np.float32)
    return X
def impute_neighbor(bdata, n_neighbor=10):
    import multiprocessing as mp

    import scipy
    from scipy.spatial import cKDTree

    n_jobs = mp.cpu_count()
    # get the neighborhood structure based on the UMAP embedding
    ckd = cKDTree(bdata.obsm["X_umap"])
    ckdout = ckd.query(x=bdata.obsm["X_umap"], k=n_neighbor, n_jobs=n_jobs)
    indices = ckdout[1]

    # average raw expression over the k nearest neighbors, in chunks of 10000 cells
    sum_list = []
    for start in range(0, bdata.raw.X.shape[0], 10000):
        end = min(start + 10000, bdata.raw.X.shape[0])
        # note: the inner loop variable must not shadow the chunk index
        X_list = [bdata.raw.X[indices[start:end, j]] for j in range(n_neighbor)]
        X_sum = scipy.sparse.csr_matrix(np.sum(X_list) / n_neighbor)
        sum_list.append(X_sum)
        print(start)
    imputed = scipy.sparse.vstack(sum_list)

    idata = sc.AnnData(imputed)
    idata.obs = bdata.obs.copy()
    idata.var = bdata.raw.var.copy()
    idata.obsm = bdata.obsm.copy()
    idata.uns = bdata.uns.copy()
    return idata
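# A hedged usage sketch for impute_neighbor: it assumes bdata already carries
# a UMAP embedding in .obsm["X_umap"] and raw counts in .raw, as the function
# requires. The pbmc3k dataset and the preprocessing below are illustrative only.
import scanpy as sc

def example_impute_neighbor():
    bdata = sc.datasets.pbmc3k()
    bdata.raw = bdata
    sc.pp.normalize_total(bdata)
    sc.pp.log1p(bdata)
    sc.pp.pca(bdata)
    sc.pp.neighbors(bdata)
    sc.tl.umap(bdata)  # populates bdata.obsm["X_umap"]
    idata = impute_neighbor(bdata, n_neighbor=10)
    print(idata)  # same cells, expression averaged over 10 UMAP neighbors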
def filter_data(X, highly_genes=500):
    """
    Remove the less variable genes.
    Args:
        X ([type]): [description]
        highly_genes (int, optional): [description]. Defaults to 500.
    Returns:
        [type]: [description]
    """
    X = np.ceil(X).astype(int)  # np.int is deprecated; use the builtin int
    adata = sc.AnnData(X)
    sc.pp.filter_genes(adata, min_counts=3)
    sc.pp.filter_cells(adata, min_counts=1)
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata,
                                min_mean=0.0125,
                                max_mean=4,
                                min_disp=0.5,
                                n_top_genes=highly_genes,
                                subset=True)
    # the surviving obs/var names are the original integer indices
    genes_idx = np.array(adata.var_names.tolist()).astype(int)
    cells_idx = np.array(adata.obs_names.tolist()).astype(int)
    return genes_idx, cells_idx
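# A minimal sketch for filter_data, assuming X is a dense count matrix;
# random Poisson counts stand in for real data here.
import numpy as np

def example_filter_data():
    X = np.random.poisson(1.0, size=(200, 2000))
    genes_idx, cells_idx = filter_data(X, highly_genes=100)
    # the returned indices subset the original matrix
    X_filtered = X[cells_idx][:, genes_idx]
    print(X_filtered.shape)  # (n_kept_cells, 100)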
def preprocess(hdf5_file, out_path, n_top_genes):
    h5f = h5py.File(hdf5_file, 'r')
    matrix = h5f['matrix'][:]
    adata = sc.AnnData(matrix)
    print(adata.X.shape)
    # do not normalize after cell-cycle effects are regressed out
    # (negative values are introduced)
    # sc.pp.normalize_per_cell(adata)  # normalize with total UMI count per cell
    print(adata.X.shape)
    filter_result = sc.pp.filter_genes_dispersion(adata.X,
                                                  flavor='cell_ranger',
                                                  n_top_genes=n_top_genes,
                                                  log=False)
    # filter_result is a recarray; mask2 selects the n_top_genes most variable genes
    mask2 = filter_result.gene_subset
    adata = adata[:, mask2]
    # sc.pp.normalize_per_cell(adata)  # need to redo normalization after filtering
    # write the output hdf5 file
    matrix = adata.X
    f = h5py.File(out_path, "w")
    f.create_dataset(name='matrix', data=matrix)
    gg = f.create_group('gene_attrs')
    cg = f.create_group('cell_attrs')
    print(h5f['gene_attrs'].keys())
    for key in h5f['gene_attrs'].keys():
        # apply the mask to the gene attributes (h5py's .value is removed; use [()])
        gg.create_dataset(name=key, data=h5f['gene_attrs'][key][()][mask2])
    for key in h5f['cell_attrs'].keys():
        cg.create_dataset(name=key, data=h5f['cell_attrs'][key][()])
    f.close()
    h5f.close()
def balancer(self, data):
    class_names = np.unique(data.obs[self.cell_type_key])
    class_pop = {}
    for cls in class_names:
        class_pop[cls] = len(data[data.obs[self.cell_type_key] == cls])
    max_number = np.max(list(class_pop.values()))
    all_data_x = []
    all_data_label = []
    all_data_condition = []
    for cls in class_names:
        temp = data[data.obs[self.cell_type_key] == cls]
        # upsample each class (with replacement) to the size of the largest class
        index = np.random.choice(range(len(temp)), max_number)
        all_data_x.append(temp.X[index])
        all_data_label.append(np.repeat(cls, max_number))
        all_data_condition.append(np.repeat(np.unique(temp.obs["condition"]), max_number))
    balanced_data = sc.AnnData(np.concatenate(all_data_x))
    balanced_data.obs[self.cell_type_key] = np.concatenate(all_data_label)
    # bug fix: the condition column was previously filled with the labels
    balanced_data.obs["condition"] = np.concatenate(all_data_condition)
    class_names = np.unique(balanced_data.obs[self.cell_type_key])
    class_pop = {}
    for cls in class_names:
        class_pop[cls] = len(balanced_data[balanced_data.obs[self.cell_type_key] == cls])
    # print(class_pop)
    return balanced_data
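# A hedged sketch of how this balancer method behaves, assuming it lives on a
# class exposing a cell_type_key attribute; the Obj stand-in below is hypothetical.
import numpy as np
import pandas as pd
import scanpy as sc

def example_balancer():
    class Obj:
        cell_type_key = "cell_type"
    data = sc.AnnData(
        np.random.rand(30, 5),
        obs=pd.DataFrame({
            "cell_type": ["A"] * 25 + ["B"] * 5,
            "condition": ["ctrl"] * 30,
        }),
    )
    balanced = balancer(Obj(), data)
    # both classes are upsampled to the size of the largest class (25 each)
    print(balanced.obs["cell_type"].value_counts())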
def regress_batch_v2(adata, batch_key, confounder_key):
    '''Batch regression tool.

    batch_key: list of observation categories to be regressed out
    confounder_key: list of observation categories to be kept
    returns ndata with corrected X'''
    from sklearn.linear_model import Ridge

    dummy = pd.get_dummies(adata.obs[batch_key + confounder_key], drop_first=False)
    X_exp = adata.X  # scaled data
    if scipy.sparse.issparse(X_exp):
        X_exp = X_exp.todense()
    LR = Ridge(fit_intercept=False, alpha=1.0)
    LR.fit(dummy, X_exp)
    if len(batch_key) > 1:
        batch_index = np.logical_or.reduce(
            np.vstack([dummy.columns.str.startswith(x) for x in batch_key]))
    else:
        batch_index = np.vstack([dummy.columns.str.startswith(x) for x in batch_key])[0]
    dm = np.array(dummy)[:, batch_index]
    # subtract the variation explained by the batch covariates
    X_explained = dm.dot(LR.coef_[:, batch_index].T)
    X_remain = X_exp - X_explained
    ndata = sc.AnnData(X_remain)
    ndata.obs = adata.obs
    ndata.var = adata.var
    return ndata, X_explained
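# A minimal sketch for regress_batch_v2 on toy data; the obs columns "batch"
# and "cell_type" are illustrative. The dummy-encoded batch and confounder
# columns together form the Ridge design matrix.
import numpy as np
import pandas as pd
import scanpy as sc

def example_regress_batch_v2():
    obs = pd.DataFrame({
        "batch": ["b1"] * 50 + ["b2"] * 50,
        "cell_type": ["t1", "t2"] * 50,
    })
    adata = sc.AnnData(np.random.rand(100, 20), obs=obs)
    ndata, X_explained = regress_batch_v2(adata,
                                          batch_key=["batch"],
                                          confounder_key=["cell_type"])
    print(ndata.X.shape, X_explained.shape)  # (100, 20) (100, 20)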
def save_generated_cells(fake_cells, file_name, fake_labels=None):
    """
    Function that writes a gene expression matrix and the associated
    cluster indices into a file. Check the AnnData documentation of the
    write method for the supported formats.

    Parameters
    ----------
    fake_cells : 2-D array
        A matrix (cells x genes) containing the expression levels.
        It can be dense or sparse. It will be encoded in a sparse format.
    file_name : str
        Path of the file to write to.
    fake_labels : array
        An array containing the cluster indices of the corresponding cells.
        Default is None.

    Returns
    -------
    """
    s_gen_mat = sp_sparse.csr_matrix(fake_cells)
    sc_fake = sc.AnnData(s_gen_mat)
    if fake_labels is not None:
        groups = fake_labels.astype('U')
        unique_groups = np.unique(groups)
        sc_fake.obs['cluster'] = pd.Categorical(values=groups,
                                                categories=natsorted(unique_groups))
    sc_fake.obs_names = np.repeat('fake', sc_fake.shape[0])
    sc_fake.obs_names_make_unique()
    sc_fake.write(file_name)
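# A minimal sketch for save_generated_cells; the file name is arbitrary and
# the fake labels are random cluster indices.
import numpy as np

def example_save_generated_cells():
    fake_cells = np.random.rand(50, 10)
    fake_labels = np.random.randint(0, 3, size=50)
    save_generated_cells(fake_cells, "fake_cells.h5ad", fake_labels)
    # the file now holds a sparse AnnData with an obs['cluster'] column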
def get_common_var_raw(a, b):
    common = sorted(set(a.raw.var_names).intersection(set(b.raw.var_names)))
    list_a_names = list(a.raw.var_names)
    list_b_names = list(b.raw.var_names)
    a_index = np.array([list_a_names.index(x) for x in common])
    b_index = np.array([list_b_names.index(x) for x in common])
    print('calculating a...')
    a_new_X = a.raw.X[:, a_index]
    print('calculating b...')
    b_new_X = b.raw.X[:, b_index]
    a_new = sc.AnnData(a_new_X, obs=a.obs)
    a_new.obsm = a.obsm
    a_new.var_names = common
    b_new = sc.AnnData(b_new_X, obs=b.obs)
    b_new.obsm = b.obsm
    b_new.var_names = common
    return a_new, b_new
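# A hedged sketch for get_common_var_raw, assuming both AnnData objects carry
# a .raw attribute; the toy gene names are illustrative.
import numpy as np
import pandas as pd
import scanpy as sc

def example_get_common_var_raw():
    a = sc.AnnData(np.random.rand(5, 3), var=pd.DataFrame(index=["g1", "g2", "g3"]))
    b = sc.AnnData(np.random.rand(4, 3), var=pd.DataFrame(index=["g2", "g3", "g4"]))
    a.raw, b.raw = a, b
    a_new, b_new = get_common_var_raw(a, b)
    print(list(a_new.var_names))  # ['g2', 'g3'] -- the shared genes, sorted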
def train(data_name="pbmc", cell_type="CD4T", p_type="unbiased"): train_path = f"../data/train_{data_name}.h5ad" if data_name == "pbmc": ctrl_key = "control" stim_key = "stimulated" cell_type_key = "cell_type" elif data_name == "hpoly": ctrl_key = "Control" stim_key = "Hpoly.Day10" cell_type_key = "cell_label" elif data_name == "salmonella": ctrl_key = "Control" stim_key = "Salmonella" cell_type_key = "cell_label" data = sc.read(train_path) print("data has been loaded!") train = data[~((data.obs["condition"] == stim_key) & (data.obs[cell_type_key] == cell_type))] pca = PCA(n_components=100) pca.fit(train.X.A) train_real_cd = train[train.obs["condition"] == "control", :] if p_type == "unbiased": train_real_cd = scgen.util.balancer(train_real_cd) train_real_stimulated = train[train.obs["condition"] == "stimulated", :] if p_type == "unbiased": train_real_stimulated = scgen.util.balancer(train_real_stimulated) import scipy.sparse as sparse if sparse.issparse(train_real_cd.X): train_real_cd.X = train_real_cd.X.A train_real_stimulated.X = train_real_stimulated.X.A train_real_stimulated_PCA = pca.transform(train_real_stimulated.X) train_real_cd_PCA = pca.transform(train_real_cd.X) adata_list = scgen.util.extractor(data, cell_type, { "ctrl": ctrl_key, "stim": stim_key }) if sparse.issparse(adata_list[1].X): adata_list[1].X = adata_list[1].X.A adata_list[2].X = adata_list[2].X.A ctrl_CD4T_PCA = pca.transform(adata_list[1].X) predicted_cells = predict(pca, train_real_cd_PCA, train_real_stimulated_PCA, ctrl_CD4T_PCA, p_type) all_Data = sc.AnnData( np.concatenate([adata_list[1].X, adata_list[2].X, predicted_cells])) all_Data.obs["condition"] = ["ctrl"] * len(adata_list[1].X) + ["real_stim"] * len(adata_list[2].X) + \ ["pred_stim"] * len(predicted_cells) all_Data.var_names = adata_list[3].var_names if p_type == "unbiased": sc.write(f"../data/reconstructed/PCAVecArithm/PCA_CD4T.h5ad", all_Data) else: sc.write(f"../data/reconstructed/PCAVecArithm/PCA_CD4T_biased.h5ad", all_Data)
def get_subset(idata, select, cc_genes=cc_genes, log=False, raw=True):
    if raw:
        adata = sc.AnnData(idata[select].raw.X)
        adata.var = idata.raw.var
    else:
        adata = sc.AnnData(idata[select].X)
        adata.var = idata.var
    adata.obs = idata.obs[select]
    adata.raw = adata.copy()
    # adata.X = scipy.sparse.csr_matrix(np.exp(adata.X.todense()) - 1)
    sc.pp.filter_genes_dispersion(adata, log=log)
    if log:
        sc.pp.log1p(adata)
    sc.pp.scale(adata, max_value=10)
    if len(cc_genes) > 0:
        remove_geneset(adata, cc_genes)
    sc.pp.pca(adata, n_comps=np.min([50, adata.X.shape[0], adata.X.shape[1]]))
    return adata
def load_klein():
    df_klein = pd.read_csv('/data/martin/single_cell/klein/data', sep=',')
    index_name = list(df_klein.iloc[:, 0])
    # .as_matrix() was removed from pandas; use .to_numpy() instead
    mat_klein = np.array(df_klein.iloc[:, 1:].to_numpy(), dtype=int).T
    # convert to AnnData
    temp = sp.sparse.csr_matrix(mat_klein)
    data_klein = sc.AnnData(temp)
    data_klein.var_names = index_name
    return data_klein
def read_dataset(input_file, transpose=False):
    """
    Construct an AnnData object.
    """
    if os.path.isfile(input_file):
        print("The value of os.path.isfile is", os.path.isfile(input_file))
        if str(input_file).endswith('h5ad'):
            adata = sc.read(input_file)
        elif sum([str(input_file).endswith(str(i)) for i in ["tsv", 'TSV', 'tab', 'data']]):
            adata = sc.read_text(input_file, sep="\t", first_column_names=True)
            if transpose:
                adata = adata.T
        elif sum([str(input_file).endswith(str(i)) for i in ['csv', "CSV"]]):
            adata = sc.read_text(input_file, sep=",", first_column_names=True)
            if transpose:
                adata = adata.T
        else:
            print("The file must be one of *.h5ad, *.tsv, *.TSV, *.tab, *.data, *.csv, *.CSV")
    else:  # read a 10x-style folder
        mtx = sc.read_mtx(os.path.join(input_file, "matrix.mtx"))
        num_lines = sum(1 for line in open(os.path.join(input_file, 'barcodes.tsv')))
        cellinfo = pd.read_csv(os.path.join(input_file, "barcodes.tsv"),
                               sep="\t",
                               header=None if num_lines == mtx.shape[1] else 0)
        if 'cellname' not in cellinfo.columns:
            cellinfo['cellname'] = cellinfo.iloc[:, 0]
        num_lines = sum(1 for line in open(os.path.join(input_file, 'genes.tsv')))
        geneinfo = pd.read_csv(os.path.join(input_file, "genes.tsv"),
                               sep="\t",
                               header=None if num_lines == mtx.shape[0] else 0)
        if 'genename' not in geneinfo.columns:
            # for 10x, the first column is the gene id and the second is the gene name
            geneinfo['genename'] = geneinfo.iloc[:, 1]
        # create the AnnData object
        adata = sc.AnnData(mtx.X.T, obs=cellinfo, var=geneinfo)
        adata.obs_names = adata.obs["cellname"]
        adata.var_names = adata.var["genename"]
        adata.obs_names_make_unique(join="-")
        adata.var_names_make_unique(join="-")
    # record the creation time
    now = datetime.datetime.now()
    adata.uns["ProjectName"] = "DESC created in " + str(now.strftime("%Y-%m-%d %H:%M"))
    print("Created adata successfully! The adata information is", adata)
    return adata
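# A minimal usage sketch for read_dataset; the paths below are hypothetical.
# The folder branch expects a 10x-style layout: matrix.mtx, barcodes.tsv, genes.tsv.
def example_read_dataset():
    # from a single file
    adata = read_dataset("expression.h5ad")
    # or from a 10x-style directory
    adata = read_dataset("filtered_gene_bc_matrices/hg19/")
    print(adata)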
def load_klein_ercc():
    df_klein_ercc = pd.read_csv('/data/martin/single_cell/ERCC_data/ERCC/klein.txt', sep=' ')
    index_name = list(df_klein_ercc.index)
    # .as_matrix() was removed from pandas; use .to_numpy() instead
    mat_klein_ercc = np.array(df_klein_ercc.to_numpy()).T
    # convert to AnnData
    temp = sp.sparse.csr_matrix(mat_klein_ercc)
    data_klein_ercc = sc.AnnData(temp)
    data_klein_ercc.var_names = index_name
    return data_klein_ercc
def remove_cell_cycle(input_file, out_file):
    h5f = h5py.File(input_file, 'r')
    matrix = h5f['matrix'][:]
    gene_names = h5f['gene_attrs']['gene_names'][()]  # h5py's .value is removed; use [()]
    decoder = np.vectorize(lambda t: t.decode('UTF-8'))
    gene_names = decoder(gene_names)
    adata = sc.AnnData(X=matrix, var=gene_names)
    # Load cell cycle genes defined in [Tirosh et al, 2015](https://doi.org/10.1126/science.aad0501).
    # It is a list of 97 genes, represented by their gene symbol.
    cell_cycle_genes = [x.strip() for x in open('./data/regev_lab_cell_cycle_genes.txt')]
    s_genes = cell_cycle_genes[:43]
    g2m_genes = cell_cycle_genes[43:]
    cell_cycle_genes = [x for x in cell_cycle_genes if x in adata.var[0].values]
    # this is needed, otherwise scanpy cannot tell the index
    adata.var_names = gene_names
    # log-transformation of data and scaling should always be performed before scoring
    # sc.pp.log1p(adata)
    sc.pp.normalize_per_cell(adata)
    # sc.pp.scale(adata)
    # calculate the cell cycle scores and regress them out
    sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
    sc.pp.regress_out(adata, ['S_score', 'G2M_score'])
    # sc.pp.scale(adata)
    matrix = adata.X
    cell_phase = np.array(adata.obs['phase'].values, dtype='S10')
    # write the output
    f = h5py.File(out_file, "w")
    f.create_dataset(name='matrix', data=matrix)
    gg = f.create_group('gene_attrs')
    cg = f.create_group('cell_attrs')
    cg.create_dataset(name='cell_phase', data=cell_phase)
    for key in h5f['gene_attrs'].keys():
        gg.create_dataset(name=key, data=h5f['gene_attrs'][key][()])
    for key in h5f['cell_attrs'].keys():
        cg.create_dataset(name=key, data=h5f['cell_attrs'][key][()])
    f.close()
    h5f.close()
def load(
    loc="data_files",
    blocksize=1000000,
    anndata_write=True,
    anndata_name="mouse_retina.h5ad",
    X_dtype=np.float32,
):
    adata_fpath = os.path.join(loc, anndata_name)
    # if we've already downloaded and constructed the adata file, read it and use it
    if os.path.exists(adata_fpath) and os.path.isfile(adata_fpath):
        print("reading saved anndata h5ad file")
        adata = sc.read_h5ad(adata_fpath)
    # if the anndata file doesn't exist already, download inputs and construct it
    else:
        # download files if they don't exist locally
        if not os.path.exists(loc):
            os.makedirs(loc)
        files = {
            "10x_mouse_retina_development.mtx":
                "https://www.dropbox.com/s/6d76z4grcnaxgcg/10x_mouse_retina_development.mtx?dl=1",
            "10x_mouse_retina_development_phenotype.csv":
                "https://www.dropbox.com/s/y5lho9ifzoktjcs/10x_mouse_retina_development_phenotype.csv?dl=1",
            "10x_mouse_retina_development_feature.csv":
                "https://www.dropbox.com/s/1mc4geu3hixrxhj/10x_mouse_retina_development_feature.csv?dl=1",
        }
        print("downloading data files")
        for fname, url in files.items():
            if not os.path.exists(os.path.join(loc, fname)):
                download_file(url, loc=loc, blocksize=blocksize)
        # read in data
        print("reading data files")
        df_obs = pd.read_csv(
            os.path.join(loc, "10x_mouse_retina_development_phenotype.csv"), index_col=0
        )[["barcode", "sample", "age", "CellType"]]
        df_var = pd.read_csv(
            os.path.join(loc, "10x_mouse_retina_development_feature.csv"), index_col=0
        )[["id", "gene_short_name"]]
        count_mat = mmread(os.path.join(loc, "10x_mouse_retina_development.mtx"))
        # make anndata object
        print("constructing anndata object")
        adata = sc.AnnData(
            X=count_mat.toarray().astype(X_dtype).transpose(), obs=df_obs, var=df_var
        )
        # drop all-zero genes and all-zero cells
        genes_to_keep = np.mean(adata.X != 0, axis=0) > 0
        cells_to_keep = np.mean(adata.X != 0, axis=1) > 0
        adata = adata[:, genes_to_keep][cells_to_keep, :].copy()
        # save a local copy
        if anndata_write:
            print("saving anndata h5ad file")
            adata.write(adata_fpath)
    return adata
def load_svensson_2x():
    df_s2_ercc = pd.read_csv('/data/martin/single_cell/ERCC_data/ERCC/svensson2X.txt', sep=' ')
    index_name = list(df_s2_ercc.index)
    # .as_matrix() was removed from pandas; use .to_numpy() instead
    mat_s2_ercc = np.array(df_s2_ercc.to_numpy()).T
    # convert to AnnData
    temp = sp.sparse.csr_matrix(mat_s2_ercc)
    data_s2_ercc = sc.AnnData(temp)
    data_s2_ercc.var_names = index_name
    return data_s2_ercc
def to_AnnData(Y, gene_list=None):
    """Convert an ndarray to AnnData with sparse csr reads."""
    Y = sp.sparse.csr_matrix(Y)
    if gene_list is None:
        gene_list = ['gene %d' % i for i in range(Y.shape[1])]
    var = pd.DataFrame(index=gene_list)
    data = sc.AnnData(Y, var=var)
    return data
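# A minimal sketch for to_AnnData; the counts are random illustration values.
import numpy as np

def example_to_AnnData():
    Y = np.random.poisson(1.0, size=(100, 4))
    data = to_AnnData(Y, gene_list=['g0', 'g1', 'g2', 'g3'])
    print(data)  # AnnData with 100 cells x 4 genes, X stored as sparse csr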
def load(
    split="train",
    original_fpath="/allen/aics/modeling/data/scRNAseq_SeeligCollaboration/data_for_modeling/scrnaseq_cardio_20181129.h5ad",
    cache_dir="data_cache",
    cache=True,
    selected_genes_path=None,
    threshold=0,
):
    """
    Load the requested split of the cardio data, where the whole dataset
    originated at original_fpath.
    Looks for a local cache of the split, and if it can't find one,
    makes the split on the fly.
    If cache=True, caches the result in cache_dir for next time.
    Loads raw count values.
    """
    original_fname = os.path.basename(original_fpath)
    original_bname, original_ext = os.path.splitext(original_fname)
    target_fname = "{0}_{1}{2}".format(original_bname, split, original_ext)
    target_fpath = os.path.join(cache_dir, target_fname)
    if not os.path.exists(target_fpath):
        adata_in = sc.read_h5ad(original_fpath)
        adata_raw = sc.AnnData(
            X=adata_in.raw.X.todense(),
            obs=adata_in.obs,
            var=adata_in.var,
            uns=adata_in.uns,
        )
        split_inds, split_adata = split_anndata(adata_raw)
        if cache:
            write_splits(
                split_inds_dict=split_inds,
                split_adata_dict=split_adata,
                basename=original_bname,
                out_dir=cache_dir,
            )
    adata = sc.read_h5ad(target_fpath)
    if selected_genes_path is not None:
        # restrict to the protein-coding genes listed in the file
        df = pd.read_csv(selected_genes_path, delimiter="\t")
        coding_genes = df["Gene name"].unique()
        coding_genes = [str(g) + "_HUMAN" for g in coding_genes]
        cols = np.array([c for c in adata.var.index if c in coding_genes])
        adata = adata[:, cols]
        # keep genes expressed in more than `threshold` of the cells
        gene_nz_freq = (adata.X > 0).mean(axis=0)
        adata = adata[:, cols[gene_nz_freq > threshold]]
    return adata
def DCATransform(sc_data_matrix):
    # create a scanpy AnnData object (cells x genes after the transpose)
    sc_data_matrix = sc.AnnData(numpy.transpose(sc_data_matrix.values))
    # filter out genes with zero counts (min_counts=1 keeps genes seen at least once)
    sc.pp.filter_genes(data=sc_data_matrix, min_counts=1)
    # apply the DCA transform
    dca(adata=sc_data_matrix, threads=4, epochs=10)
    print("DCA Denoised data prepared")
    return numpy.transpose(sc_data_matrix.X)
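# A hedged sketch for DCATransform, assuming sc_data_matrix is a pandas
# DataFrame of genes x cells (it is transposed internally) and that dca comes
# from the dca package (from dca.api import dca). The counts are illustrative.
import numpy as np
import pandas as pd

def example_DCATransform():
    counts = pd.DataFrame(np.random.poisson(1.0, size=(100, 50)))  # genes x cells
    denoised = DCATransform(counts)
    print(denoised.shape)  # genes x cells, denoised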
def train(data_name="pbmc", cell_type="CD4T", p_type="unbiased"): train_path = f"../data/train_{data_name}.h5ad" if data_name == "pbmc": ctrl_key = "control" stim_key = "stimulated" cell_type_key = "cell_type" elif data_name == "hpoly": ctrl_key = "Control" stim_key = "Hpoly.Day10" cell_type_key = "cell_label" elif data_name == "salmonella": ctrl_key = "Control" stim_key = "Salmonella" cell_type_key = "cell_label" data = sc.read(train_path) print("data has been loaded!") ctrl_cell = data[(data.obs["condition"] == ctrl_key) & (data.obs[cell_type_key] == cell_type)] stim_cell = data[(data.obs["condition"] == stim_key) & (data.obs[cell_type_key] == cell_type)] train_real_cd = data[data.obs["condition"] == "control", :] if p_type == "unbiased": train_real_cd = scgen.util.balancer(train_real_cd) train_real_stimulated = data[data.obs["condition"] == "stimulated", :] train_real_stimulated = train_real_stimulated[train_real_stimulated.obs["cell_type"] != "CD4T"] if p_type == "unbiased": train_real_stimulated = scgen.util.balancer(train_real_stimulated) import scipy.sparse as sparse if sparse.issparse(train_real_cd.X): train_real_cd = train_real_cd.X.A train_real_stimulated = train_real_stimulated.X.A else: train_real_cd = train_real_cd.X train_real_stimulated = train_real_stimulated.X if sparse.issparse(ctrl_cell.X): ctrl_cell.X = ctrl_cell.X.A stim_cell.X = stim_cell.X.A predicted_cells = predict(train_real_cd, train_real_stimulated, ctrl_cell.X) print("Prediction has been finished") all_Data = sc.AnnData(np.concatenate([ctrl_cell.X, stim_cell.X, predicted_cells])) all_Data.obs["condition"] = ["ctrl"] * ctrl_cell.shape[0] + ["real_stim"] * stim_cell.shape[0] + \ ["pred_stim"] * len(predicted_cells) all_Data.var_names = ctrl_cell.var_names if p_type == "unbiased": sc.write(f"../data/reconstructed/VecArithm/VecArithm_CD4T.h5ad", all_Data) else: sc.write(f"../data/reconstructed/VecArithm/VecArithm_CD4T_biased.h5ad", all_Data)
def init_scanpy(data, col_names, head_name, true_labels, fin, k=30, n_pcs=20, computeEmbedding=True):
    # use the cell of the head group closest to all others as the root cell
    head_idx = np.where(true_labels == head_name)[0]
    if len(head_idx) > 1:
        D = pairwise_distances(data[head_idx, :], metric='euclidean')
        iroot = head_idx[np.argmin(D.sum(axis=0))]
    else:
        iroot = head_idx[0]
    adata = sc.AnnData(data)
    adata.var_names = col_names
    adata.obs['labels'] = true_labels
    adata.uns['iroot'] = iroot
    if computeEmbedding:
        if n_pcs:
            sc.pp.pca(adata, n_comps=n_pcs)
            sc.pp.neighbors(adata, n_neighbors=k, n_pcs=n_pcs)
        else:
            sc.pp.neighbors(adata, n_neighbors=k)
        sc.tl.louvain(adata, resolution=0.9)
        louvain_labels = np.array(list(adata.obs['louvain']))
        sc.tl.paga(adata)
        sc.tl.draw_graph(adata)
        sc.tl.diffmap(adata)
        sc.tl.tsne(adata)
        sc.tl.umap(adata)
        sc.tl.pca(adata, n_comps=2)
        sc.pl.paga(adata)
        sc.tl.draw_graph(adata, init_pos='paga')
    else:
        louvain_labels = []
    sc.settings.figdir = fin
    sc.settings.autosave = True
    # sc.settings.set_figure_params(dpi=80, dpi_save=300, color_map='Set1', format='pdf')
    sc.settings.set_figure_params(dpi=80, dpi_save=300, format='pdf')
    return adata, iroot, louvain_labels
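# A hedged usage sketch for init_scanpy on random data; the label vector, head
# group name, and output folder are all illustrative (computeEmbedding=True runs
# the full louvain/paga/umap pipeline, which needs the optional scanpy extras).
import numpy as np

def example_init_scanpy():
    data = np.random.rand(200, 50)
    col_names = ["gene_%d" % i for i in range(50)]
    true_labels = np.array(["head"] * 20 + ["rest"] * 180)
    adata, iroot, louvain_labels = init_scanpy(
        data, col_names, "head", true_labels, fin="./figures",
        k=15, n_pcs=10, computeEmbedding=True)
    print(iroot, len(louvain_labels))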
def run_leiden(data, params={}):
    """
    Performs Leiden community detection on the given data.
    Args:
        data ([type]): [description]
        params (dict, optional): keyword arguments passed to sc.tl.leiden.
    Returns:
        [type]: [description]
    """
    # scanpy.api is deprecated; import scanpy directly
    import scanpy as sc
    adata = sc.AnnData(data)
    sc.pp.neighbors(adata, use_rep='X', n_neighbors=300, n_pcs=0)
    sc.tl.leiden(adata, **params)
    pred = adata.obs['leiden'].to_list()
    pred = [int(x) for x in pred]
    return pred
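# A minimal sketch for run_leiden; with n_neighbors=300 hardcoded inside, the
# toy dataset needs more than 300 points. The resolution value is illustrative
# and is passed through **params to sc.tl.leiden.
import numpy as np

def example_run_leiden():
    data = np.random.rand(500, 10)
    pred = run_leiden(data, params={"resolution": 0.5})
    print(len(pred), len(set(pred)))  # 500 points, number of communities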
def createAnnDataObject(cell_file, feature_file, count_file, feature_name):
    # read in files
    cell = pd.read_csv(cell_file, sep=',')
    feature = pd.read_csv(feature_file, sep=',')
    count = scipy.io.mmread(count_file)
    # transpose so that each row corresponds to a cell and each column to a gene or peak
    adata_t = sc.AnnData(count.toarray())
    adata = sc.AnnData.transpose(adata_t)
    # set indices for obs and var
    cell.set_index('sample', inplace=True)
    feature.set_index(feature_name, inplace=True)
    adata.obs = cell
    adata.var = feature
    return adata
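# A minimal sketch for createAnnDataObject; the file names and the 'sample'
# index column follow what the function expects, but the paths are illustrative.
def example_createAnnDataObject():
    adata = createAnnDataObject(
        cell_file="cells.csv",      # must contain a 'sample' column
        feature_file="peaks.csv",   # must contain the feature_name column
        count_file="counts.mtx",    # features x cells sparse matrix
        feature_name="peak_id",
    )
    print(adata)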