def zeisel():
    """Prepare the Zeisel dataset.

    Cell types in the mouse cortex and hippocampus revealed by single-cell
    RNA-seq
    by Zeisel, et al. in Science. 2015.

    The tab-separated source file is read twice: once for the expression
    values (skipping the ten lines that follow the header) and once for those
    same ten lines, which are used as per-cell annotations.  The resulting
    AnnData is passed through ``pr.read.process_clusts`` on the ``"group"``
    column, written to ``data/zeisel/zeisel.h5ad``, and random folds are saved
    to ``output/zeisel_folds.npz``.
    """
    source_path = "data/zeisel/expression_mRNA_17-Aug-2014.txt"

    # Expression block: file lines 1-10 (the ones right after the header) are
    # skipped here and loaded separately below.  The table is transposed so
    # that the file's columns become rows.
    df = pd.read_csv(
        source_path,
        sep="\t",
        header=0,
        index_col=0,
        skiprows=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    ).T

    # The first transposed row is dropped from both the values and the names.
    adata = AnnData(df.values[1:, :])
    adata.obs_names = df.index[1:]
    adata.var_names = df.columns

    # Annotation block: only the ten lines after the header, indexed by the
    # file's second column, transposed the same way.
    anndf = pd.read_csv(
        source_path,
        sep="\t",
        header=0,
        index_col=1,
        nrows=10,
    ).T
    # Drop the first row and the last column of the transposed annotations.
    annotations = anndf.iloc[1:, :-1]

    # (obs column, annotation column) pairs, assigned in the original order.
    obs_columns = (
        ("group", "group #"),
        ("sex", "sex"),
        ("tot mRNA", "total mRNA mol"),
        ("age", "age"),
        ("diameter", "diameter"),
    )
    for obs_key, annotation_key in obs_columns:
        adata.obs[obs_key] = annotations[annotation_key]

    pr.read.process_clusts(adata, "group")
    sc.write("data/zeisel/zeisel.h5ad", adata)

    fold_tester = pr.performance.FoldTester(adata)
    fold_tester.makefolds(random=True)
    fold_tester.savefolds("output/zeisel_folds.npz")
def green():
    """Prepare the Green dataset.

    A Comprehensive Roadmap of Murine Spermatogenesis Defined by Single-Cell
    RNA-Seq
    by Green et al. in Developmental Cell. 2018.

    Loads the merged expression table, stores it as a CSC sparse matrix,
    left-joins the per-cell attribute table onto ``obs`` by cell barcode,
    writes ``data/green/green.h5ad`` and saves random folds to
    ``output/green_folds.npz``.
    """
    counts_path = "data/green/GSE112393_MergedAdultMouseST25_DGE.txt.gz"
    attributes_path = "data/green/GSE112393_MergedAdultMouseST25_PerCellAttributes.txt.gz"

    # The table is transposed after loading, then converted to CSC storage.
    adata = sc.read_csv(counts_path, delimiter="\t").T
    adata.X = scipy.sparse.csc_matrix(adata.X)

    # The first three lines of the attribute file are skipped; the barcode
    # column becomes the index used for the join.
    cell_attributes = pd.read_csv(attributes_path, sep="\t", skiprows=3)
    cell_attributes = cell_attributes.set_index("#CellBarcode")

    # validate="1:1" makes pandas raise if either index has duplicate keys.
    adata.obs = adata.obs.merge(
        cell_attributes,
        how="left",
        left_index=True,
        right_index=True,
        validate="1:1",
    )

    # NOTE(review): the file is written *before* process_clusts runs, so the
    # saved .h5ad reflects the state prior to that call — confirm this
    # ordering is intended.
    sc.write("data/green/green.h5ad", adata)
    pr.performance.process_clusts(adata, "CellType")

    fold_tester = pr.performance.FoldTester(adata)
    fold_tester.makefolds(random=True)
    fold_tester.savefolds("output/green_folds.npz")
def train(data_name="pbmc", cell_type="CD4T", p_type="unbiased"):
    """Predict stimulated cells for one cell type with PCA-space arithmetic.

    Loads ``../data/train_{data_name}.h5ad``, holds out the stimulated cells
    of ``cell_type``, fits a 100-component PCA on the remaining cells,
    projects the control and stimulated training cells and the held-out
    cell type's control cells, and hands those projections to ``predict``
    (defined elsewhere in this module).  Control, real stimulated and
    predicted cells are stacked into one AnnData and written to disk.

    Args:
        data_name: One of ``"pbmc"``, ``"hpoly"`` or ``"salmonella"``; selects
            the input file and the condition / cell-type column names.
        cell_type: Cell type whose stimulated cells are held out.
        p_type: ``"unbiased"`` runs ``scgen.util.balancer`` on both training
            groups and writes ``PCA_CD4T.h5ad``; any other value skips the
            balancing and writes ``PCA_CD4T_biased.h5ad``.

    Raises:
        ValueError: If ``data_name`` is not one of the supported datasets.
    """
    import scipy.sparse as sparse

    def to_dense(matrix):
        # PCA and np.concatenate below need dense arrays; leave dense input
        # untouched instead of assuming the matrix is always sparse.
        return matrix.toarray() if sparse.issparse(matrix) else matrix

    # (control label, stimulated label, cell-type column) per dataset.
    dataset_keys = {
        "pbmc": ("control", "stimulated", "cell_type"),
        "hpoly": ("Control", "Hpoly.Day10", "cell_label"),
        "salmonella": ("Control", "Salmonella", "cell_label"),
    }
    try:
        ctrl_key, stim_key, cell_type_key = dataset_keys[data_name]
    except KeyError:
        raise ValueError(
            f"unknown data_name {data_name!r}; expected one of {sorted(dataset_keys)}"
        ) from None

    train_path = f"../data/train_{data_name}.h5ad"
    data = sc.read(train_path)
    print("data has been loaded!")

    # Everything except the stimulated cells of the held-out cell type.
    held_out = (data.obs["condition"] == stim_key) & (data.obs[cell_type_key] == cell_type)
    train_adata = data[~held_out]

    pca = PCA(n_components=100)
    pca.fit(to_dense(train_adata.X))

    # Use the dataset-specific condition labels; the literal
    # "control"/"stimulated" only match the pbmc dataset.
    # NOTE(review): balancer is called without key arguments — confirm it
    # handles the "cell_label" column used by hpoly / salmonella.
    train_real_cd = train_adata[train_adata.obs["condition"] == ctrl_key, :]
    if p_type == "unbiased":
        train_real_cd = scgen.util.balancer(train_real_cd)
    train_real_stimulated = train_adata[train_adata.obs["condition"] == stim_key, :]
    if p_type == "unbiased":
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)

    train_real_stimulated_pca = pca.transform(to_dense(train_real_stimulated.X))
    train_real_cd_pca = pca.transform(to_dense(train_real_cd.X))

    # Items 1, 2 and 3 of the extractor result are used as control cells,
    # stimulated cells and the source of var_names respectively.
    adata_list = scgen.util.extractor(data, cell_type, {"ctrl": ctrl_key, "stim": stim_key})
    ctrl_x = to_dense(adata_list[1].X)
    stim_x = to_dense(adata_list[2].X)
    ctrl_pca = pca.transform(ctrl_x)

    predicted_cells = predict(pca, train_real_cd_pca, train_real_stimulated_pca, ctrl_pca, p_type)

    all_data = sc.AnnData(np.concatenate([ctrl_x, stim_x, predicted_cells]))
    all_data.obs["condition"] = (
        ["ctrl"] * len(ctrl_x)
        + ["real_stim"] * len(stim_x)
        + ["pred_stim"] * len(predicted_cells)
    )
    all_data.var_names = adata_list[3].var_names

    # NOTE(review): the output name always says "CD4T" regardless of
    # cell_type / data_name; kept as-is so existing readers of these files
    # keep working.
    if p_type == "unbiased":
        sc.write("../data/reconstructed/PCAVecArithm/PCA_CD4T.h5ad", all_data)
    else:
        sc.write("../data/reconstructed/PCAVecArithm/PCA_CD4T_biased.h5ad", all_data)
def reconstruct():
    """Run the PCA-arithmetic prediction for every cell type in the PBMC data.

    For each cell type in ``../data/train_pbmc.h5ad`` the stimulated cells of
    that type are held out, a 100-component PCA is fitted on the rest, and
    ``predict`` (defined elsewhere in this module) is called with the PCA
    projections of the balanced control cells, the balanced stimulated cells
    of the other types, and the held-out type's control cells.  Control, real
    stimulated and predicted cells are labelled per cell type, chained
    together with ``AnnData.concatenate`` and written to
    ``../data/reconstructed/PCAVecArithm/PCA_pbmc.h5ad``.
    """
    # Imported locally so the function does not depend on a module-level
    # ``sparse`` name being present.
    from scipy import sparse

    def to_dense(matrix):
        # PCA and np.concatenate need dense arrays; dense input is passed
        # through unchanged instead of assuming sparse storage.
        return matrix.toarray() if sparse.issparse(matrix) else matrix

    train_path = "../data/train_pbmc.h5ad"
    data = sc.read(train_path)
    ctrl_key = "control"
    stim_key = "stimulated"

    cell_types = data.obs["cell_type"].unique().tolist()
    print(cell_types)

    # Stays an empty AnnData (and is written as such) if there are no cell
    # types to iterate over.
    all_data = anndata.AnnData()
    for idx, cell_type in enumerate(cell_types):
        pca = PCA(n_components=100)
        held_out = (data.obs["condition"] == stim_key) & (data.obs["cell_type"] == cell_type)
        pca.fit(to_dense(data[~held_out].X))
        print(cell_type, end="\t")

        # Stimulated training cells exclude the held-out cell type.
        train_real_stimulated = data[data.obs["condition"] == stim_key, :]
        train_real_stimulated = train_real_stimulated[
            train_real_stimulated.obs["cell_type"] != cell_type
        ]
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)
        train_real_stimulated_pca = pca.transform(to_dense(train_real_stimulated.X))

        # Control training cells include every cell type.
        train_real_cd = data[data.obs["condition"] == ctrl_key, :]
        train_real_cd = scgen.util.balancer(train_real_cd)
        train_real_cd_pca = pca.transform(to_dense(train_real_cd.X))

        cell_type_adata = data[data.obs["cell_type"] == cell_type]
        cell_type_ctrl = cell_type_adata[cell_type_adata.obs["condition"] == ctrl_key]
        cell_type_stim = cell_type_adata[cell_type_adata.obs["condition"] == stim_key]
        ctrl_x = to_dense(cell_type_ctrl.X)
        stim_x = to_dense(cell_type_stim.X)

        predicted_cells = predict(
            pca, train_real_cd_pca, train_real_stimulated_pca, pca.transform(ctrl_x)
        )

        n_ctrl = cell_type_ctrl.shape[0]
        n_stim = cell_type_stim.shape[0]
        n_pred = len(predicted_cells)

        per_type = sc.AnnData(np.concatenate([ctrl_x, stim_x, predicted_cells]))
        per_type.obs["condition"] = (
            [f"{cell_type}_ctrl"] * n_ctrl
            + [f"{cell_type}_real_stim"] * n_stim
            + [f"{cell_type}_pred_stim"] * n_pred
        )
        per_type.obs["cell_type"] = [f"{cell_type}"] * (n_ctrl + n_stim + n_pred)
        per_type.var_names = cell_type_adata.var_names

        # Chained pairwise concatenation is kept on purpose: switching to a
        # single multi-way concatenate would change the generated labels.
        all_data = per_type if idx == 0 else all_data.concatenate(per_type)
        print(cell_type)

    sc.write("../data/reconstructed/PCAVecArithm/PCA_pbmc.h5ad", all_data)
def predict(self, adata, colnames=None, dimreduce=True, reconstruct=True, error=True):
    """Run the encoder, the full model and the ``mean_norm`` model on ``adata``.

    Args:
        adata: Object providing ``X``, ``obs.size_factors``, ``var_names`` and
            ``obs_names``.
        colnames: Column labels for the written matrices; defaults to
            ``adata.var_names.values``.
        dimreduce: Not read by this method.
        reconstruct: Not read by this method.
        error: Not read by this method.

    Returns:
        dict with keys ``'reduced'`` (``self.encoder`` output), ``'mean'``
        (``self.model`` output) and ``'mean_norm'``
        (``self.extra_models['mean_norm']`` output).

    When ``self.file_path`` is truthy the three results are also written as
    ``reduced.tsv``, ``mean.tsv`` and ``mean_norm.tsv`` inside that directory,
    and ``adata`` is written via ``sc.write('output', ...)`` after pointing
    ``sc.settings.writedir`` at the same directory.
    """
    if colnames is None:
        colnames = adata.var_names.values
    rownames = adata.obs_names.values

    def network_inputs():
        # Fresh dict per call, mirroring the two separate literals the
        # encoder and the model each receive.
        return {'count': adata.X, 'size_factors': adata.obs.size_factors}

    res = {}

    print('Calculating low dimensional representations...')
    res['reduced'] = self.encoder.predict(network_inputs())

    print('Calculating reconstructions...')
    res['mean'] = self.model.predict(network_inputs())
    res['mean_norm'] = self.extra_models['mean_norm'].predict(adata.X)

    # Nothing to persist without an output directory.
    if not self.file_path:
        return res

    print('Saving files...')
    out_dir = self.file_path
    os.makedirs(out_dir, exist_ok=True)

    write_text_matrix(
        res['reduced'],
        os.path.join(out_dir, 'reduced.tsv'),
        rownames=rownames,
        transpose=False,
    )
    write_text_matrix(
        res['mean'],
        os.path.join(out_dir, 'mean.tsv'),
        rownames=rownames,
        colnames=colnames,
        transpose=True,
    )

    # This changes scanpy's global write directory, not just this call.
    sc.settings.writedir = out_dir + '/'
    sc.write('output', adata)

    write_text_matrix(
        res['mean_norm'],
        os.path.join(out_dir, 'mean_norm.tsv'),
        rownames=rownames,
        colnames=colnames,
        transpose=True,
    )
    return res
def zheng():
    """Prepare the Zheng dataset.

    Massively parallel digital transcriptional profiling of single cells.
    by Zheng GX, et al. in Nature Communications. 2017.

    Reads the 10x matrix directory, attaches the bulk labels as
    ``obs["bulk_labels"]``, runs ``pr.read.process_clusts`` on that column,
    writes ``data/zheng/fresh_68k_bulk_labels.h5ad`` and saves random folds
    to ``output/zheng_folds.npz``.
    """
    adata = sc.read_10x_mtx("data/zheng/filtered_matrices_mex/hg19/")

    # The label file has no header row; its values are assigned to obs as-is.
    # (The file name's spelling is reproduced exactly from the original call.)
    labels = pd.read_csv("data/zheng/zheng17_bulk_lables.txt", header=None)
    adata.obs["bulk_labels"] = labels.values

    pr.read.process_clusts(adata, "bulk_labels")
    sc.write("data/zheng/fresh_68k_bulk_labels.h5ad", adata)

    fold_tester = pr.performance.FoldTester(adata)
    fold_tester.makefolds(random=True)
    fold_tester.savefolds("output/zheng_folds.npz")
def train(data_name="pbmc", cell_type="CD4T", p_type="unbiased"):
    """Predict stimulated cells for one cell type with gene-space arithmetic.

    Loads ``../data/train_{data_name}.h5ad``, takes all control cells and the
    stimulated cells of every cell type except ``cell_type`` as training
    groups, and passes their dense expression matrices together with the
    held-out type's control cells to ``predict`` (defined elsewhere in this
    module).  Control, real stimulated and predicted cells are stacked into
    one AnnData and written to disk.

    Args:
        data_name: One of ``"pbmc"``, ``"hpoly"`` or ``"salmonella"``; selects
            the input file and the condition / cell-type column names.
        cell_type: Cell type whose stimulated cells are held out.
        p_type: ``"unbiased"`` runs ``scgen.util.balancer`` on both training
            groups and writes ``VecArithm_CD4T.h5ad``; any other value skips
            the balancing and writes ``VecArithm_CD4T_biased.h5ad``.

    Raises:
        ValueError: If ``data_name`` is not one of the supported datasets.
    """
    import scipy.sparse as sparse

    def to_dense(matrix):
        # Sparse and dense inputs are both accepted; each matrix is checked
        # on its own rather than inferring one's storage from another's.
        return matrix.toarray() if sparse.issparse(matrix) else matrix

    # (control label, stimulated label, cell-type column) per dataset.
    dataset_keys = {
        "pbmc": ("control", "stimulated", "cell_type"),
        "hpoly": ("Control", "Hpoly.Day10", "cell_label"),
        "salmonella": ("Control", "Salmonella", "cell_label"),
    }
    try:
        ctrl_key, stim_key, cell_type_key = dataset_keys[data_name]
    except KeyError:
        raise ValueError(
            f"unknown data_name {data_name!r}; expected one of {sorted(dataset_keys)}"
        ) from None

    train_path = f"../data/train_{data_name}.h5ad"
    data = sc.read(train_path)
    print("data has been loaded!")

    is_target_type = data.obs[cell_type_key] == cell_type
    ctrl_cell = data[(data.obs["condition"] == ctrl_key) & is_target_type]
    stim_cell = data[(data.obs["condition"] == stim_key) & is_target_type]

    # Use the dataset-specific labels and the requested cell type: the
    # literals "control" / "stimulated" / "cell_type" / "CD4T" only match the
    # default pbmc + CD4T call and would otherwise leak the held-out
    # stimulated cells into the training group.
    # NOTE(review): balancer is called without key arguments — confirm it
    # handles the "cell_label" column used by hpoly / salmonella.
    train_real_cd = data[data.obs["condition"] == ctrl_key, :]
    if p_type == "unbiased":
        train_real_cd = scgen.util.balancer(train_real_cd)
    train_real_stimulated = data[data.obs["condition"] == stim_key, :]
    train_real_stimulated = train_real_stimulated[
        train_real_stimulated.obs[cell_type_key] != cell_type
    ]
    if p_type == "unbiased":
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)

    train_real_cd = to_dense(train_real_cd.X)
    train_real_stimulated = to_dense(train_real_stimulated.X)
    ctrl_x = to_dense(ctrl_cell.X)
    stim_x = to_dense(stim_cell.X)

    predicted_cells = predict(train_real_cd, train_real_stimulated, ctrl_x)
    print("Prediction has been finished")

    all_data = sc.AnnData(np.concatenate([ctrl_x, stim_x, predicted_cells]))
    all_data.obs["condition"] = (
        ["ctrl"] * ctrl_cell.shape[0]
        + ["real_stim"] * stim_cell.shape[0]
        + ["pred_stim"] * len(predicted_cells)
    )
    all_data.var_names = ctrl_cell.var_names

    # NOTE(review): the output name always says "CD4T" regardless of
    # cell_type / data_name; kept as-is so existing readers of these files
    # keep working.
    if p_type == "unbiased":
        sc.write("../data/reconstructed/VecArithm/VecArithm_CD4T.h5ad", all_data)
    else:
        sc.write("../data/reconstructed/VecArithm/VecArithm_CD4T_biased.h5ad", all_data)
def write(adata, version, name):
    """Write ``adata`` to the path ``version + name`` and echo an assignment.

    After writing, prints a line of the form ``<identifier> = '<path>'``
    where ``<identifier>`` is the path with every ``.`` replaced by ``_``.
    """
    target = version + name
    sc.write(target, adata)

    identifier = target.replace(".", "_")
    print(f"{identifier} = '{target}'")
# Script: load an .h5ad file, make its variable names unique, densify X if it
# is sparse, keep the first 100 observations and write them back out.
import scanpy.api as sc
import scipy.sparse as sp_sparse

# Alternative input kept from the original; presumably the full matrix this
# test file was first cut from — confirm before re-enabling.
# andata = sc.read_h5ad("./ExprMatrix.h5ad")
# NOTE: the file read here is the same path that is written at the end, so
# running the script overwrites its own input.
andata = sc.read_h5ad("./100_test_data.h5ad")
print("Finished reading.")
andata.var_names_make_unique()
# Replace a sparse X with its dense array form; dense X is left untouched.
if sp_sparse.issparse(andata.X):
    andata.X = andata.X.toarray()
# Keep only the first 100 rows (all columns).
partial_data = andata[:100, :]
print("Finished processing")
sc.write("100_test_data.h5ad", partial_data)
print("Finished writing.")