def rand_index(adata, title):
    """Measure Leiden clustering stability under subsampling.

    For each resolution in a fixed list, the full data set is clustered with
    ``pg.leiden`` on the ``"pca_harmony"`` representation. Then, 20 times, a
    random 90% subset of the cells (drawn without replacement) is re-clustered
    at the same resolution, and the adjusted Rand score between the labels the
    subset cells had in the full clustering and their new labels is recorded.

    The scores of each resolution are pickled as ``{str(res): [scores]}`` to
    ``RandInd_dictionaries/Dict_<title>_<res>.pckl``; the directory is created
    if it does not exist.

    Args:
        adata: Data object accepted by ``pg.neighbors`` / ``pg.leiden``. It is
            copied first, so the caller's object is not re-clustered.
        title: Label embedded in the output file names.

    Returns:
        None. Results are written to disk only.
    """
    out_dir = 'RandInd_dictionaries'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_dir, exist_ok=True)

    resamp_perc = 0.9
    n_resamples = 20

    adata = adata.copy()
    indx_array = adata.obs.index.values
    n_cells = range(adata.shape[0])
    resamp_size = round(adata.shape[0] * resamp_perc)

    for res in [0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1.1]:
        print(res)
        pg.neighbors(adata, rep="pca_harmony")
        pg.leiden(adata, rep="pca_harmony", resolution=res)

        rand_list = []
        for _ in range(n_resamples):
            samp_indx = indx_array[random.sample(n_cells, resamp_size)]
            samp_data = adata[samp_indx]
            # Snapshot the full-data labels: the pg.leiden call below writes
            # its result to the same "leiden_labels" column of the subset.
            true_class = samp_data.obs["leiden_labels"].copy()
            pg.neighbors(samp_data, rep="pca_harmony")
            pg.leiden(samp_data, rep="pca_harmony", resolution=res)
            new_class = samp_data.obs["leiden_labels"]
            rand_list.append(adjusted_rand_score(true_class, new_class))

        rand_indx_dict = {str(res): rand_list}
        file_name = os.path.join(
            out_dir, "Dict_" + title + "_" + str(res) + ".pckl")
        # The context manager closes the file even if pickling fails.
        with open(file_name, "wb") as filehandler:
            pickle.dump(rand_indx_dict, filehandler)
def test_cell_lines():
    """Run ``harmonize`` in CPU and GPU mode on the cell-lines data set.

    Embeddings are cached as ``./result/cell_lines_{cpu,gpu}_z.npy`` and are
    reused when present; otherwise they are computed, timed and saved. Both
    embeddings are compared against the one stored in
    ``./result/cell_lines_harmony_z.txt`` via ``check_metrics``. UMAP plots are
    (re)generated through ``plot_umap`` when fewer than four matching PDFs
    exist in ``./plots``.
    """
    print("Testing on Cell Lines...")

    cpu_path = "./result/cell_lines_cpu_z.npy"
    gpu_path = "./result/cell_lines_gpu_z.npy"
    result_path = "./result/cell_lines_result.h5ad"

    # The source data is needed exactly when one of the cached artifacts
    # derived from it is missing. Checking those files directly (rather than
    # counting files that merely match a pattern) guarantees X and df_metadata
    # are defined on every path below that uses them.
    cached = (cpu_path, gpu_path, result_path)
    if not all(os.path.exists(path) for path in cached):
        X = np.loadtxt("./data/cell_lines/pca.txt")
        df_metadata = pd.read_csv("./data/cell_lines/metadata.csv")

    if os.path.exists(cpu_path):
        Z_cpu = np.load(cpu_path)
        print("Precalculated CPU mode result is loaded.")
    else:
        start_cpu = time.time()
        Z_cpu = harmonize(X, df_metadata, 'dataset')
        end_cpu = time.time()
        print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu))
        np.save(cpu_path, Z_cpu)

    if os.path.exists(gpu_path):
        Z_gpu = np.load(gpu_path)
        print("Precalculated GPU mode result is loaded.")
    else:
        start_gpu = time.time()
        Z_gpu = harmonize(X, df_metadata, 'dataset', use_gpu=True)
        end_gpu = time.time()
        print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu))
        np.save(gpu_path, Z_gpu)

    Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")

    check_metrics(Z_cpu, Z_R, prefix="cell_lines_cpu")
    check_metrics(Z_gpu, Z_R, prefix="cell_lines_gpu")

    if os.path.exists(result_path):
        # plot_umap treats None as "use the precalculated result file".
        adata = None
    else:
        n_obs = X.shape[0]
        # Placeholder expression matrix; only the embeddings in obsm are used.
        adata = AnnData(X=csr_matrix((n_obs, 2)), obs=df_metadata)
        adata.obsm['X_pca'] = X
        pg.neighbors(adata, rep='pca')
        pg.umap(adata)

    # re.match needs the candidate file name as its second argument.
    umap_list = [f for f in os.listdir("./plots")
                 if re.match("cell_lines.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_cpu, Z_gpu, Z_R,
                  prefix="cell_lines", batch_key='dataset')
def subcluster_1_preprocess(data, hvg_no, PCs_no, tit, in_place=False):
    """Re-run the preprocessing/clustering pipeline and plot the Leiden UMAP.

    Steps, in order: QC metrics, highly-variable-feature selection, PCA,
    Harmony (with ``obs['Channel']`` set from ``obs['sample']``), neighbors,
    UMAP and Leiden (resolution 0.5) on ``"pca_harmony"``, then a scanpy UMAP
    plot coloured by ``"leiden_labels"``.

    Args:
        data: Input data object. Its ``uns`` is cleared in every case, even
            when ``in_place`` is False, because the clearing happens before
            the copy is taken.
        hvg_no: Number of highly variable features to select (``n_top``).
        PCs_no: Number of principal components (``n_components``).
        tit: Plot title prefix; the HVG/PC counts are appended to it.
        in_place: If True, process ``data`` itself; otherwise a copy.

    Returns:
        None. With ``in_place=False`` the processed copy is only plotted.
    """
    # Dropped first to eliminate problems with the size of uns
    # (specifically "fmat_highly_variable_features").
    data.uns.clear()

    adata = data if in_place else data.copy()

    # Preprocessing
    pg.qc_metrics(adata, min_genes=200, min_umis=400)
    pg.highly_variable_features(adata, consider_batch=False, n_top=hvg_no)
    pg.pca(adata, n_components=PCs_no)

    # Batch correction; the 'sample' column is exposed as 'Channel' first.
    adata.obs['Channel'] = adata.obs['sample']
    pg.run_harmony(adata)

    corrected_rep = "pca_harmony"
    pg.neighbors(adata, rep=corrected_rep)
    pg.umap(adata, rep=corrected_rep)
    pg.leiden(adata, rep=corrected_rep, resolution=0.5)

    title_suffix = " (#HVG: " + str(hvg_no) + ", #PCs: " + str(PCs_no) + ")"
    sc.pl.umap(
        adata,
        size=15,
        title=tit + title_suffix,
        color="leiden_labels",
    )
def plot_umap(adata, Z_torch, Z_py, Z_R, prefix, batch_key):
    """Compute UMAPs for three embeddings and render scatter plots via the CLI.

    When ``adata`` is given, the embeddings are stored in ``adata.obsm`` as
    ``X_torch``, ``X_py`` and ``X_harmony``; neighbors and a UMAP
    (``umap_<rep>``) are computed for each, and the object is written to
    ``./result/<prefix>_result``. When ``adata`` is None that step is skipped
    and the result file is expected to exist already.

    Afterwards ``pegasus plot scatter`` is invoked once per basis (``umap``,
    ``umap_torch``, ``umap_py``, ``umap_harmony``), writing PDFs into
    ``./plots``. The process exits with status 1 on the first failing command.

    Args:
        adata: Data object to annotate, or None to reuse the saved result.
        Z_torch, Z_py, Z_R: Embeddings stored under the obsm keys above.
        prefix: File-name prefix for the result file and the plots.
        batch_key: Attribute passed to ``--attributes`` for colouring.
    """
    if adata is None:
        print("Use precalculated AnnData result.")
    else:
        embeddings = (('torch', Z_torch), ('py', Z_py), ('harmony', Z_R))

        # Register every embedding before any neighbor graph is built.
        for rep, matrix in embeddings:
            adata.obsm['X_' + rep] = matrix

        for rep, _ in embeddings:
            pg.neighbors(adata, rep=rep)
            pg.umap(adata, rep=rep, out_basis='umap_' + rep)

        pg.write_output(adata, "./result/{}_result".format(prefix))

    command = (
        "pegasus plot scatter --basis {basis} --attributes {attr} --alpha 0.5 "
        "./result/{name}_result.h5ad ./plots/{name}.{tag}.umap.pdf"
    )
    plots = (
        ('umap', 'before'),
        ('umap_torch', 'torch'),
        ('umap_py', 'py'),
        ('umap_harmony', 'harmony'),
    )
    for basis, tag in plots:
        status = os.system(
            command.format(basis=basis, attr=batch_key, name=prefix, tag=tag))
        if status:
            sys.exit(1)
def plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix, batch_key):
    """Compute UMAPs for the CPU, GPU and reference embeddings and plot them.

    When ``adata`` is not None, the embeddings are stored in ``adata.obsm``
    (``X_cpu``, ``X_gpu``, ``X_harmony``), neighbors and a UMAP per embedding
    are computed, and the object is written to ``./result/<prefix>_result``.
    Otherwise a message is printed and the saved result file is used as is.

    Four ``pegasus plot scatter`` commands are then run in sequence, one per
    basis (``umap``, ``umap_cpu``, ``umap_gpu``, ``umap_harmony``), producing
    PDFs in ``./plots``. A non-zero exit status from any of them terminates
    the process with status 1.

    Args:
        adata: Data object to annotate, or None to reuse the saved result.
        Z_cpu, Z_gpu, Z_R: Embeddings stored under the obsm keys above.
        prefix: File-name prefix for the result file and the plots.
        batch_key: Attribute passed to ``--attributes`` for colouring.
    """
    if adata is not None:
        adata.obsm.update({
            'X_cpu': Z_cpu,
            'X_gpu': Z_gpu,
            'X_harmony': Z_R,
        }) if False else None  # placeholder branch never taken; see below
        # Explicit assignments keep the original insertion order and semantics.
        adata.obsm['X_cpu'] = Z_cpu
        adata.obsm['X_gpu'] = Z_gpu
        adata.obsm['X_harmony'] = Z_R

        for rep_name in ('cpu', 'gpu', 'harmony'):
            pg.neighbors(adata, rep=rep_name)
            pg.umap(adata, rep=rep_name, out_basis='umap_' + rep_name)

        pg.write_output(adata, "./result/{}_result".format(prefix))
    else:
        print("Use precalculated AnnData result.")

    scatter_commands = [
        "pegasus plot scatter --basis umap --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.before.umap.pdf",
        "pegasus plot scatter --basis umap_cpu --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.cpu.umap.pdf",
        "pegasus plot scatter --basis umap_gpu --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.gpu.umap.pdf",
        "pegasus plot scatter --basis umap_harmony --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.harmony.umap.pdf",
    ]
    for template in scatter_commands:
        failed = os.system(template.format(attr=batch_key, prefix=prefix))
        if failed:
            sys.exit(1)
def test_cell_lines():
    """Compare harmony-pytorch and harmonypy on the cell-lines data set.

    Embeddings are cached as ``./result/cell_lines_{torch,py}_z.npy`` and
    reused when present; otherwise they are computed, timed and saved. Both
    are compared with the embedding stored in
    ``./result/cell_lines_harmony_z.txt`` through ``check_metric`` (norms
    ``'r'`` and ``'L2'``). UMAP plots are (re)generated through ``plot_umap``
    when fewer than four matching PDFs exist in ``./plots``. Finally, if the
    result ``.h5ad`` file exists, kBET is reported for each embedding.
    """
    print("Testing on cell lines dataset...")

    torch_path = "./result/cell_lines_torch_z.npy"
    py_path = "./result/cell_lines_py_z.npy"
    result_path = "./result/cell_lines_result.h5ad"

    # Load the source data exactly when an artifact derived from it is
    # missing. Counting pattern-matching files instead could skip the load
    # while an embedding still had to be computed, leaving X undefined.
    cached = (torch_path, py_path, result_path)
    if not all(os.path.exists(path) for path in cached):
        X = np.loadtxt("./data/cell_lines/pca.txt")
        df_metadata = pd.read_csv("./data/cell_lines/metadata.csv")

    if os.path.exists(torch_path):
        Z_torch = np.load(torch_path)
        print("Precalculated embedding by harmony-pytorch is loaded.")
    else:
        start_torch = time.time()
        Z_torch = harmonize(X, df_metadata, batch_key='dataset')
        end_torch = time.time()
        print("Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch))
        np.save(torch_path, Z_torch)

    if os.path.exists(py_path):
        Z_py = np.load(py_path)
        print("Precalculated embedding by harmonypy is loaded.")
    else:
        start_py = time.time()
        ho = run_harmony(X, df_metadata, ['dataset'])
        end_py = time.time()
        print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py))
        print(ho.objective_harmony)
        # Transposed so it is stored in the same orientation as Z_torch.
        Z_py = np.transpose(ho.Z_corr)
        np.save(py_path, Z_py)

    Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")

    check_metric(Z_torch, Z_py, Z_R, prefix="cell_lines", norm='r')
    check_metric(Z_torch, Z_py, Z_R, prefix="cell_lines", norm='L2')

    if os.path.exists(result_path):
        # plot_umap treats None as "use the precalculated result file".
        adata = None
    else:
        n_obs = X.shape[0]
        # Placeholder expression matrix; only the embeddings in obsm are used.
        adata = AnnData(X=csr_matrix((n_obs, 2)), obs=df_metadata)
        adata.obsm['X_pca'] = X
        pg.neighbors(adata, rep='pca')
        pg.umap(adata)

    umap_list = [f for f in os.listdir("./plots")
                 if re.match("cell_lines.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R,
                  prefix="cell_lines", batch_key="dataset")

    if os.path.exists(result_path):
        adata = pg.read_input(result_path, h5ad_mode='r')

        reports = (
            ("Harmony", 'harmony'),
            ("harmonypy", 'py'),
            ("harmony-pytorch", 'torch'),
        )
        for label, rep in reports:
            stat, pvalue, ac_rate = pg.calc_kBET(adata, attr='dataset', rep=rep)
            print("kBET for {label}: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format(
                label=label, stat=stat, pval=pvalue, ac_rate=ac_rate))
def _expression_frame(unidata):
    """Return the selected modality's ``X`` as a sparse DataFrame.

    Rows are labelled with ``obs.index`` and columns with ``var.index``.
    """
    return pd.DataFrame.sparse.from_spmatrix(
        unidata.X,
        index=unidata.obs.index.tolist(),
        columns=unidata.var.index.tolist(),
    )


def main():
    """Aggregate, QC, normalise and cluster a data set with and without Harmony.

    Driven by the arguments returned from ``my_args()``; ``args.output`` is
    used as the prefix of every file written. The pipeline:

    1. ``pegasus aggregate_matrix`` builds ``<output>.zarr.zip``.
    2. QC metrics, filter statistics (CSV) and QC violin plots (PDF).
    3. Filtering, robust-gene identification, log-normalisation; the
       normalised RNA matrix (and the CITE-seq matrix when ``args.citeseq``)
       is pickled.
    4. Baseline clustering without batch correction (HVF plot, PCA, kNN,
       Louvain, UMAP, scatter plot).
    5. The same with batch-aware HVF selection and Harmony; the result is
       written to ``<output>_harmony.zarr`` and exported as CSV.
    6. UMAP coordinates and Louvain labels are plotted with scanpy and
       exported as CSV.

    Raises:
        subprocess.CalledProcessError: If ``pegasus aggregate_matrix`` exits
            with a non-zero status.
    """
    import subprocess

    import scanpy as sc
    from anndata import AnnData

    args = my_args()
    out = args.output

    # Fail fast if aggregation fails instead of continuing on to read a
    # missing or stale zarr file; the argument list also avoids shell quoting
    # problems with paths.
    subprocess.run(
        ["pegasus", "aggregate_matrix", str(args.input_csv), str(out)],
        check=True,
    )

    data = pg.read_input("%s.zarr.zip" % out)
    if args.citeseq:
        data.select_data("%s-rna" % data.uns['genome'])

    pg.qc_metrics(data, percent_mito=args.MT_percent,
                  mito_prefix=args.MT_prefix, max_genes=args.max_genes)
    df_qc = pg.get_filter_stats(data)
    df_qc.to_csv("%s_qc_get_filter_stats.csv" % out)

    violins = (('gene', 'gene'), ('count', 'UMI_count'), ('mito', 'UMI_mito'))
    for plot_type, tag in violins:
        pg.qcviolin(data, plot_type=plot_type)
        plt.savefig("%s_qcviolin_%s.pdf" % (out, tag), bbox_inches='tight')

    # filtering
    pg.filter_data(data)
    pg.identify_robust_genes(data, percent_cells=0.05)
    pg.log_norm(data)
    print(data.obs['Channel'].value_counts())

    # save log norm data, rna
    _expression_frame(data).to_pickle("%s.rna.log_norm.pkl" % out)
    if args.citeseq:
        data.select_data("%s-citeseq" % data.uns['genome'])
        _expression_frame(data).to_pickle("%s.antibody.log_norm.pkl" % out)
        data.select_data("%s-rna" % data.uns['genome'])

    # without batch correction
    data_baseline = data.copy()
    pg.highly_variable_features(data_baseline, consider_batch=False, n_top=4000)
    pg.hvfplot(data_baseline)
    plt.savefig("%s_hvfplot_noBC.pdf" % out, bbox_inches='tight')
    pg.pca(data_baseline, n_components=200)
    pg.neighbors(data_baseline, K=200)
    pg.louvain(data_baseline, resolution=2)
    pg.umap(data_baseline, n_neighbors=10, min_dist=0.4)
    pg.scatter(data_baseline, attrs=['louvain_labels', 'Channel'], basis='umap')
    plt.savefig("%s_without_BC.pdf" % out, bbox_inches='tight')

    # with batch correction
    pg.highly_variable_features(data, consider_batch=True, n_top=4000)
    pg.hvfplot(data)
    # Saved under its own name: reusing "_hvfplot_noBC.pdf" here would
    # overwrite the baseline plot written above.
    plt.savefig("%s_hvfplot_BC.pdf" % out, bbox_inches='tight')
    data_harmony = data.copy()
    pg.pca(data_harmony, n_components=200)
    harmony_key = pg.run_harmony(data_harmony)
    pg.neighbors(data_harmony, rep=harmony_key, K=200)
    pg.louvain(data_harmony, rep=harmony_key, resolution=2)
    pg.umap(data_harmony, rep=harmony_key, n_neighbors=10, min_dist=0.4)
    pg.scatter(data_harmony, attrs=['louvain_labels', 'Channel'], basis='umap')
    plt.savefig("%s_Harmony_BC.pdf" % out, bbox_inches='tight')
    pg.write_output(data_harmony, "%s_harmony.zarr" % out)

    # Export the expression matrices side by side. The CITE-seq modality is
    # only touched when requested, matching the guarded export above.
    frames = [_expression_frame(data_harmony)]
    if args.citeseq:
        genome = data_harmony.uns['genome']
        data_harmony.select_data("%s-citeseq" % genome)
        frames.append(_expression_frame(data_harmony))
        # Switch back so obs/obsm below refer to the modality that was
        # clustered (louvain_labels and X_umap were computed on RNA).
        data_harmony.select_data("%s-rna" % genome)
    df_all = pd.concat(frames, axis=1)
    df_all = df_all.sparse.to_dense()
    df_all = df_all.round(3)
    df_all.to_csv("%s.Harmony_correction.data.csv" % out)

    # original harmony UMAP data
    # Kept in its own variable so `out` stays the output prefix throughout.
    umap_df = data_harmony.obs.copy()
    umap_df['UMAP1'] = data_harmony.obsm['X_umap'][:, 0]
    umap_df['UMAP2'] = data_harmony.obsm['X_umap'][:, 1]

    ann = AnnData(X=umap_df[['UMAP1', 'UMAP2']],
                  obs=umap_df[['Channel', 'louvain_labels']])
    sc.pl.scatter(ann, x="UMAP1", y="UMAP2", color='louvain_labels',
                  legend_loc='on data', legend_fontsize=12,
                  legend_fontoutline=2, frameon=False,
                  title='clustering of cells')
    plt.savefig("%s_Scapy_UMAP.png" % out, bbox_inches='tight')
    umap_df.to_csv("%s_Harmony_UMAP.csv" % out)
q = q / q.sum() adata_subsamp += [ adata_[np.random.choice(adata_.shape[0], size=args.numcells, p=q), :], ] # create perturbed anndata adata_s = adata_subsamp[0].concatenate(adata_subsamp[1:]) days = adata_s.obs.day.unique() days_tot = adata_s.obs.day.unique().shape[0] # sampled perturbed proportions props_subsamp = [ adata_s[adata_s.obs.day == i, :].obs.iloc[:, 3:].sum(0) for i in t_map ] props_subsamp = [p / p.sum() for p in props_subsamp] # compute PCA after all subsampling is done pg.pca(adata_s, n_components=args.pcadim, features=None) pg.neighbors(adata_s) pg.diffmap(adata_s) adata_s.obsm['X_fle'] = np.array(adata_s.obsm['X_fle']) # compute cost scale c_means = np.array([ gwot.anndata_utils.get_C_mean(adata_s, t_map[i], t_next=t_map[i + 1], mode="tr") for i in range(0, len(t_map[:-1])) ]) c_means_self = np.array( [gwot.anndata_utils.get_C_mean(adata_s, t, mode="self") for t in t_map]) dt = np.array([t_map[i + 1] - t_map[i] for i in range(0, len(t_map) - 1)])