def test_read_write_h5ad(self):
    """Round-trip the hgmm_1k_v3 test input through ``test.h5ad``.

    The input is read, written to ``test.h5ad``, read back from that file,
    and the two objects are compared with ``assert_adata_equal``.
    """
    source_path = "tests/pegasus-test-data/input/hgmm_1k_v3_filtered_feature_bc_matrix/"
    h5ad_path = "test.h5ad"

    original = pg.read_input(source_path)
    pg.write_output(original, h5ad_path)
    reloaded = pg.read_input(h5ad_path)

    assert_adata_equal(self, original, reloaded)
def plot_umap(adata, Z_torch, Z_py, Z_R, prefix, batch_key):
    """Compute UMAPs for three embeddings and render one scatter plot per basis.

    When ``adata`` is given, the embeddings are stored in ``adata.obsm`` under
    ``X_torch``, ``X_py`` and ``X_harmony``; neighbors and a UMAP are computed
    for each representation, and the object is written with
    ``pg.write_output`` to ``./result/<prefix>_result``. When ``adata`` is
    ``None`` that step is skipped and a message is printed instead.

    Afterwards ``pegasus plot scatter`` is run through the shell for the
    ``umap``, ``umap_torch``, ``umap_py`` and ``umap_harmony`` bases, with
    ``batch_key`` passed as ``--attributes``. The process exits with status 1
    as soon as one command returns a non-zero status.
    """
    if adata is None:
        print("Use precalculated AnnData result.")
    else:
        # Register all embeddings first, then process them one by one.
        adata.obsm['X_torch'] = Z_torch
        adata.obsm['X_py'] = Z_py
        adata.obsm['X_harmony'] = Z_R

        for rep, basis in (
            ('torch', 'umap_torch'),
            ('py', 'umap_py'),
            ('harmony', 'umap_harmony'),
        ):
            pg.neighbors(adata, rep=rep)
            pg.umap(adata, rep=rep, out_basis=basis)

        pg.write_output(adata, "./result/{}_result".format(prefix))

    scatter_commands = (
        "pegasus plot scatter --basis umap --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.before.umap.pdf",
        "pegasus plot scatter --basis umap_torch --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.torch.umap.pdf",
        "pegasus plot scatter --basis umap_py --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.py.umap.pdf",
        "pegasus plot scatter --basis umap_harmony --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.harmony.umap.pdf",
    )
    for template in scatter_commands:
        # Each command is formatted and run in turn; the first failure aborts.
        if os.system(template.format(name=prefix, attr=batch_key)):
            sys.exit(1)
def plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix, batch_key):
    """Compute UMAPs for the CPU, GPU and harmony embeddings and plot them.

    With an ``adata`` object, the embeddings are stored in ``adata.obsm`` as
    ``X_cpu``, ``X_gpu`` and ``X_harmony``; neighbors and a UMAP are computed
    per representation, and the result is written with ``pg.write_output`` to
    ``./result/<prefix>_result``. With ``adata`` set to ``None`` only a
    message is printed and the plotting step runs on its own.

    Plotting shells out to ``pegasus plot scatter`` for the ``umap``,
    ``umap_cpu``, ``umap_gpu`` and ``umap_harmony`` bases, passing
    ``batch_key`` as ``--attributes``. A non-zero status from any command
    terminates the process with exit status 1.
    """
    if adata is None:
        print("Use precalculated AnnData result.")
    else:
        # All three embeddings are attached before any graph is built.
        adata.obsm['X_cpu'] = Z_cpu
        adata.obsm['X_gpu'] = Z_gpu
        adata.obsm['X_harmony'] = Z_R

        rep_to_basis = [
            ('cpu', 'umap_cpu'),
            ('gpu', 'umap_gpu'),
            ('harmony', 'umap_harmony'),
        ]
        for rep_name, basis_name in rep_to_basis:
            pg.neighbors(adata, rep=rep_name)
            pg.umap(adata, rep=rep_name, out_basis=basis_name)

        pg.write_output(adata, "./result/{}_result".format(prefix))

    plot_templates = [
        "pegasus plot scatter --basis umap --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.before.umap.pdf",
        "pegasus plot scatter --basis umap_cpu --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.cpu.umap.pdf",
        "pegasus plot scatter --basis umap_gpu --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.gpu.umap.pdf",
        "pegasus plot scatter --basis umap_harmony --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.harmony.umap.pdf",
    ]
    for template in plot_templates:
        exit_status = os.system(template.format(attr=batch_key, prefix=prefix))
        if exit_status:
            # Stop at the first plotting command that fails.
            sys.exit(1)
def test_read_write_old_5ad_backed_whitelist(self):
    """Write back into an h5ad opened with ``h5ad_mode="r+"`` using a whitelist.

    A copy of the ``test_obsm_compound.h5ad`` fixture is placed in the
    working directory, opened with ``h5ad_mode="r+"``, written back to the
    same path with ``whitelist=["obs"]``, and then re-read and compared with
    the opened object.
    """
    working_copy = "test_obsm_compound.h5ad"
    shutil.copy(
        "tests/pegasus-test-data/input/test_obsm_compound.h5ad",
        working_copy,
    )

    backed = pg.read_input(working_copy, h5ad_mode="r+")
    pg.write_output(backed, working_copy, whitelist=["obs"])
    reloaded = pg.read_input(working_copy)

    assert_adata_equal(self, backed, reloaded)
def write_dataset(ds, path, output_format='txt'):
    """Write ``ds`` to ``path`` in the requested format.

    ``path`` is converted to ``str`` and gets ``'.' + output_format``
    appended unless it already ends with that suffix (the check is done on
    the lower-cased path).

    For ``output_format == 'txt'`` the matrix ``ds.X`` (densified when it is
    a SciPy sparse matrix) is written as a tab-separated table with
    ``ds.obs.index`` as rows, ``ds.var.index`` as columns and ``id`` as the
    index label. Any other format is delegated to ``pg.write_output``.
    """
    suffix = '.' + output_format
    target = str(path)
    if not target.lower().endswith(suffix):
        target = target + suffix

    if output_format != 'txt':
        pg.write_output(ds, target)
        return

    matrix = ds.X
    if scipy.sparse.isspmatrix(matrix):
        matrix = matrix.toarray()

    table = pd.DataFrame(matrix, index=ds.obs.index, columns=ds.var.index)
    table.to_csv(target, index_label='id', sep='\t', doublequote=False)
def test_write_mtx(self):
    """Round-trip the heart_1k_v3 input through an mtx output directory.

    A ``test`` column is added to both ``var`` and ``obs`` before writing to
    ``test_mtx/mm10/matrix.mtx.gz``. After reading the directory back, the
    columns stored in ``obs.csv.gz`` and ``var.csv.gz`` are joined onto the
    reloaded object, and the result is compared with the original, with
    ``Channel`` blacklisted for ``obs``.
    """
    adata = pg.read_input(
        "tests/pegasus-test-data/input/heart_1k_v3/filtered_feature_bc_matrix.h5"
    )
    adata.var['test'] = 1.0
    adata.obs['test'] = 1.0

    mtx_dir = 'test_mtx/mm10'
    pg.write_output(adata, os.path.join(mtx_dir, 'matrix.mtx.gz'))

    reloaded = pg.read_input(mtx_dir)

    # get channel from csv
    del reloaded.obs['Channel']
    obs_columns = pd.read_csv(os.path.join(mtx_dir, 'obs.csv.gz'), index_col=0)
    reloaded.obs = reloaded.obs.join(obs_columns)

    var_columns = pd.read_csv(os.path.join(mtx_dir, 'var.csv.gz'), index_col=0)
    reloaded.var = reloaded.var.join(var_columns)
    del reloaded.var['featuretype']

    assert_adata_equal(self, adata, reloaded, obs_blacklist=['Channel'])
if __name__ == "__main__":
    # Script entry point: replace obsm["X_pca"] of an h5ad file with the
    # values read from a harmony CSV and write the result to a new file.
    import pandas as pd
    import pegasus as pg
    import argparse

    cli = argparse.ArgumentParser(
        description='Update the X_pca with the results of harmony')
    for positional in ('h5ad_filename', 'harmony_csv', 'output'):
        cli.add_argument(positional, type=str)
    options = vars(cli.parse_args())

    harmony_table = pd.read_csv(options["harmony_csv"])
    # Transpose the table and drop the first row of the transpose (the CSV's
    # first column); per the original note this removes the id of the PC.
    corrected = harmony_table.values.T[1:]

    adata = pg.read_input(options["h5ad_filename"])
    adata.obsm["X_pca"] = corrected
    pg.write_output(adata, options["output"])
def main():
    """Aggregate, QC, cluster and Harmony-correct one dataset.

    All options come from ``my_args()``; ``args.output`` is used as the
    prefix of every file written. The steps are:

    1. Run ``pegasus aggregate_matrix`` and load ``<prefix>.zarr.zip``.
    2. Compute QC metrics, save the filter statistics and three QC violin
       plots, then filter, identify robust genes and log-normalise.
    3. Pickle the log-normalised matrix (and, with ``args.citeseq``, the
       matrix of the ``<genome>-citeseq`` selection as well).
    4. Baseline without batch correction: HVF selection, PCA, neighbors,
       Louvain, UMAP, and the corresponding plots.
    5. Batch-aware HVF selection followed by Harmony, clustering, UMAP,
       plots, a zarr output and a rounded CSV of the expression data.
    6. Export the Harmony UMAP coordinates as CSV and as a scanpy scatter
       plot.

    Raises:
        RuntimeError: if the ``pegasus aggregate_matrix`` command returns a
            non-zero status.
    """
    args = my_args()
    out = args.output

    command = "pegasus aggregate_matrix %s %s" % (args.input_csv, out)
    # Fail loudly: continuing after a failed aggregation could silently
    # analyse a stale <out>.zarr.zip left over from an earlier run.
    if os.system(command) != 0:
        raise RuntimeError("Command failed: %s" % command)

    zarr_file = "%s.zarr.zip" % (out)
    data = pg.read_input(zarr_file)
    if args.citeseq:
        data.select_data("%s-rna" % (data.uns['genome']))

    # Quality control
    pg.qc_metrics(data,
                  percent_mito=args.MT_percent,
                  mito_prefix=args.MT_prefix,
                  max_genes=args.max_genes)
    df_qc = pg.get_filter_stats(data)
    df_qc.to_csv("%s_qc_get_filter_stats.csv" % (out))
    pg.qcviolin(data, plot_type='gene')
    plt.savefig("%s_qcviolin_gene.pdf" % (out), bbox_inches='tight')
    pg.qcviolin(data, plot_type='count')
    plt.savefig("%s_qcviolin_UMI_count.pdf" % (out), bbox_inches='tight')
    pg.qcviolin(data, plot_type='mito')
    plt.savefig("%s_qcviolin_UMI_mito.pdf" % (out), bbox_inches='tight')

    # filtering
    pg.filter_data(data)
    pg.identify_robust_genes(data, percent_cells=0.05)
    pg.log_norm(data)
    print(data.obs['Channel'].value_counts())

    # save log norm data, rna
    _labelled_sparse_frame(data).to_pickle("%s.rna.log_norm.pkl" % (out))
    if args.citeseq:
        data.select_data("%s-citeseq" % (data.uns['genome']))
        _labelled_sparse_frame(data).to_pickle(
            "%s.antibody.log_norm.pkl" % (out))
        # Switch back so the analysis below runs on the rna selection.
        data.select_data("%s-rna" % (data.uns['genome']))

    # without batch correction
    data_baseline = data.copy()
    pg.highly_variable_features(data_baseline, consider_batch=False,
                                n_top=4000)
    pg.hvfplot(data_baseline)
    plt.savefig("%s_hvfplot_noBC.pdf" % (out), bbox_inches='tight')
    pg.pca(data_baseline, n_components=200)
    pg.neighbors(data_baseline, K=200)
    pg.louvain(data_baseline, resolution=2)
    pg.umap(data_baseline, n_neighbors=10, min_dist=0.4)
    pg.scatter(data_baseline, attrs=['louvain_labels', 'Channel'],
               basis='umap')
    plt.savefig("%s_without_BC.pdf" % (out), bbox_inches='tight')

    # with batch correction
    pg.highly_variable_features(data, consider_batch=True, n_top=4000)
    pg.hvfplot(data)
    # Saved under its own name; this plot previously reused the "noBC" file
    # name and overwrote the baseline HVF plot written above.
    plt.savefig("%s_hvfplot_BC.pdf" % (out), bbox_inches='tight')

    data_harmony = data.copy()
    pg.pca(data_harmony, n_components=200)
    harmony_key = pg.run_harmony(data_harmony)
    pg.neighbors(data_harmony, rep=harmony_key, K=200)
    pg.louvain(data_harmony, rep=harmony_key, resolution=2)
    pg.umap(data_harmony, rep=harmony_key, n_neighbors=10, min_dist=0.4)
    pg.scatter(data_harmony, attrs=['louvain_labels', 'Channel'],
               basis='umap')
    plt.savefig("%s_Harmony_BC.pdf" % (out), bbox_inches='tight')
    pg.write_output(data_harmony, "%s_harmony.zarr" % (out))

    ddf = _labelled_sparse_frame(data_harmony)
    # NOTE(review): unlike the earlier selections this one is not guarded by
    # args.citeseq -- confirm it is intended to run for non-citeseq inputs.
    data_harmony.select_data("%s-citeseq" % (data_harmony.uns['genome']))
    ddf2 = _labelled_sparse_frame(data_harmony)
    df_all = pd.concat([ddf, ddf2], axis=1)
    df_all = df_all.sparse.to_dense()
    df_all = df_all.round(3)
    df_all.to_csv("%s.Harmony_correction.data.csv" % (out))

    ### original harmony UMAP data
    # NOTE(review): the citeseq selection made above is still active here;
    # confirm that obs/obsm of this selection carry 'louvain_labels' and
    # 'X_umap', or whether the rna selection should be restored first.
    umap_table = data_harmony.obs.copy()
    umap_table['UMAP1'] = data_harmony.obsm['X_umap'][:, 0]
    umap_table['UMAP2'] = data_harmony.obsm['X_umap'][:, 1]

    from anndata import AnnData
    ann = AnnData(X=umap_table[['UMAP1', 'UMAP2']],
                  obs=umap_table[['Channel', 'louvain_labels']])

    import scanpy as sc
    sc.pl.scatter(ann,
                  x="UMAP1",
                  y="UMAP2",
                  color='louvain_labels',
                  legend_loc='on data',
                  legend_fontsize=12,
                  legend_fontoutline=2,
                  frameon=False,
                  title='clustering of cells')
    plt.savefig("%s_Scapy_UMAP.png" % (out), bbox_inches='tight')
    umap_table.to_csv("%s_Harmony_UMAP.csv" % (out))


def _labelled_sparse_frame(view):
    """Return ``view.X`` as a sparse DataFrame labelled by obs and var index."""
    frame = pd.DataFrame.sparse.from_spmatrix(view.X)
    frame.index = view.obs.index.tolist()
    frame.columns = view.var.index.tolist()
    return frame
def test_read_write_old_5ad(self):
    """Round-trip the ``test_obsm_compound.h5ad`` fixture through ``test.h5ad``.

    The fixture is read, written to ``test.h5ad``, read back from that file,
    and the two objects are compared with ``assert_adata_equal``.
    """
    fixture_path = "tests/pegasus-test-data/input/test_obsm_compound.h5ad"
    rewritten_path = "test.h5ad"

    original = pg.read_input(fixture_path)
    pg.write_output(original, rewritten_path)
    reloaded = pg.read_input(rewritten_path)

    assert_adata_equal(self, original, reloaded)