Ejemplo n.º 1
0
 def test_read_write_h5ad(self):
     """Check that data survives a write to ``test.h5ad`` and a read back.

     The hgmm_1k_v3 test input is loaded with ``pg.read_input``, written
     with ``pg.write_output``, reloaded, and the two objects are compared
     with ``assert_adata_equal``.
     """
     source_path = "tests/pegasus-test-data/input/hgmm_1k_v3_filtered_feature_bc_matrix/"
     roundtrip_path = "test.h5ad"

     original = pg.read_input(source_path)
     pg.write_output(original, roundtrip_path)
     reloaded = pg.read_input(roundtrip_path)

     assert_adata_equal(self, original, reloaded)
Ejemplo n.º 2
0
def plot_umap(adata, Z_torch, Z_py, Z_R, prefix, batch_key):
    """Compute UMAPs for three embeddings and render scatter plots via the CLI.

    When ``adata`` is not None, ``Z_torch``, ``Z_py`` and ``Z_R`` are stored
    in ``adata.obsm`` under ``X_torch``, ``X_py`` and ``X_harmony``; for each
    of them ``pg.neighbors`` and ``pg.umap`` are run (output basis
    ``umap_<rep>``) and the object is written to ``./result/<prefix>_result``.
    When ``adata`` is None that step is skipped and a notice is printed.

    In both cases four ``pegasus plot scatter`` commands are then run through
    ``os.system`` (bases ``umap``, ``umap_torch``, ``umap_py``,
    ``umap_harmony``), coloured by ``batch_key``. The process exits with
    status 1 as soon as one command returns a non-zero status.
    """
    if adata is None:
        print("Use precalculated AnnData result.")
    else:
        embeddings = (('torch', Z_torch), ('py', Z_py), ('harmony', Z_R))

        # Register every embedding before any neighbor graph is built.
        for rep, coords in embeddings:
            adata.obsm['X_' + rep] = coords

        for rep, _ in embeddings:
            pg.neighbors(adata, rep=rep)
            pg.umap(adata, rep=rep, out_basis='umap_' + rep)

        pg.write_output(adata, "./result/{}_result".format(prefix))

    command_template = (
        "pegasus plot scatter --basis {basis} --attributes {attr} --alpha 0.5 "
        "./result/{name}_result.h5ad ./plots/{name}.{tag}.umap.pdf"
    )
    # (basis to plot, tag used in the output PDF name)
    plots = (
        ('umap', 'before'),
        ('umap_torch', 'torch'),
        ('umap_py', 'py'),
        ('umap_harmony', 'harmony'),
    )
    for basis, tag in plots:
        status = os.system(
            command_template.format(basis=basis, attr=batch_key, name=prefix, tag=tag)
        )
        if status:
            sys.exit(1)
Ejemplo n.º 3
0
def plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix, batch_key):
    """Compute UMAPs for three embeddings and render scatter plots via the CLI.

    When ``adata`` is not None, ``Z_cpu``, ``Z_gpu`` and ``Z_R`` are stored
    in ``adata.obsm`` under ``X_cpu``, ``X_gpu`` and ``X_harmony``; for each
    of them ``pg.neighbors`` and ``pg.umap`` are run (output basis
    ``umap_<rep>``) and the object is written to ``./result/<prefix>_result``.
    When ``adata`` is None that step is skipped and a notice is printed.

    In both cases four ``pegasus plot scatter`` commands are then run through
    ``os.system`` (bases ``umap``, ``umap_cpu``, ``umap_gpu``,
    ``umap_harmony``), coloured by ``batch_key``. The process exits with
    status 1 as soon as one command returns a non-zero status.
    """
    if adata is None:
        print("Use precalculated AnnData result.")
    else:
        embeddings = (('cpu', Z_cpu), ('gpu', Z_gpu), ('harmony', Z_R))

        # Register every embedding before any neighbor graph is built.
        for rep, coords in embeddings:
            adata.obsm['X_' + rep] = coords

        for rep, _ in embeddings:
            pg.neighbors(adata, rep=rep)
            pg.umap(adata, rep=rep, out_basis='umap_' + rep)

        pg.write_output(adata, "./result/{}_result".format(prefix))

    command_template = (
        "pegasus plot scatter --basis {basis} --attributes {attr} --alpha 0.5 "
        "./result/{prefix}_result.h5ad ./plots/{prefix}.{tag}.umap.pdf"
    )
    # (basis to plot, tag used in the output PDF name)
    plots = (
        ('umap', 'before'),
        ('umap_cpu', 'cpu'),
        ('umap_gpu', 'gpu'),
        ('umap_harmony', 'harmony'),
    )
    for basis, tag in plots:
        status = os.system(
            command_template.format(basis=basis, attr=batch_key, prefix=prefix, tag=tag)
        )
        if status:
            sys.exit(1)
Ejemplo n.º 4
0
 def test_read_write_old_5ad_backed_whitelist(self):
     """Rewrite a file opened with ``h5ad_mode="r+"`` in place with a whitelist.

     The fixture is first copied into the working directory so the original
     test data is never modified. The copy is read with ``h5ad_mode="r+"``,
     written back to the same path with ``whitelist=["obs"]``, read again,
     and the two objects are compared with ``assert_adata_equal``.
     """
     working_copy = "test_obsm_compound.h5ad"
     shutil.copy(
         "tests/pegasus-test-data/input/test_obsm_compound.h5ad",
         working_copy,
     )

     opened = pg.read_input(working_copy, h5ad_mode="r+")
     pg.write_output(opened, working_copy, whitelist=["obs"])
     reread = pg.read_input(working_copy)

     assert_adata_equal(self, opened, reread)
Ejemplo n.º 5
0
def write_dataset(ds, path, output_format='txt'):
    """Write ``ds`` to ``path`` in the requested format.

    ``path`` is converted to ``str`` and ``'.' + output_format`` is appended
    unless the path already ends with it (case-insensitive comparison).

    * ``output_format == 'txt'``: ``ds.X`` (converted with ``toarray()`` when
      it is a scipy sparse matrix) is written as a tab-separated table with
      ``ds.obs.index`` as rows, ``ds.var.index`` as columns and ``id`` as the
      index label.
    * any other format: the call is delegated to ``pg.write_output``.
    """
    path = str(path)
    suffix = '.' + output_format
    if not path.lower().endswith(suffix):
        path = path + suffix

    if output_format != 'txt':
        pg.write_output(ds, path)
        return

    matrix = ds.X
    if scipy.sparse.isspmatrix(matrix):
        matrix = matrix.toarray()

    table = pd.DataFrame(matrix, index=ds.obs.index, columns=ds.var.index)
    table.to_csv(path, index_label='id', sep='\t', doublequote=False)
0
 def test_write_mtx(self):
     """Write data as ``matrix.mtx.gz`` and compare it with what is read back.

     A ``test`` column is added to both ``var`` and ``obs`` before writing.
     After reloading the output directory, the ``obs.csv.gz`` and
     ``var.csv.gz`` files from that directory are joined onto the reloaded
     ``obs`` and ``var`` before the comparison with ``assert_adata_equal``.
     """
     mtx_dir = 'test_mtx/mm10'

     expected = pg.read_input(
         "tests/pegasus-test-data/input/heart_1k_v3/filtered_feature_bc_matrix.h5"
     )
     expected.var['test'] = 1.0
     expected.obs['test'] = 1.0
     pg.write_output(expected, os.path.join(mtx_dir, 'matrix.mtx.gz'))

     reloaded = pg.read_input(mtx_dir)

     # Drop the reloaded 'Channel' column; it is taken from the csv instead.
     del reloaded.obs['Channel']
     obs_extra = pd.read_csv(os.path.join(mtx_dir, 'obs.csv.gz'), index_col=0)
     reloaded.obs = reloaded.obs.join(obs_extra)

     var_extra = pd.read_csv(os.path.join(mtx_dir, 'var.csv.gz'), index_col=0)
     reloaded.var = reloaded.var.join(var_extra)
     del reloaded.var['featuretype']

     assert_adata_equal(self, expected, reloaded, obs_blacklist=['Channel'])
Ejemplo n.º 7
0
if __name__ == "__main__":
    # Command-line entry point: replace the X_pca embedding stored in an
    # h5ad file with the values read from a CSV file and write the result.
    import pandas as pd
    import pegasus as pg
    import argparse

    cli = argparse.ArgumentParser(
        description='Update the X_pca with the results of harmony')
    for positional in ('h5ad_filename', 'harmony_csv', 'output'):
        cli.add_argument(positional, type=str)
    options = cli.parse_args()

    harmony_table = pd.read_csv(options.harmony_csv)
    # Transpose, then drop the first row of the transposed array, i.e. the
    # first CSV column (the id of the PC, per the original author's note).
    embedding = harmony_table.values.T[1:]

    adata = pg.read_input(options.h5ad_filename)
    adata.obsm["X_pca"] = embedding
    pg.write_output(adata, options.output)
Ejemplo n.º 8
0
def _labelled_sparse_frame(unimodal):
    """Return ``unimodal.X`` as a sparse DataFrame labelled by the obs/var indices."""
    frame = pd.DataFrame.sparse.from_spmatrix(unimodal.X)
    frame.index = unimodal.obs.index.tolist()
    frame.columns = unimodal.var.index.tolist()
    return frame


def main():
    """Aggregate, QC, cluster and Harmony-correct the samples from the CLI args.

    Steps, all outputs being prefixed with ``args.output``:

    1. Run ``pegasus aggregate_matrix`` on ``args.input_csv`` and load the
       resulting ``<output>.zarr.zip``.
    2. Compute QC metrics, save the filter statistics and three QC violin
       plots, then filter, identify robust genes and log-normalise.
    3. Pickle the log-normalised RNA matrix (and the antibody matrix when
       ``args.citeseq`` is set).
    4. Baseline without batch correction: HVF selection, PCA, neighbors,
       Louvain, UMAP and a scatter plot.
    5. With batch correction: HVF selection with ``consider_batch=True``,
       PCA, Harmony, neighbors, Louvain, UMAP, a scatter plot and a zarr
       dump of the corrected object.
    6. Export the RNA and citeseq matrices as one rounded CSV, the UMAP
       coordinates as CSV, and a scanpy scatter plot of the clusters.

    Raises:
        SystemExit: if the ``pegasus aggregate_matrix`` command returns a
            non-zero status.
    """
    args = my_args()
    out = args.output

    command = "pegasus aggregate_matrix %s %s" % (args.input_csv, out)
    status = os.system(command)
    if status != 0:
        # Stop here instead of failing later on a missing zarr file.
        raise SystemExit(
            "'%s' failed with exit status %s" % (command, status))
    zarr_file = "%s.zarr.zip" % (out)

    data = pg.read_input(zarr_file)
    if args.citeseq:
        data.select_data("%s-rna" % (data.uns['genome']))
    pg.qc_metrics(data,
                  percent_mito=args.MT_percent,
                  mito_prefix=args.MT_prefix,
                  max_genes=args.max_genes)
    df_qc = pg.get_filter_stats(data)
    df_qc.to_csv("%s_qc_get_filter_stats.csv" % (out))

    qc_plots = (
        ('gene', "%s_qcviolin_gene.pdf"),
        ('count', "%s_qcviolin_UMI_count.pdf"),
        ('mito', "%s_qcviolin_UMI_mito.pdf"),
    )
    for plot_type, filename in qc_plots:
        pg.qcviolin(data, plot_type=plot_type)
        plt.savefig(filename % (out), bbox_inches='tight')

    # filtering
    pg.filter_data(data)
    pg.identify_robust_genes(data, percent_cells=0.05)
    pg.log_norm(data)

    print(data.obs['Channel'].value_counts())
    # save log norm data, rna
    _labelled_sparse_frame(data).to_pickle("%s.rna.log_norm.pkl" % (out))

    if args.citeseq:
        data.select_data("%s-citeseq" % (data.uns['genome']))
        _labelled_sparse_frame(data).to_pickle(
            "%s.antibody.log_norm.pkl" % (out))
        # Return to the RNA modality for the downstream analysis.
        data.select_data("%s-rna" % (data.uns['genome']))

    # without batch correction
    data_baseline = data.copy()
    pg.highly_variable_features(data_baseline,
                                consider_batch=False,
                                n_top=4000)

    pg.hvfplot(data_baseline)
    plt.savefig("%s_hvfplot_noBC.pdf" % (out), bbox_inches='tight')

    pg.pca(data_baseline, n_components=200)
    pg.neighbors(data_baseline, K=200)
    pg.louvain(data_baseline, resolution=2)
    pg.umap(data_baseline, n_neighbors=10, min_dist=0.4)
    pg.scatter(data_baseline,
               attrs=['louvain_labels', 'Channel'],
               basis='umap')
    plt.savefig("%s_without_BC.pdf" % (out), bbox_inches='tight')

    # with batch correction
    pg.highly_variable_features(data, consider_batch=True, n_top=4000)

    pg.hvfplot(data)
    # Distinct file name: this plot used to overwrite the baseline
    # "_hvfplot_noBC.pdf" saved above.
    plt.savefig("%s_hvfplot_BC.pdf" % (out), bbox_inches='tight')

    data_harmony = data.copy()
    pg.pca(data_harmony, n_components=200)
    harmony_key = pg.run_harmony(data_harmony)
    pg.neighbors(data_harmony, rep=harmony_key, K=200)
    pg.louvain(data_harmony, rep=harmony_key, resolution=2)
    pg.umap(data_harmony, rep=harmony_key, n_neighbors=10, min_dist=0.4)
    pg.scatter(data_harmony, attrs=['louvain_labels', 'Channel'], basis='umap')
    plt.savefig("%s_Harmony_BC.pdf" % (out), bbox_inches='tight')
    pg.write_output(data_harmony, "%s_harmony.zarr" % (out))

    rna_frame = _labelled_sparse_frame(data_harmony)
    # NOTE(review): the citeseq modality is selected regardless of
    # args.citeseq, as in the original code -- confirm this is intended
    # for RNA-only inputs.
    data_harmony.select_data("%s-citeseq" % (data_harmony.uns['genome']))
    citeseq_frame = _labelled_sparse_frame(data_harmony)
    # Switch back to RNA, mirroring the pickle export above: the Louvain
    # labels and UMAP coordinates read below were computed while the RNA
    # modality was selected.
    data_harmony.select_data("%s-rna" % (data_harmony.uns['genome']))

    df_all = pd.concat([rna_frame, citeseq_frame], axis=1)
    df_all = df_all.sparse.to_dense()
    df_all = df_all.round(3)
    df_all.to_csv("%s.Harmony_correction.data.csv" % (out))

    ### original harmony UMAP data
    umap_table = data_harmony.obs.copy()
    umap_table['UMAP1'] = data_harmony.obsm['X_umap'][:, 0]
    umap_table['UMAP2'] = data_harmony.obsm['X_umap'][:, 1]

    from anndata import AnnData
    import scanpy as sc
    ann = AnnData(X=umap_table[['UMAP1', 'UMAP2']],
                  obs=umap_table[['Channel', 'louvain_labels']])
    sc.pl.scatter(ann,
                  x="UMAP1",
                  y="UMAP2",
                  color='louvain_labels',
                  legend_loc='on data',
                  legend_fontsize=12,
                  legend_fontoutline=2,
                  frameon=False,
                  title='clustering of cells')
    plt.savefig("%s_Scapy_UMAP.png" % (out), bbox_inches='tight')
    umap_table.to_csv("%s_Harmony_UMAP.csv" % (out))
Ejemplo n.º 9
0
 def test_read_write_old_5ad(self):
     """Check that the ``test_obsm_compound.h5ad`` fixture survives a rewrite.

     The fixture is read with ``pg.read_input``, written to ``test.h5ad``,
     read back, and the two objects are compared with ``assert_adata_equal``.
     """
     fixture_path = "tests/pegasus-test-data/input/test_obsm_compound.h5ad"
     roundtrip_path = "test.h5ad"

     original = pg.read_input(fixture_path)
     pg.write_output(original, roundtrip_path)
     reloaded = pg.read_input(roundtrip_path)

     assert_adata_equal(self, original, reloaded)