예제 #1
0
def cluser_enrich(enr, gl, pval=0.05, top_clusters=20):
    """Cluster enriched terms, enrich the clusters and draw the results.

    Steps, in order: keep terms whose 'p-Val' is below ``pval``; cluster
    them with ``erl.cluster``; keep rows whose 'cluster' value is below
    ``top_clusters``; build a clustered gene set with
    ``erl.cluster_genset`` and enrich ``gl`` against it; draw a graph of
    the terms, a graph of the clusters and a bar plot of the terms.

    Parameters
    ----------
    enr : pandas.DataFrame
        Enrichment table. The code reads its 'p-Val' and '-log10(p-Val)'
        columns. The 'cluster' column is presumably added by
        ``erl.cluster`` -- confirm against EnrichRLib.
    gl : iterable
        Query gene list; passed to the ``erl`` helpers (as ``set(gl)``
        for the clustering step).
    pval : float, default 0.05
        Upper (exclusive) bound on 'p-Val' for a term to be kept.
    top_clusters : int, default 20
        Upper (exclusive) bound on the 'cluster' value for a row to be kept.

    Returns
    -------
    None
        Nothing is returned; the function only draws figures.
    """
    # 2. Filter terms by p-value
    enr = enr[enr['p-Val'] < pval]

    # 3. Cluster the terms (original note: clustering by kappa coefficient)
    enr = erl.cluster(set(gl), enr, deep=2)

    # 3-1. Keep only the top clusters
    enr = enr[enr['cluster'] < top_clusters]

    # 4. Build the clustered gene set
    gs_clust, nt_cl = erl.cluster_genset(enr)

    # 5. Enrich the gene list against the clustered gene set
    enr_clust = erl.enrich(gl, gs_clust)

    # Drop repeated index labels (first occurrence wins) so the label lookup
    # below yields one row per label.  TODO: do this inside the package.
    nt_cl = nt_cl.loc[~nt_cl.index.duplicated(keep='first')]

    # 6. Attach the cluster id to the clustered enrichment table
    cluster_col = nt_cl.loc[enr_clust.index, 'cluster']
    enr_clust = pd.concat([enr_clust, cluster_col], axis=1, sort=False)

    # 7. Build the graphs: one over the terms, one over the clusters
    G_gs = erl.make_graph_n(gl, enr, kappa=0.4)
    G_cl = erl.make_graph_n(gl, enr_clust, kappa=0.01)

    # 8. Draw the graphs
    erl.draw_graph(G_gs, spring=150)
    erl.draw_direct(G_cl)

    # 9. Bar plot of the clustered terms.
    # `enr` is the result of boolean filtering, so sort into a new frame
    # instead of sorting in place (avoids SettingWithCopyWarning).
    enr = enr.sort_values('cluster', axis=0)

    # 'tab10' has 10 colours; switch to 'tab20' when cluster ids exceed 10.
    cm = 'tab20' if enr['cluster'].max() > 10 else 'tab10'

    _fig, ax = plt.subplots(figsize=(8, 12))
    sns.barplot(y=enr.index,
                x='-log10(p-Val)',
                ax=ax,
                hue='cluster',
                dodge=False,
                data=enr,
                palette=cm)
    ax.set_title('Top terms in clusters ')
예제 #2
0

# Experiment name (not used further in this snippet).
exp_name = 'GO_KEGG2'

# Disabled alternative input:
#~ gt = pd.read_table('tracks/MARGE/relativeRP/DN_RP_enhancers_genes.txt')

# Query gene list.  'BLK' and 'PTPN22' are listed twice; the duplicates
# collapse when set(gl) is built for the enrichment call below.
gl = [
    'CD3E', 'BLK', 'PTPN22', 'PAG1', 'CTLA4', 'PIK3CD', 'LAT2', 'CSK',
    'CD247', 'CD3G', 'THEMIS', 'PSMB8', 'LCP2', 'GATA3', 'LAT', 'SLA2',
    'SKAP1', 'TRAT1', 'BCL2', 'CD3D', 'THY1', 'RUNX1', 'BLK', 'PTPN22',
]

#setR = set(gl)

# Read one gene-set library from a .gmt file and enrich the unique genes
# of `gl` against it.
gs_fn = 'EnrichrLibs/Reactome_2016.gmt'
gs = erl.read_gmt(gs_fn)
enr = erl.enrich(set(gl), gs)

#get_Enrichr(out_dir = 'EnrichrLibs')

# Names of several gene-set libraries to be used together
gss = [
    'GO_Biological_Process_2018',
    'GO_Cellular_Component_2018',
    'GO_Molecular_Function_2018',
    'KEGG_2016',
    'Reactome_2016',
]

예제 #3
0
# # -- transform
# rp = rpt.sort_values('lgFC_TLXvsRAG', axis=0, ascending=False)
# rp.drop_duplicates(subset='gene_name', inplace=True)

# Ap,Bp = 'TLX_rel_RP','RAG_rel_RP'
# cols = ['gene_name', Ap, Bp]

# rp = rp[cols]
# rp = rp.set_index(keys=rp.columns[0])

# ### Annotations: genes, tss, T-all oncogenes etc.

# In[9]:

# Load T-ALL oncogenes from the Cancermine .gmt file
tall = erl.read_gmt(join(DATADIR, 'gene_lists/Cancermine/T-ALL.gmt'))

# Load the COSMIC mutation table
df_mut = pd.read_csv(
    join(DATADIR,
         'gene_lists/COSMIC/Genes_mutation_HUMAN_Acute-lymphoblastic-leukaemia.csv'))

# 'Gname' = the part of 'Gene name' before the first underscore
df_mut['Gname'] = df_mut['Gene name'].map(
    lambda gene_name: gene_name.partition('_')[0])

# Gene lists: the 'T-ALL all' set, and the unique mutated gene names
tall_onc = tall['T-ALL all']
tall_mut = list(df_mut['Gname'].unique())

# Load genes body regions
genes = pb.BedTool(
    join(DATADIR, 'tracks/annot_tracks/references/mm9/mm9.refGene.bed')
)
예제 #4
0
# In[9]:

# List of gene sets as above
#~ gss = [
#~ 'GO_Biological_Process_2018',
#~ 'GO_Cellular_Component_2018',
#~ 'GO_Molecular_Function_2018',
#~ 'KEGG_2016',
#~ 'Reactome_2016'
#~ ]

# ### Batch enrichment

# In[10]:

# Enrich `gl` against every library named in `gss` in a single call.
# NOTE(review): `path_lib` presumably names the folder holding the
# library files -- confirm in EnrichRLib.
enrr = erl.enrich_gs(
    gl,
    gss,
    path_lib=lib_dir,
)

# ### Plots

# In[11]:

#~ enrr.sort_values('p-Val', axis=0, inplace = True)
#~ ds = enrr.head(20)

#~ f, ax = plt.subplots()
#~ sns.barplot(y=ds.index,
#~ x='-log10(p-Val)',
#~ ax = ax,
#~ color="Red",
#~ data = ds)
#~ ax.set_title('All terms')
예제 #5
0
# Value passed to erl.enrich_gs as `path_lib`
lib_dir = '_tmp'

# Gene-set libraries to use; commented-out entries are disabled
gss = [
    'GO_Biological_Process_2018',
    #~ 'GO_Cellular_Component_2018',
    #~ 'GO_Molecular_Function_2018',
    'KEGG_2016',
    #'Reactome_2016'
]

# Clusters with a 'cluster' value below this bound are kept (step 3-1)
top_clusters = 18

# Step 1: enrich against all libraries in `gss`
enr = erl.enrich_gs(gl, gss, path_lib=lib_dir)

# Step 2: keep terms with p-Val below 0.005
enr = enr.loc[enr['p-Val'] < 0.005]

# Step 3: cluster the terms (original note: by kappa coefficient)
enr = erl.cluster(set(gl), enr, deep=2)

# Step 3-1: keep only the top clusters
enr = enr.loc[enr['cluster'] < top_clusters]

# Step 4: build the clustered gene set
gs_clust, nt_cl = erl.cluster_genset(enr)

#5= Enrich clustered geneset
예제 #6
0
## Enrichment analysis

# Gene-set libraries to use; commented-out entries are disabled
gss = [
    #~ 'GO_Biological_Process_2018',
    #~ 'GO_Cellular_Component_2018',
    #~ 'GO_Molecular_Function_2018',
    #~ 'KEGG_2016',
    #~ 'Reactome_2016',
    'Cancer_Cell_Line_Encyclopedia',
    'NCI-60_Cancer_Cell_Lines',
    'MSigDB_Computational',
    'MSigDB_Oncogenic_Signatures',
]

enr = erl.enrich_gs(gl, gss, path_lib='../data/EnrichrLibs')

# For further analysis it is convenient to filter terms by p-value
enr_a = enr.loc[enr['p-Val'] < 0.05]

# Build the graph; the call also returns an enrichment table and `nt`
G, enr_out, nt = erl.make_graph(gl, enr_a, kappa=0.4)

# Order the returned table by cluster (sorted in place)
enr_out.sort_values('cluster', axis=0, inplace=True)

# --- Plot ---
cm = 'tab20'
erl.draw_graph(
    G,
    spring=500,
    pval_prcnt=0.7,
    palette=cm,
)

# First 40 rows of the cluster-sorted table
ds = enr_out.iloc[:40]

f, ax = plt.subplots(figsize=(12, 12))
예제 #7
0
# Query set: unique values of the first column of `gt`
setR = set(gt.iloc[:, 0])

# Gene-set library files, read from the 'EnrichrLibs/' folder below
gss = [
    #~ 'Pierre_gene_sets.gmt',
    'GO_Biological_Process_2018.gmt',
    'GO_Cellular_Component_2018.gmt',
    'GO_Molecular_Function_2018.gmt',
    'KEGG_2016.gmt',
    'Reactome_2016.gmt'
]

# Collect the per-library tables and concatenate once after the loop.
# Growing a DataFrame with pd.concat inside the loop re-copies all rows
# on every iteration, and seeding it with an empty pd.DataFrame() is
# deprecated in recent pandas.
enr_frames = []

for gs in gss:
    pl = erl.read_gmt('EnrichrLibs/' + gs)
    enr = erl.enrich(setR, pl)
    print(len(enr))  # number of rows obtained for this library
    enr_frames.append(enr)
    #~ ds = enr.head(20)
    #~ f, ax = plt.subplots(figsize=(16.5, 6.5))
    #~ sns.barplot(y=ds.index,
    #~ x='-log10(p-Val)',
    #~ ax = ax,
    #~ color="Red",
    #~ data = ds)
    #~ ax.set_title(gs.split(".")[0])

# pd.concat raises on an empty list, so fall back to an empty frame
enrr = pd.concat(enr_frames) if enr_frames else pd.DataFrame()

# p-Value filter
enrr = enrr[enrr['p-Val'] < 0.005]
예제 #8
0
    'MSigDB_Computational',
    'MSigDB_Oncogenic_Signatures',
    #'Mouse_Gene_Atlas',
    'NCI-60_Cancer_Cell_Lines',
    #'NCI-Nature_2016',
    #'OMIM_Disease',
    'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
    'Reactome_2016',
    #'WikiPathways_2016'
]

# == UP analysis

# Upper-case every entry of the UP gene list before enrichment
upTLX_list = [gene.upper() for gene in upTLX_list]

enrUP_tlx = erl.enrich_gs(upTLX_list, gss)

# --- Plot ---
# Sort by p-value in place and take the first 20 rows
enrUP_tlx.sort_values('p-Val', axis=0, inplace=True)
ds = enrUP_tlx.iloc[:20]

f, ax = plt.subplots()
sns.barplot(
    data=ds,
    x='-log10(p-Val)',
    y=ds.index,
    color="Red",
    ax=ax,
)
ax.set_title('UP_dRP_Tlx_peaks')

# Write the current figure to `pp` as PDF when saving is enabled
if SAVE:
    plt.savefig(pp, format='pdf')

# == DN analysis

# Upper-case every entry of the DN gene list as well
dnTLX_list = [gene.upper() for gene in dnTLX_list]
예제 #9
0
    'ChEA_2016',
    'Disease_Perturbations_from_GEO_down',
    'Disease_Perturbations_from_GEO_up',
    'Disease_Signatures_from_GEO_down_2014',
    'Disease_Signatures_from_GEO_up_2014',
    'GO_Biological_Process_2018',
    'GO_Cellular_Component_2018',
    'GO_Molecular_Function_2018',
    'Human_Phenotype_Ontology',
    'KEGG_2016',
    'MSigDB_Computational',
    'MSigDB_Oncogenic_Signatures',
    'Mouse_Gene_Atlas',
    'NCI-60_Cancer_Cell_Lines',
    'NCI-Nature_2016',
    #'OMIM_Disease',
    'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
    'Reactome_2016',
    'WikiPathways_2016'
]

# Enrich `gup` against all libraries listed in `gss`
enrDD = erl.enrich_gs(gup, gss)

# Sort by p-value in place and take the first 20 rows
enrDD.sort_values('p-Val', axis=0, inplace=True)
ds = enrDD.iloc[:20]

# Horizontal bar plot of -log10(p-Val) for the selected terms
f, ax = plt.subplots(figsize=(16.5, 6.5))
sns.barplot(
    data=ds,
    x='-log10(p-Val)',
    y=ds.index,
    color="Red",
    ax=ax,
)
ax.set_title('All terms')

plt.show()
예제 #10
0
# my libs
import EnrichRLib as erl


# Project settings
from os.path import join 
# Project root; every other path below is derived from it
WORKDIR = '/home/sergio/Res_CIML/TLX3_project'

# Sub-directories of the project
SCRIPTS = join(WORKDIR, 'scripts')
DATADIR = join(WORKDIR, 'data')

# Track folders inside the data directory
WGS = join(DATADIR, 'tracks/WGS-WES/Germline')
RP = join(DATADIR, 'tracks/MARGE/relativeRP/bam_input')

# Load genes body regions
genes = pb.BedTool(
    join(DATADIR, 'tracks/annot_tracks/references/mm9/mm9.refGene.bed')
)

# Load the T-ALL gene sets from the Cancermine .gmt file
tall = erl.read_gmt(
    join(DATADIR, 'gene_lists/Cancermine/T-ALL.gmt')
)

# Genes of interests

gl =     ['BCL11B',
        'EZH2',
        'RUNX1',
        'FBXW7',
        'FOS',
        'ETV6',
        'RPL5',
        'RB1',
        'GATA3',
        'LEF1',
        'TET1',
        'PTEN',