def cluser_enrich(enr, gl, pval=0.05, top_clusters=20):
    """Filter, cluster and plot an enrichment table.

    Steps (numbering follows the original script notes): p-value filter,
    term clustering, cluster filter, re-enrichment on the clustered gene
    sets, graph drawing and a bar plot coloured by cluster.

    Parameters
    ----------
    enr : pandas.DataFrame
        Enrichment table; the ``'p-Val'`` and ``'-log10(p-Val)'`` columns
        are read below.
    gl : iterable
        Gene list forwarded to the ``erl`` helpers (as a ``set`` for
        ``erl.cluster``, unchanged elsewhere).
    pval : float, optional
        Rows are kept only when ``'p-Val'`` is strictly below this value.
    top_clusters : int, optional
        Rows are kept only when ``'cluster'`` is strictly below this value.

    Returns
    -------
    None
        Nothing is returned; output is produced by the drawing calls.
    """
    # 2. Filter terms by p-value.
    enr = enr[enr['p-Val'] < pval]

    # 3. Cluster terms (kappa coefficient, per the original note).
    # NOTE(review): the 'cluster' column read below is presumably added by
    # erl.cluster -- confirm in the package.
    enr = erl.cluster(set(gl), enr, deep=2)

    # 3-1. Keep the top clusters only.
    enr = enr[enr['cluster'] < top_clusters]

    # 4. Build the clustered gene set.
    gs_clust, nt_cl = erl.cluster_genset(enr)

    # 5. Enrich against the clustered gene set.
    enr_clust = erl.enrich(gl, gs_clust)

    # Deduplicate the index -- TODO: move this into the package.
    nt_cl = nt_cl.loc[~nt_cl.index.duplicated(keep='first')]

    # 6. Attach the cluster id to the clustered enrichment table.
    # A single .loc[rows, column] lookup replaces the chained
    # .loc[rows]['cluster'] indexing.
    cluster_col = nt_cl.loc[enr_clust.index, 'cluster']
    enr_clust = pd.concat([enr_clust, cluster_col], axis=1, sort=False)

    # 7. Build the graphs (from the still-unsorted term table).
    G_gs = erl.make_graph_n(gl, enr, kappa=0.4)
    G_cl = erl.make_graph_n(gl, enr_clust, kappa=0.01)

    # 8. Draw the graphs.
    erl.draw_graph(G_gs, spring=150)
    erl.draw_direct(G_cl)

    # 9. Bar plot of the clustered terms.
    # `enr` is a boolean-filtered selection, so sorting it in place can
    # trigger pandas' SettingWithCopyWarning; sort into a new frame instead.
    enr_sorted = enr.sort_values('cluster', axis=0)
    cm = 'tab20' if max(enr_sorted['cluster']) > 10 else 'tab10'
    _, ax = plt.subplots(figsize=(8, 12))
    sns.barplot(
        y=enr_sorted.index,
        x='-log10(p-Val)',
        ax=ax,
        hue='cluster',
        dodge=False,
        data=enr_sorted,
        palette=cm,
    )
    ax.set_title('Top terms in clusters ')
exp_name = 'GO_KEGG2'

#~ gt = pd.read_table('tracks/MARGE/relativeRP/DN_RP_enhancers_genes.txt')

# Query genes.
# NOTE(review): 'BLK' and 'PTPN22' appear twice in this literal; the
# enrichment call below receives set(gl), so the duplicates collapse there.
gl = [
    'CD3E', 'BLK', 'PTPN22', 'PAG1', 'CTLA4', 'PIK3CD',
    'LAT2', 'CSK', 'CD247', 'CD3G', 'THEMIS', 'PSMB8',
    'LCP2', 'GATA3', 'LAT', 'SLA2', 'SKAP1', 'TRAT1',
    'BCL2', 'CD3D', 'THY1', 'RUNX1', 'BLK', 'PTPN22',
]
#setR = set(gl)

# Enrichment of the unique genes against one library read from a GMT file.
gs_fn = 'EnrichrLibs/Reactome_2016.gmt'
gs = erl.read_gmt(gs_fn)
enr = erl.enrich(set(gl), gs)

#get_Enrichr(out_dir = 'EnrichrLibs')

# Several gene sets in one
gss = [
    'GO_Biological_Process_2018',
    'GO_Cellular_Component_2018',
    'GO_Molecular_Function_2018',
    'KEGG_2016',
    'Reactome_2016',
]
# # -- transform
# rp = rpt.sort_values('lgFC_TLXvsRAG', axis=0, ascending=False)
# rp.drop_duplicates(subset='gene_name', inplace=True)
# Ap,Bp = 'TLX_rel_RP','RAG_rel_RP'
# cols = ['gene_name', Ap, Bp]
# rp = rp[cols]
# rp = rp.set_index(keys=rp.columns[0])

# ### Annotations: genes, tss, T-all oncogenes etc.

# In[9]:

# Load T-ALL oncogenes: the 'T-ALL all' entry of the Cancermine GMT file.
tall = erl.read_gmt(join(DATADIR, 'gene_lists/Cancermine/T-ALL.gmt'))
tall_onc = tall['T-ALL all']

# Mutated genes from the COSMIC table; 'Gname' keeps the part of
# 'Gene name' that precedes the first underscore.
df_mut = pd.read_csv(
    join(
        DATADIR,
        'gene_lists/COSMIC/Genes_mutation_HUMAN_Acute-lymphoblastic-leukaemia.csv',
    )
)
df_mut['Gname'] = df_mut['Gene name'].apply(
    lambda gene_name: gene_name.split('_')[0]
)
tall_mut = list(df_mut['Gname'].unique())

# Load genes body regions
genes = pb.BedTool(
    join(DATADIR, 'tracks/annot_tracks/references/mm9/mm9.refGene.bed')
)
# In[9]:

# List of gene sets as above
#~ gss = [
#~     'GO_Biological_Process_2018',
#~     'GO_Cellular_Component_2018',
#~     'GO_Molecular_Function_2018',
#~     'KEGG_2016',
#~     'Reactome_2016'
#~ ]

# ### Batch enrichment

# In[10]:

# Enrich `gl` against every library named in `gss` in one call.
# NOTE(review): `path_lib` presumably names the folder holding the
# libraries -- confirm in EnrichRLib.
enrr = erl.enrich_gs(gl, gss, path_lib=lib_dir)

# ### Plots

# In[11]:

#~ enrr.sort_values('p-Val', axis=0, inplace = True)
#~ ds = enrr.head(20)
#~ f, ax = plt.subplots()
#~ sns.barplot(y=ds.index,
#~             x='-log10(p-Val)',
#~             ax = ax,
#~             color="Red",
#~             data = ds)
#~ ax.set_title('All terms')
# Value passed to erl.enrich_gs as `path_lib`.
lib_dir = '_tmp'

gss = [
    'GO_Biological_Process_2018',
    #~ 'GO_Cellular_Component_2018',
    #~ 'GO_Molecular_Function_2018',
    'KEGG_2016',
    #'Reactome_2016'
]

# 1. Enrich all terms.
enr = erl.enrich_gs(gl, gss, path_lib=lib_dir)

# 2. Keep terms with p-Val below 0.005.
enr = enr.loc[enr['p-Val'] < 0.005]

# 3. Cluster terms (kappa coefficient, per the original note).
enr = erl.cluster(set(gl), enr, deep=2)

# 3-1. Keep rows whose cluster id is below `top_clusters`.
top_clusters = 18
enr = enr.loc[enr['cluster'] < top_clusters]

# 4. Build the clustered gene set.
gs_clust, nt_cl = erl.cluster_genset(enr)

# 5. Enrich clustered geneset
## Enrichment analysis

# List of gene sets
gss = [
    #~ 'GO_Biological_Process_2018',
    #~ 'GO_Cellular_Component_2018',
    #~ 'GO_Molecular_Function_2018',
    #~ 'KEGG_2016',
    #~ 'Reactome_2016',
    'Cancer_Cell_Line_Encyclopedia',
    'NCI-60_Cancer_Cell_Lines',
    'MSigDB_Computational',
    'MSigDB_Oncogenic_Signatures',
]

enr = erl.enrich_gs(gl, gss, path_lib='../data/EnrichrLibs')

# For further analysis it is convenient to filter terms by p-value.
enr_a = enr.loc[enr['p-Val'] < 0.05]

# Build the term graph from the filtered table, then order the returned
# table by cluster id.
G, enr_out, nt = erl.make_graph(gl, enr_a, kappa=0.4)
enr_out.sort_values('cluster', axis=0, inplace=True)

# --- Plot ---
cm = 'tab20'
erl.draw_graph(G, spring=500, pval_prcnt=0.7, palette=cm)

# First 40 rows of the cluster-sorted table and the figure to draw them on.
ds = enr_out.head(40)
f, ax = plt.subplots(figsize=(12, 12))
# Query gene set: unique values of the first column of `gt`.
# (A Series is directly iterable, so the intermediate list() is unnecessary.)
setR = set(gt.iloc[:, 0])

gss = [
    #~ 'Pierre_gene_sets.gmt',
    'GO_Biological_Process_2018.gmt',
    'GO_Cellular_Component_2018.gmt',
    'GO_Molecular_Function_2018.gmt',
    'KEGG_2016.gmt',
    'Reactome_2016.gmt',
]

# Collect one enrichment table per library and concatenate once after the
# loop. Growing `enrr` with pd.concat on every iteration re-copies all
# previously accumulated rows and seeds the result with an empty DataFrame.
enr_tables = []
for gs in gss:
    pl = erl.read_gmt('EnrichrLibs/' + gs)
    enr = erl.enrich(setR, pl)
    print(len(enr))
    enr_tables.append(enr)

    #~ ds = enr.head(20)
    #~ f, ax = plt.subplots(figsize=(16.5, 6.5))
    #~ sns.barplot(y=ds.index,
    #~             x='-log10(p-Val)',
    #~             ax = ax,
    #~             color="Red",
    #~             data = ds)
    #~ ax.set_title(gs.split(".")[0])

# pd.concat rejects an empty list, so fall back to the empty frame the
# original loop started from.
enrr = pd.concat(enr_tables) if enr_tables else pd.DataFrame()

# p-Value filter
enrr = enrr[enrr['p-Val'] < 0.005]
# NOTE(review): the line breaks of this excerpt appear to be lost. As written,
# only the two leading string literals are code; everything after the first
# '#' is a single comment, so none of the statements spelled out below run.
# The ']' further along has no opening bracket on this line -- presumably it
# closes the `gss` list passed to erl.enrich_gs; confirm against the full file.
# Steps spelled out in the text, in order:
#   1. upper-case every entry of upTLX_list;
#   2. enrich it against `gss` with erl.enrich_gs -> enrUP_tlx;
#   3. sort enrUP_tlx in place by 'p-Val' and take the first 20 rows;
#   4. bar-plot '-log10(p-Val)' per term, titled 'UP_dRP_Tlx_peaks';
#   5. when SAVE is truthy, save the figure to `pp` in PDF format;
#   6. upper-case every entry of dnTLX_list (start of the DN analysis).
'MSigDB_Computational', 'MSigDB_Oncogenic_Signatures', #'Mouse_Gene_Atlas', 'NCI-60_Cancer_Cell_Lines', #'NCI-Nature_2016', #'OMIM_Disease', 'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO', 'Reactome_2016', #'WikiPathways_2016' ] # == UP analysis upTLX_list = [x.upper() for x in upTLX_list] enrUP_tlx = erl.enrich_gs(upTLX_list, gss) # --- Plot --- enrUP_tlx.sort_values('p-Val', axis=0, inplace=True) ds = enrUP_tlx.head(20) f, ax = plt.subplots() sns.barplot(y=ds.index, x='-log10(p-Val)', ax=ax, color="Red", data=ds) ax.set_title('UP_dRP_Tlx_peaks') if SAVE: plt.savefig(pp, format='pdf') # == DN analysis dnTLX_list = [x.upper() for x in dnTLX_list]
# NOTE(review): the line breaks of this excerpt appear to be lost. As written,
# only the string literals before the first '#' are code; the remainder of the
# line is a single comment, so none of the statements spelled out below run.
# The ']' further along has no opening bracket on this line -- presumably it
# closes the `gss` list passed to erl.enrich_gs; confirm against the full file.
# Steps spelled out in the text, in order:
#   1. enrich `gup` against `gss` with erl.enrich_gs -> enrDD;
#   2. sort enrDD in place by 'p-Val' and take the first 20 rows;
#   3. bar-plot '-log10(p-Val)' per term on a 16.5 x 6.5 figure titled
#      'All terms';
#   4. display the figure with plt.show().
'ChEA_2016', 'Disease_Perturbations_from_GEO_down', 'Disease_Perturbations_from_GEO_up', 'Disease_Signatures_from_GEO_down_2014', 'Disease_Signatures_from_GEO_up_2014', 'GO_Biological_Process_2018', 'GO_Cellular_Component_2018', 'GO_Molecular_Function_2018', 'Human_Phenotype_Ontology', 'KEGG_2016', 'MSigDB_Computational', 'MSigDB_Oncogenic_Signatures', 'Mouse_Gene_Atlas', 'NCI-60_Cancer_Cell_Lines', 'NCI-Nature_2016', #'OMIM_Disease', 'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO', 'Reactome_2016', 'WikiPathways_2016' ] enrDD = erl.enrich_gs(gup, gss) enrDD.sort_values('p-Val', axis=0, inplace=True) ds = enrDD.head(20) f, ax = plt.subplots(figsize=(16.5, 6.5)) sns.barplot(y=ds.index, x='-log10(p-Val)', ax=ax, color="Red", data=ds) ax.set_title('All terms') plt.show()
# NOTE(review): the line breaks of this excerpt appear to be lost. The line
# starts with '#', so as written the whole of it is one comment and none of
# the statements spelled out below run.
# Statements spelled out in the text, in order:
#   - import EnrichRLib as `erl`, and `join` from os.path;
#   - WORKDIR is a hard-coded absolute path; SCRIPTS, DATADIR, WGS and RP are
#     joined beneath it;
#   - `genes` is a pb.BedTool over the mm9 refGene BED file under DATADIR
#     (`pb` is not imported in this excerpt -- presumably pybedtools; confirm);
#   - `tall` is read with erl.read_gmt from the Cancermine T-ALL GMT file;
#   - `gl` opens a list of gene symbols that is cut off after 'PTEN', -- the
#     remainder lies outside this excerpt.
# my libs import EnrichRLib as erl # Project settings from os.path import join WORKDIR = '/home/sergio/Res_CIML/TLX3_project' SCRIPTS = join(WORKDIR,'scripts') DATADIR = join(WORKDIR,'data') WGS = join(DATADIR,'tracks/WGS-WES/Germline') RP = join(DATADIR,'tracks/MARGE/relativeRP/bam_input') # Load genes body regions genes = pb.BedTool(join(DATADIR,'tracks/annot_tracks/references/mm9/mm9.refGene.bed')) tall = erl.read_gmt(join(DATADIR,'gene_lists/Cancermine/T-ALL.gmt')) # Genes of interests gl = ['BCL11B', 'EZH2', 'RUNX1', 'FBXW7', 'FOS', 'ETV6', 'RPL5', 'RB1', 'GATA3', 'LEF1', 'TET1', 'PTEN',