def enrichment_analysis(self, library, output):
    '''
    Run Enrichr on the union of both gene lists and save the results.

    Attributes:
    -----------
    library - name of the Enrichr library to query. It must be one of the
        names returned by gseapy.get_library_name(), for example:
        - 'GO_Molecular_Function_2018'
        - 'GO_Biological_Process_2018'
        - 'GO_Cellular_Component_2018'
    output - directory name handed to gseapy as ``outdir``
    '''
    available = gseapy.get_library_name()
    assert library in available, "the library is not available, check gseapy.get_library_name() for available options"
    assert (self.convert == True) or (
        self.origID == "symbol"
    ), "EnrichR accepts only gene names as an input, thus please set 'convert' to True and indicate the original gene ID"

    # Translate every gene of both lists through self.mapping, keeping the
    # genes1-then-genes2 order of the query.
    all_genes_names = [
        self.mapping[gene]
        for group in (self.genes1, self.genes2)
        for gene in group
    ]
    gseapy.enrichr(
        gene_list=all_genes_names,
        description='pathway',
        gene_sets=library,
        cutoff=0.05,
        outdir=output,
    )
def gsea(genes,
         description='',
         out='./',
         sleeptime=1,
         sleep=False,
         gsets=[
             'GO_Biological_Process_2018', 'KEGG_2019_Human',
             'WikiPathways_2019_Human'
         ]):
    """
    Submit a gene list to Enrichr through gseapy.

    genes (list of str): gene symbols
    description (str): name for enrichment report
    out (str): output directory passed to gseapy
    sleeptime (int): seconds to wait before each query when ``sleep`` is set
        (overloading the server causes the connection to be cut)
    sleep (bool): when True, query one library at a time with a pause before
        each; when False, send all libraries in a single call
    gsets (list of str): Enrichr library names
    """
    if not sleep:
        # One request covering every library.
        gseapy.enrichr(gene_list=genes,
                       description=description,
                       gene_sets=gsets,
                       outdir=out)
        return

    # Throttled mode: one request per library, each preceded by a pause.
    for library in gsets:
        time.sleep(sleeptime)
        gseapy.enrichr(gene_list=genes,
                       description=description,
                       gene_sets=library,
                       outdir=out)
def enrichr_validation(gene_list,
                       gene_rank=None,
                       outdir="validation_results",
                       gene_sets='KEGG_2016',
                       pvalue=0.05):
    """
    Perform the enrichr tool (http://amp.pharm.mssm.edu/Enrichr/)
    Enrichment of a gene list

    Args:
        -gene_list (list): Gene list to analyze
        -gene_rank (list): Optional score per gene, aligned with gene_list.
            When given, genes and scores are submitted together as a
            two-column table.
        -outdir (str): Location to save the files
        -gene_sets (str): Gene set library to use for the enrichment
        -pvalue (float): Keep only rows whose "Adjusted P-value" is below
            this threshold

    Returns:
        The rows of the gseapy result table (``res2d``) that pass the
        adjusted p-value threshold.

    Raises:
        TypeError: if gene_rank is given but is not a list.
    """
    if gene_rank is None:
        query = gene_list
    else:
        if not isinstance(gene_rank, list):
            raise TypeError("please provide gene_rank as a list")
        # One row per gene: (gene, score).
        query = pd.DataFrame(np.array([gene_list, gene_rank]).T,
                             columns=['gene', 'score'])

    # Both branches honour the caller's gene_sets and outdir.
    enr = gp.enrichr(gene_list=query,
                     description='pathway',
                     gene_sets=gene_sets,
                     outdir=outdir,
                     cutoff=0.05,
                     format='png')

    table = enr.res2d
    return table[table["Adjusted P-value"] < pvalue]
def res(x_one, y_one, top):
    """
    Pair the P-values of the top terms of ``x_one`` with those of ``y_one``.

    The first ``top`` rows of the Enrichr result for ``x_one`` (cutoff 0.05)
    are looked up by term name in the Enrichr result for ``y_one``
    (cutoff 1.0). Both queries use the module-level library ``lib``.

    Returns a list of ``(p_value_in_x, p_value_in_y)`` tuples, one per term
    found in both results. Any failure of either query yields an empty list.
    """
    try:
        top_terms = gp.enrichr(gene_list=x_one,
                               gene_sets=lib,
                               organism='Human',
                               cutoff=0.05).results[['Term', 'P-value'
                                                     ]].head(top).values.tolist()
    except Exception:
        return []
    if not top_terms:
        return []

    try:
        reference = gp.enrichr(gene_list=y_one,
                               gene_sets=lib,
                               organism='Human',
                               cutoff=1.0).results[['Term', 'P-value']]
    except Exception:
        return []

    pairs = []
    for name, p_x in top_terms:
        hits = reference.loc[reference['Term'] == name]
        if hits.shape[0] > 0:
            # First matching row is [Term, P-value]; keep its P-value.
            p_y = hits.iloc[0].values.tolist()[1]
            pairs.append((p_x, p_y))
    return pairs
def enrichr(gene_list,
            description,
            out_dir,
            scan=None,
            max_terms=10,
            figsize=(12, 6),
            run_main=False):
    '''
    Runs enrichr on a gene list against a fixed set of libraries (KEGG, GO
    Biological Process, ChIP-X consensus TFs, ChEA, OMIM Disease) plus any
    extra libraries supplied by the caller.

    Inputs
    ------
    gene_list: list of genes to perform enrichment on
    description: string description used in report names and titles
    out_dir: output directory (passed through make_folder)
    scan: dictionary {nickname: enrichr library name} with additional
        enrichr dbs to scan (http://amp.pharm.mssm.edu/Enrichr/#stats);
        entries override built-in nicknames
    max_terms: limit return plot to this max
    figsize: change fig size
    run_main: forwarded to out_result

    Returns
    -------
    None
    '''
    out_dir = make_folder(out_dir)

    libraries = {
        'KEGG': 'KEGG_2016',
        'GO_biological_process': 'GO_Biological_Process_2017b',
        'ChIP-X_Consensus_TFs': 'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
        'ChEA': 'ChEA_2016',
        'OMIM_Disease': 'OMIM_Disease',
    }
    if isinstance(scan, dict):
        libraries.update(scan)

    for nick, name in libraries.items():
        gseapy.enrichr(gene_list=gene_list,
                       figsize=figsize,
                       top_term=max_terms,
                       description=f'{description}_{nick}',
                       gene_sets=name,
                       outdir=out_dir,
                       format='png')
        out_result(f'{out_dir}{nick}.{name}.enrichr.reports.png',
                   f'Enrichr: {nick} for {description}',
                   run_main=run_main)

    # Keep a copy of the queried genes next to the reports.
    gene_table = pd.DataFrame({'Gene Name': gene_list},
                              index=range(len(gene_list)))
    gene_table.to_excel(f'{out_dir}{description}_genes.xlsx', index=None)
def adj_enrich_score(ranks, gene_set_file, gene_sets, cutoff=.5):
    """
    Build a table of Enrichr adjusted P-values, one column per cluster.

    For every column of ``ranks`` the index entries whose value exceeds
    ``cutoff`` are submitted to Enrichr against ``gene_set_file``. The
    returned frame is indexed by ``gene_sets`` (used only for the pathway
    names) and filled by aligning each result on its "Term" column.
    """
    nes_df = pd.DataFrame(index=gene_sets)

    for cluster in ranks.columns.tolist():
        print("computing for cluster %s" % cluster)
        selected = ranks.index[(ranks[cluster] > cutoff).tolist()]
        print(len(selected))

        enr = gp.enrichr(
            gene_list=selected.tolist(),
            description='test_name',
            gene_sets=gene_set_file,
            outdir='../test/enrichr_kegg',
            cutoff=1,
        )

        try:
            # Re-index by term so the column assignment aligns on pathway
            # name.
            enr.results.index = enr.results["Term"]
            nes_df[cluster] = enr.results["Adjusted P-value"]
        except KeyError:
            # Expected columns missing from the result: show what came back.
            print(enr.results.columns)

    return nes_df
def save_enrichment(x):
    """
    Run Enrichr on every cluster of six model outputs and save the hits.

    The list of human Enrichr libraries is dumped to ``gensets.txt`` and the
    library at position 53 of that list is used for all queries. For each of
    the six ``<x>/<model>.csv`` files, every cluster returned by
    ``read_file_2`` is queried; results are tagged with the model key and the
    cluster id, filtered to ``P-value < 0.05`` and written to
    ``enrich-cluster/full-results.xlsx``.

    x -- directory containing the six cluster CSV files.

    A cluster whose query fails is reported and skipped, so it contributes no
    rows to the output.
    """
    libraries = gp.get_library_name('Human')
    with open('gensets.txt', 'w') as f:
        f.writelines("%s %s\n" % (position, name)
                     for position, name in enumerate(libraries))
    # NOTE(review): position 53 depends on the order the service returns
    # libraries in; gensets.txt records which library that was for this run.
    lib = libraries[53]

    files = [(1, x + "/gcn-hom-hom.csv"), (2, x + "/gcn-hom-onto.csv"),
             (3, x + "/gcn-onto-onto.csv"), (4, x + "/gae-hom-hom.csv"),
             (5, x + "/gae-hom-onto.csv"), (6, x + "/gae-onto-onto.csv")]

    frames = []
    for key, file in files:
        print(file)
        cluster_data = read_file_2(file)
        for i in cluster_data:
            try:
                enr = gp.enrichr(gene_list=list(cluster_data[i][2]),
                                 gene_sets=lib,
                                 organism='Human',
                                 cutoff=0.05).results
            except Exception as err:
                # Skip this cluster instead of re-using a stale result.
                print("enrichr failed for %s cluster %s: %s" % (file, i, err))
                continue
            enr['model'] = key
            enr['cluster'] = i
            frames.append(enr)

    if frames:
        df = pd.concat(frames)
        df = df[df['P-value'] < 0.05]
    else:
        df = pd.DataFrame()

    # The context manager saves and closes the workbook even on error.
    with pd.ExcelWriter('enrich-cluster/full-results.xlsx') as writer:
        df.to_excel(writer, sheet_name="sheet1")
def enrichr_go_bp(symbols=None, cutoff=0.05):
    """
    Query Enrichr (GO_Biological_Process_2017b) and draw a terminal bar chart.

    symbols -- gene symbols to submit.
    cutoff  -- rows with "Adjusted P-value" below this value are kept.

    The first ten kept terms are charted by -log10(adjusted p). gseapy is
    given a scratch output directory, which is removed afterwards. If the
    query fails, a message is printed and the process exits with status 1.
    """
    dummy_directory = 'biopipe-enrichr-dummy'
    try:
        enrichr_result = gseapy.enrichr(
            gene_list=symbols,
            gene_sets='GO_Biological_Process_2017b',
            outdir=dummy_directory,
            no_plot=True)
    except Exception:
        print(
            'An error occurred during running enrichr. Please try again later.'
        )
        sys.exit(1)
    finally:
        # The directory may not exist if the query failed early; a missing
        # directory must not replace the exit above with another exception.
        shutil.rmtree(dummy_directory, ignore_errors=True)

    result_dataframe = enrichr_result.res2d
    significant = result_dataframe[
        result_dataframe['Adjusted P-value'] < cutoff]

    transformed_p_values = [
        -np.log10(p) for p in significant['Adjusted P-value'].values
    ]
    terms = significant['Term'].values
    data = list(zip(terms, transformed_p_values))[:10]

    terminal_bar_chart(data,
                       title='%s [-log10(p)]' % enrichr_result.gene_sets,
                       sort=True)
def __init__(self,
             data:H5COUNTS,
             path="data/interim/",
             threshold=0.05,
             gene_sets=['GO_Biological_Process_2018',
                        'GO_Cellular_Component_2018',
                        'GO_Molecular_Function_2018'],
             tumor_ids=[1, 2, 3, 4, 5, 6, 7, 8]):
    """
    Build ``self.gsea_table`` from per-tumor marker-gene files.

    For every tumor id, ``<path>MK_genes_TUMOR<id>.csv`` is read (it must
    have "cluster" and "gene" columns) and the unique genes of each cluster
    are sent to Enrichr. Adjusted P-values of the returned terms are stored
    in a table with one row per (tumor, cluster) and one column per term.

    data      -- object providing ``id2tumor`` (tumor id -> tumor name).
    path      -- directory prefix of the marker-gene CSV files.
    threshold -- keep only terms with "Adjusted P-value" below this value;
                 a falsy value disables the filter.
    gene_sets -- Enrichr libraries to query.
    tumor_ids -- tumor ids to process.
    """
    self.gsea_table = pd.DataFrame()
    self.data = data

    # Column label -> (tumor, cluster). Kept explicitly so the final index
    # does not depend on re-splitting labels on "_" (tumor names may contain
    # underscores, and an empty table has no string index to split).
    column_keys = {}

    for tumor_id in tumor_ids:
        de_genes_tumor_df = pd.read_csv(
            path + "MK_genes_TUMOR{}.csv".format(tumor_id))
        de_genes_by_cluster = de_genes_tumor_df.groupby(
            "cluster")["gene"].unique()
        tumor_name = data.id2tumor[tumor_id]
        print("Running GSEA for tumor", tumor_name)

        for cluster, genes in de_genes_by_cluster.items():
            tumor_cluster = tumor_name + "_" + str(cluster)
            column_keys[tumor_cluster] = (tumor_name, str(cluster))

            enr = gp.enrichr(gene_list=list(genes),
                             gene_sets=gene_sets,
                             no_plot=True,
                             cutoff=0.05)
            results = enr.results
            if threshold:
                results = results[results["Adjusted P-value"] < threshold]

            enr_results = results.set_index("Term")
            for geneset in enr_results.index:
                self.gsea_table.loc[geneset, tumor_cluster] = enr_results.loc[
                    geneset, "Adjusted P-value"]

    # Rows become (tumor, cluster); only labels that received at least one
    # term are present.
    self.gsea_table = self.gsea_table.T
    self.gsea_table.index = pd.MultiIndex.from_tuples(
        [column_keys[label] for label in self.gsea_table.index],
        names=["tumor", "cluster"])
def _enrichr(self, category, background=None, verbose=True):
    """
    Run gseapy.enrichr on a gene list and return the gseapy result object.

    category   -- either an explicit list of gene ids, or one of 'up',
                  'down', 'all' to pick the matching list from
                  ``self.rnadiff.gene_lists``.
    background -- background passed to gseapy; defaults to
                  ``self.background``.
    verbose    -- forwarded to gseapy.

    When ``self.mapper`` is set, ids are translated to the de-duplicated
    values of its 'name' column before the query.
    """
    if background is None:
        background = self.background

    if not isinstance(category, list):
        assert category in ['up', 'down', 'all']
        gene_list = list(self.rnadiff.gene_lists[category])
    else:
        gene_list = category

    if self.mapper is not None:
        logger.info("Input gene list of {} ids".format(len(gene_list)))
        identifiers = self.mapper.loc[gene_list]['name'].drop_duplicates(
        ).values
        logger.info("Mapped gene list of {} ids".format(len(identifiers)))
        gene_list = list(identifiers)

    return gseapy.enrichr(gene_list=gene_list,
                          gene_sets=self.gene_sets,
                          verbose=verbose,
                          background=background,
                          outdir="test",
                          no_plot=True)
def save_enrichment_set():
    """
    Collect the enriched term names of every cluster of six model outputs.

    Each ``enrich/<model>.csv`` file is loaded with ``read_file``; for every
    cluster, at most the first 1000 genes are sent to Enrichr (using the
    library at position 53 of the human library list). The resulting
    ``{"<model>-<cluster>": [terms]}`` mapping is written with ``write_file``
    to ``enrich-cluster/full_result_dic.csv``.

    A cluster whose query fails is reported and left out of the mapping.
    """
    # NOTE(review): position 53 depends on the order the service returns
    # libraries in.
    lib = gp.get_library_name('Human')[53]

    files = [("gcn-hom-hom", "enrich/gcn-hom-hom.csv"),
             ("gcn-hom-onto", "enrich/gcn-hom-onto.csv"),
             ("gcn-onto-onto", "enrich/gcn-onto-onto.csv"),
             ("gae-hom-hom", "enrich/gae-hom-hom.csv"),
             ("gae-hom-onto", "enrich/gae-hom-onto.csv"),
             ("gae-onto-onto", "enrich/gae-onto-onto.csv")]

    enrich_set = {}
    for key, file in files:
        print(file)
        cluster_data = read_file(file)
        for i in cluster_data:
            print(len(cluster_data[i][2]))
            try:
                enr = gp.enrichr(gene_list=list(cluster_data[i][2])[:1000],
                                 gene_sets=lib,
                                 organism='Human',
                                 cutoff=0.05).results
                terms = enr['Term'].to_list()
            except Exception as err:
                # Best effort: keep going, but do not hide the failure and do
                # not swallow KeyboardInterrupt/SystemExit.
                print("enrichment failed for %s-%s: %s" % (key, i, err))
                continue
            enrich_set[key + "-" + str(i)] = terms
            print(enr)

    write_file("enrich-cluster/full_result_dic.csv", enrich_set)
def vec_enrich(vec, gene_ids, quantile, gene_sets):
    """
    Enrichr analysis of the features whose score lies below a quantile.

    vec       -- numeric scores, one per entry of ``gene_ids``.
    gene_ids  -- array of Ensembl gene ids (a version suffix such as ".12"
                 is stripped).
    quantile  -- entries of ``vec`` strictly below this quantile of ``vec``
                 are selected.
    gene_sets -- Enrichr libraries to query.

    Selected ids are mapped to gene names through a Biomart query and the
    unique names are submitted to Enrichr. Returns the gseapy result object.
    """
    threshold = np.quantile(vec, quantile)
    ind = threshold > vec
    print("... {} features selected...".format(sum(ind)))

    # Drop everything from the first "." on (the Ensembl version suffix).
    version_suffix = re.compile("\\..*$")
    genes = [version_suffix.sub("", g) for g in gene_ids[ind]]

    print("Mapping to gene names...")
    bm_result = Biomart().query(
        dataset="hsapiens_gene_ensembl",
        attributes=[
            "ensembl_gene_id",
            "external_gene_name",
            "entrezgene_id",
            "go_id",
        ],
        filters={"ensembl_gene_id": genes},
    )
    gene_symbols = list(bm_result["external_gene_name"].unique())

    print("Calculating enrichment...")
    return gp.enrichr(
        gene_list=gene_symbols,
        gene_sets=gene_sets,
        organism="Human",
        cutoff=0.05,
    )
def enrichr_test(direction="pos", significance=True):
    """
    Run Enrichr on one tail of the module-level ranking table ``rnk``.

    direction    -- "pos" selects positive fold changes (or the first
                    ``top`` genes); anything else selects negative fold
                    changes (or the last ``top`` genes).
    significance -- True: select by fold-change sign and ``pval < alpha``;
                    False: select by position in ``rnk``.

    Reports are written under ``OUTPATH/enrichr_<direction>_<sur>/``, where
    ``sur`` is "sign05" or "top100". Relies on the module-level names
    ``rnk``, ``alpha``, ``top``, ``OUTPATH``, ``filename`` and ``gene_sets``.
    """
    genes = np.array(rnk["gene"])
    pvals = np.array(rnk["pval"])
    fcs = np.array(rnk["fc"])

    positive = direction == "pos"
    if significance:
        fold_mask = fcs > 0 if positive else fcs < 0
        hits = set(genes[np.logical_and(fold_mask, pvals < alpha)])
        sur = "sign05"
    else:
        hits = set(genes[:top] if positive else genes[-top:])
        sur = "top100"

    outpath = OUTPATH + "/enrichr_%s_%s/" % (direction, sur)
    if not os.path.exists(outpath):
        os.mkdir(outpath)

    gseapy.enrichr(
        gene_list=list(hits),
        description="%s_%s_%s" % (filename, direction, sur),
        gene_sets=gene_sets,
        outdir=outpath,
        cutoff=0.5,
    )
def get_ontology_df(
        topic,
        cutoff=0.05,
        threshhold=5e-1,
        gene_sets=[
            'GO_Molecular_Function_2018', 'GO_Biological_Process_2018',
            'GO_Cellular_Component_2018', 'Human_Phenotype_Ontology',
            'GTEx_Tissue_Sample_Gene_Expression_Profiles_up',
            'GTEx_Tissue_Sample_Gene_Expression_Profiles_down',
            'Tissue_Protein_Expression_from_Human_Proteome_Map',
            'KEGG_2019_Human', 'NCI-60_Cancer_Cell_Lines'
        ],
        background=None):
    """
    Query Enrichr for a gene list and return the significant terms.

    :param topic: list of genes; entries whose string form is 'nan' are
        dropped
    :param cutoff: cutoff forwarded to the enrichr call
    :param threshhold: keep rows with Adjusted P-value below this value
    :param gene_sets: library names, sent as one comma-separated string
    :param background: enrichment test background; defaults to
        'hsapiens_gene_ensembl'
    :return: DataFrame with columns Term, Adjusted P-value, Gene_set
    """
    sets = ','.join(gene_sets)
    if background is None:
        background = 'hsapiens_gene_ensembl'

    genes = [g for g in topic if str(g) != 'nan']
    table = gs.enrichr(genes,
                       gene_sets=sets,
                       cutoff=cutoff,
                       background=background).results

    significant = table['Adjusted P-value'] < threshhold
    return table[significant][['Term', 'Adjusted P-value', 'Gene_set']]
def perform_GO_enrichment_analysis(inputGenes, geneDescription, threshold):
    """
    Run Enrichr (GO_Biological_Process_2015) and save the report.

    inputGenes      -- gene list to analyse.
    geneDescription -- description forwarded to gseapy.
    threshold       -- cutoff forwarded to gseapy.

    Output goes to ``./gene_set_enrichment_analysis``, created if missing.
    Returns None.
    """
    # Other libraries that have been used here: "KEGG_2016", "Reactome_2013",
    # "GO_Molecular_Function_2015", "WikiPathways_2013".
    targetGeneSet = "GO_Biological_Process_2015"

    out_dir = './gene_set_enrichment_analysis'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    enr = gp.enrichr(gene_list=inputGenes,
                     description=geneDescription,
                     gene_sets=targetGeneSet,
                     outdir=out_dir,
                     cutoff=threshold)
    # Preview of the result table; the value is not used or returned.
    enr.res2d.head()
def corr_rep_gene(adata,
                  rep,
                  prefix='',
                  dims=[0,1],
                  offset=1,
                  layer=None,
                  thresh=0.5,
                  out='./',
                  gsets=['GO_Biological_Process_2018', 'KEGG_2019_Human',
                         'WikiPathways_2019_Human']):
    """
    Correlate genes with components of a representation and run Enrichr.

    For every component index in ``dims`` the table from ``corr_comp_gene``
    is saved twice (as returned, and sorted by 'R' descending). Genes with
    R > thresh ("pos") and genes with R < -thresh ("neg") are then each sent
    to Enrichr, provided the group is not empty.

    Use gseapy.get_library_name() to see more gene set option
    """
    for i in dims:
        corr = corr_comp_gene(adata, rep, i, offset=offset, layer=layer)
        component = i + 1  # file names use 1-based component numbers

        corr.to_csv('{}{}_{}{}_corr_gene.csv'.format(out, rep, prefix,
                                                     component))
        corr.sort_values('R', ascending=False).to_csv(
            '{}{}_{}{}_corr_gene_sorted.csv'.format(out, rep, prefix,
                                                    component))

        by_sign = {
            'pos': corr.loc[corr['R'] > thresh, :],
            'neg': corr.loc[corr['R'] < -thresh, :],
        }
        for sign, subset in by_sign.items():
            if len(subset) > 0:
                gseapy.enrichr(gene_list=list(subset.index),
                               gene_sets=gsets,
                               outdir='{}{}{}_{}'.format(
                                   out, prefix, component, sign))
    return
def calcu_gsea(gene_list, gmt, bg):
    """
    Run gseapy.enrichr against a custom gene-set collection.

    gene_list -- iterable of genes (converted to a list).
    gmt       -- object whose ``term_set`` attribute is passed as gene_sets.
    bg        -- background passed to gseapy.

    Nothing is written to disk (``outdir=None``). Returns the ``results``
    table of the gseapy result object.
    """
    query = dict(
        gene_list=list(gene_list),
        description='test_name',
        gene_sets=gmt.term_set,
        background=bg,
        outdir=None,
        cutoff=0.5,
        verbose=True,
    )
    return gp.enrichr(**query).results
def gsea(homepath):
    '''
    Run a DisGeNET enrichment on the selected genes and plot the result.

    Parameters
    ----------
    `homepath` (str): Directory that contains
        ``std_npy/unique_genes_with_frequency.csv`` and that receives all
        generated files.

    Return:
    -------
    None

    Outputs:
    --------
    Creates ``<homepath>/enrichr`` if needed and writes the enrichr output
    plus ``DisGeNET_barplot.png`` and ``DisGeNET_dotplot.png`` into it.
    '''
    warnings.filterwarnings("ignore")

    path = os.path.join(homepath, "enrichr")
    if not os.path.exists(path):
        os.mkdir(path)

    gene = pd.read_csv(homepath +
                       "/std_npy/unique_genes_with_frequency.csv",
                       header=None)
    # First column of the header-less file holds the gene names.
    gl = [gene[0][row] for row in range(len(gene))]

    enr = gs.enrichr(gene_list=gl,
                     description='Disease',
                     gene_sets='DisGeNET',
                     outdir=homepath + '/enrichr')

    from gseapy.plot import barplot, dotplot

    # ``ofname`` must be set for the figures to be saved.
    barplot(enr.res2d,
            title='DisGeNET',
            cutoff=0.2,
            ofname=homepath + '/enrichr/DisGeNET_barplot.png')
    dotplot(enr.res2d,
            title='DisGeNET',
            cmap='viridis_r',
            cutoff=0.2,
            ofname=homepath + '/enrichr/DisGeNET_dotplot.png')
def enrichment_GO(gene_list,
                  go_mode='Bio',
                  organism='Human',
                  description='test_name',
                  outdir='enrichment_go',
                  cutoff=0.5):
    '''
    Gene enrichment analysis of GO

    Parameters
    ----------
    gene_list:list
        The gene set to be enrichment analyzed
    go_mode:str
        The module of GO include:'Bio','Cell','Mole'
    organism:str
        Select from (human, mouse, yeast, fly, fish, worm)
    description:str
        The title of enrichment
    outdir:str
        The savedir of enrichment
    cutoff:float
        Show enriched terms which Adjusted P-value < cutoff.

    Returns
    ----------
    result:pandas.DataFrame
        stores your last query

    Raises
    ----------
    ValueError
        If go_mode is not one of 'Bio', 'Cell', 'Mole'.
    '''
    libraries = {
        'Bio': 'GO_Biological_Process_2018',
        'Cell': 'GO_Cellular_Component_2018',
        'Mole': 'GO_Molecular_Function_2018',
    }
    # Validate before any query is sent, so a typo fails fast with a clear
    # message.
    try:
        geneset = libraries[go_mode]
    except KeyError:
        raise ValueError("go_mode must be one of %s, got %r" %
                         (sorted(libraries), go_mode)) from None

    enr = gp.enrichr(
        gene_list=gene_list,
        gene_sets=geneset,
        organism=organism,
        description=description,
        outdir=outdir,
        cutoff=cutoff,
    )
    subp = dotplot(enr.res2d, title=description, cmap='seismic')
    print(subp)
    return enr.res2d
def main():
    """
    Command-line entry point: Enrichr analysis of a DE gene table.

    Usage: ``<script> [-o OUT_FILE] DE_GENES_FILE``

    DE_GENES_FILE is a tab-separated table whose first column (the index)
    holds the gene names. Every collection in the module-level ``GENE_SETS``
    mapping is queried; terms with an adjusted p-value below the module-level
    ``GSEA_THRESH`` are collected, sorted by adjusted p-value and written as
    a tab-separated table to OUT_FILE.
    """
    usage = "%prog [options] DE_GENES_FILE"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", "--out_file", help="OUTPUT file")
    (options, args) = parser.parse_args()
    if not args:
        parser.error("missing DE_GENES_FILE argument")

    de_genes_f = args[0]
    out_f = options.out_file

    # Get all the DE genes
    de_genes_df = pd.read_csv(de_genes_f, sep='\t', index_col=0)
    gene_list = [x.strip() for x in de_genes_df.index]

    # Perform gene set enrichment
    db_to_gene_sets = {}
    for db_name, db in GENE_SETS.items():
        enr = gp.enrichr(gene_list=gene_list,
                         gene_sets=[db],
                         background=19463,
                         no_plot=True,
                         cutoff=0.05)
        significant = enr.results[
            enr.results["Adjusted P-value"] < GSEA_THRESH]
        # Read the two columns by name; positional access on a labelled row
        # is deprecated in pandas.
        db_to_gene_sets[db_name] = {
            str(term): float(adj_p)
            for term, adj_p in zip(significant['Term'],
                                   significant['Adjusted P-value'])
        }

    # Create final dataframe
    da = [(db, gene_set, pval)
          for db, gene_set_to_pval in db_to_gene_sets.items()
          for gene_set, pval in gene_set_to_pval.items()]
    df = pd.DataFrame(data=da,
                      columns=['collection', 'gene_set', 'adjusted_p_value'])
    df = df.sort_values(by='adjusted_p_value', axis=0)
    print('{} total enriched gene sets.'.format(len(df)))

    # Write output
    print('Writing to {}.'.format(out_f))
    df.to_csv(out_f, index=False, sep='\t')
    print('done')
def enrichr(self, gene_dict, gene_set, key, reg):
    """Perform enrichr analysis for one sample group.

    gene_dict -- mapping of sample group -> enriched gene list.
    gene_set  -- name of the GO term library to query.
    key       -- sample group to analyse (a key of gene_dict).
    reg       -- 'upreg' or 'downreg'; used in the report description.

    Returns a copy of the enrichr result table, or None when the group's
    gene list is empty (no query is sent in that case). Output files go to
    ``<self.input_dir>enrichr/<key>/``.
    """
    genes = gene_dict[key]
    if not genes:
        # Nothing to analyse for this group.
        return None

    # If no genes are enriched at the cutoff level, enrichr will not
    # generate an output.
    enr = gp.enrichr(gene_list=genes,
                     gene_sets=[gene_set],
                     organism='Human',
                     description=key + "_" + reg,
                     outdir=self.input_dir + 'enrichr/' + key + "/",
                     cutoff=0.1)
    return enr.results.copy()
def run_enrichr(gene_list, gene_sets):
    '''
    Submit a human gene list to Enrichr.

    gene_list: List
        List containing genes names used for the analysis
    gene_sets: List
        List of enrichr gene libraries to use

    Returns
    enr: Enrichr object
        Analysis output, use "enr.results" to print table of results
    '''
    query = dict(
        gene_list=gene_list,
        description='pathway',
        gene_sets=gene_sets,
        organism='Human',
        cutoff=0.5,
    )
    return gp.enrichr(**query)
def enrichment_KEGG(gene_list,
                    gene_sets=['KEGG_2019_Human'],
                    organism='Human',
                    description='test_name',
                    outdir='enrichment_kegg',
                    cutoff=0.5):
    '''
    Gene enrichment analysis of KEGG

    Parameters
    ----------
    gene_list:list
        The gene set to be enrichment analyzed
    gene_sets:list
        Enrichr libraries to query
        (https://maayanlab.cloud/Enrichr/#stats)
    organism:str
        Select from (human, mouse, yeast, fly, fish, worm)
    description:str
        The title of enrichment
    outdir:str
        The savedir of enrichment
    cutoff:float
        Show enriched terms which Adjusted P-value < cutoff.

    Returns
    ----------
    res:pandas.DataFrame
        stores your last query
    '''
    query = dict(
        gene_list=gene_list,
        gene_sets=gene_sets,
        organism=organism,  # set this to the organism being analysed
        description=description,
        outdir=outdir,
        cutoff=cutoff,
    )
    enr = gp.enrichr(**query)

    # Draw the dot plot of the result table and echo whatever dotplot returns.
    subp = dotplot(enr.res2d, title=description, cmap='seismic')
    print(subp)
    return enr.res2d
def save_top4_csvs():
    """
    Save the top enriched terms of every cluster, one output file per input.

    For each name in the module-level ``files``, ``enrich/<name>.csv`` is
    loaded with ``read_file``; every cluster's genes are sent to Enrichr
    (module-level library ``lib``) and the set of terms from the first ten
    result rows is stored per cluster. The mapping is written with
    ``write_file`` to ``top_4_bio_process/<name>.csv``.

    A cluster whose query fails is reported and left out of the output.
    """
    for i in files:
        clusters = read_file("enrich/" + i + ".csv")
        temp = {}
        for j in clusters:
            gene_list = list(clusters[j][2])
            try:
                top_terms = gp.enrichr(
                    gene_list=gene_list,
                    gene_sets=lib,
                    organism='Human',
                    cutoff=0.05).results.head(10)['Term'].tolist()
            except Exception as err:
                # Best effort: keep going, but do not hide the failure and do
                # not swallow KeyboardInterrupt/SystemExit.
                print("enrichment failed for %s cluster %s: %s" % (i, j, err))
                continue
            temp[j] = set(top_terms)
        write_file("top_4_bio_process/" + i + ".csv", temp)
def enrich_tissue(tissue_name, last_num):
    """
    Run Reactome enrichment on every Louvain community of a tissue.

    tissue_name -- selects ``results/louvain_modules_<tissue>.pkl`` (community
                   labels; element 0 is used) and ``data/corr_<tissue>.pkl``
                   (its columns are the gene names).
    last_num    -- communities numbered below this value are skipped, which
                   allows resuming an interrupted run.

    Communities with three or fewer members are ignored and do not consume a
    number. For each queried community, rows with adjusted p-value < 0.05 are
    written to
    ``results/enrichment/<tissue>_<number>_<size>.csv`` when any remain.
    The function pauses 50 seconds after every query.
    """
    # NOTE: pickle executes arbitrary code on load; only use files produced
    # by this pipeline.
    with open("results/louvain_modules_" + tissue_name + ".pkl",
              "rb") as handle:
        communities = pickle.load(handle)
    corr_mat = pd.read_pickle("data/corr_" + tissue_name + ".pkl")
    gene_names = np.array(corr_mat.columns)

    community_id = 1
    for community in np.unique(communities[0]):
        common = gene_names[communities[0] == community]
        if len(common) <= 3:
            continue

        print("For community", community_id,
              "(len: " + str(len(common)) + ")...")

        if community_id < last_num:
            community_id += 1
            continue

        # NOTE(review): astype('<U3') truncates every name to its first three
        # characters before the query -- confirm this is intended.
        enr = gp.enrichr(gene_list=list(common.astype('<U3')),
                         organism='human',
                         description=tissue_name + "_" + str(community_id),
                         gene_sets='Reactome_2016',
                         cutoff=0.05,
                         outdir='results/EnrichClass')

        if enr.results.shape[0] > 0:
            enr.results = enr.results[enr.results['Adjusted P-value'] < 0.05]
            if enr.results.shape[0] > 0:
                enr.results.to_csv("results/enrichment/" + tissue_name + "_" +
                                   str(community_id) + "_" +
                                   str(len(common)) + ".csv")
                print("Enriched!")

        community_id += 1
        # Go easy on the Enrichr API: errors appear after many rapid queries.
        sleep(50)
def _get_top_enrichr_term(gene_sets,
                          libraries=[
                              # Other libraries that can be enabled here:
                              # 'GO_Biological_Process_2018',
                              # 'GO_Cellular_Component_2018',
                              # 'GO_Molecular_Function_2018',
                              'KEGG_2019_Human',
                          ],
                          cutoff=0.01,
                          top_k=1):
    """
    Return the best Enrichr terms for each gene list.

    gene_sets -- iterable of gene lists; each is queried separately.
    libraries -- Enrichr libraries to query.
    cutoff    -- cutoff forwarded to gseapy.
    top_k     -- number of rows (lowest "Adjusted P-value" first) kept per
                 gene list.

    Returns the kept rows concatenated into one DataFrame, or None when no
    query produced any rows. A gene list whose query fails is skipped.
    """
    results = []
    for gene_set in gene_sets:
        # Only the remote query is best-effort; errors in the table handling
        # below must surface instead of silently yielding None.
        try:
            enr = gp.enrichr(gene_list=gene_set,
                             gene_sets=libraries,
                             cutoff=cutoff,
                             no_plot=True,
                             verbose=False)
        except Exception:
            continue

        if enr.results.shape[0] > 0:
            results.append(
                enr.results.sort_values(by="Adjusted P-value").head(top_k))

    if results:
        return pd.concat(results)
    return None
import gseapy as gp
import pandas as pd
import RNA_expression_processing as rn
import matplotlib.pyplot as plt

# Script: Enrichr analysis of the genes listed in DN_RegNetwork_TLX3.csv
# against one Enrichr library, shown as a bar plot.

# Directory holding the input table.
path = 'tracks/MARGE/relativeRP/'

# The table must provide a 'gene_name' column.
dt = pd.read_csv(path + 'DN_RegNetwork_TLX3.csv')
gl = list(dt['gene_name'])

# Enrichr library to query; also used as the plot title.
gs = 'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO'
rs = gp.enrichr(gene_list=gl, gene_sets=gs)

# Plot the result table with the project's own barplot helper and display it.
rn.barplot(rs.res2d, ttl=gs)
plt.show()
tss = set(tlx_tss_3kb_gn) enh = set(tlx_chrhmm_gn) #~ from matplotlib_venn import venn2 #~ venn2([tss,enh], set_labels = ('Tlx3 in Enhancer', 'Tlx3 in Promoter')) tss_and_enh = tss & enh tss_or_enh = tss | enh tss_notin_enh = tss - enh enh_notin_tss = enh - tss diff_enh_tss = enh ^ tss # === Run GSEApy Enrichr gp.enrichr(gene_list=list(tss_and_enh), description='tss_and_enh', gene_sets='GO_Biological_Process_2017b', outdir='gene_lists/tlx_enh_tss3k/' + 'tss_and_enh_GO_BP_2017b') gp.enrichr(gene_list=list(tss_or_enh), description='tss_or_enh', gene_sets='GO_Biological_Process_2017b', outdir='gene_lists/tlx_enh_tss3k/' + 'tss_or_enh_GO_BP_2017b') gp.enrichr(gene_list=list(tss_notin_enh), description='tss_notin_enh', gene_sets='GO_Biological_Process_2017b', outdir='gene_lists/tlx_enh_tss3k/' + 'tss_notin_enh_GO_BP_2017b') gp.enrichr(gene_list=list(enh_notin_tss), description='enh_notin_tss', gene_sets='GO_Biological_Process_2017b',
for x in pd.DataFrame.from_records(r).columns: print("group :"+x, end = "\r") # rank gene by importance for clusters glist = pd.DataFrame.from_records(r)[x].tolist() bm = Biomart() if not os.path.exists("test"): os.makedirs("test") results = bm.query(dataset='hsapiens_gene_ensembl', attributes=['external_gene_name', 'go_id'], filters={'hgnc_symbol': glist}, # save output file filename="test/query_"+x+".results.txt") enr = gp.enrichr(gene_list=glist, description='test_name', gene_sets=['KEGG_2016'], outdir="test/enrichr_kegg_group"+x, cutoff=0.5 # test dataset, use lower value from range(0,1) ) # to save your figure, make sure that ``ofname`` is not None #barplot(enr.res2d,title='',ofname="test/enrichr_kegg_group"+x+"/bar_plot.pdf") dotplot(enr.res2d, title='',ofname="test/enrichr_kegg_group"+x+"/dot_plot.pdf") #pd.DataFrame.from_records(adata.uns['rank_genes_groups']['scores'])[x] rnk= pd.concat([pd.DataFrame.from_records(r)[x],pd.DataFrame.from_records(adata.uns['rank_genes_groups']['scores'])[x]],axis=1) rnk.columns=[0,1] pre_res = gp.prerank(rnk=rnk, gene_sets='KEGG_2016', processes=4,
def main():
    """
    Command-line entry point: filter DE genes, then run Enrichr.

    Usage: ``<script> [options] DE_GENES_FILE FOLD_CHANGE_FILE``

    Steps:
      1. Read DE_GENES_FILE (one gene per line).
      2. If the module-level ``FILTER_BY_FOLD`` is set, read FOLD_CHANGE_FILE
         (tab-separated, genes as index, column 'FC') and keep genes with
         FC > FC_THRESH or FC < 1 / FC_THRESH. Otherwise the DE genes are
         used as they are.
      3. If ``-t`` is given, drop genes whose ``-c`` column equals 1 in that
         table (batch-effect genes); write the kept genes to ``-k`` and a
         Venn diagram to ``-v`` when those options are set.
      4. Query every collection in the module-level ``GENE_SETS`` and write
         the terms with adjusted p-value below ``GSEA_THRESH`` to ``-o``.

    If no gene survives filtering, only the header line is written to ``-o``.
    """
    usage = "%prog [options] DE_GENES_FILE FOLD_CHANGE_FILE"
    parser = OptionParser(usage=usage)
    parser.add_option("-t", "--de_table_f", help="DE table file")
    parser.add_option("-c", "--de_table_col", help="DE table column to filter")
    parser.add_option("-o", "--out_file", help="Output file")
    parser.add_option("-k", "--out_kept", help="Output kept genes file")
    parser.add_option("-v", "--out_venn_f", help="Venn diagram file")
    (options, args) = parser.parse_args()
    if len(args) < 2:
        parser.error("expected DE_GENES_FILE and FOLD_CHANGE_FILE")

    de_genes_f = args[0]
    ec_f = args[1]
    out_f = options.out_file
    out_kept_f = options.out_kept
    out_venn_f = options.out_venn_f

    def write_header_only():
        # Output for the "nothing left to analyse" case.
        with open(out_f, 'w') as f:
            f.write('collection\tgene_set\tadjusted_p_value')

    # Get all the DE genes
    with open(de_genes_f, 'r') as f:
        de_genes = [l.strip() for l in f]
    print('{} total DE genes'.format(len(de_genes)))

    # Without fold-change filtering the DE genes are analysed directly, so
    # the later steps always have a gene list to work on.
    filtered_genes = list(de_genes)
    n_candidates = len(de_genes)

    # Map each gene to its fold change and filter by fold change
    if FILTER_BY_FOLD:
        ec_df = pd.read_csv(ec_f, sep='\t', index_col=0)
        gene_to_fc = ec_df['FC'].to_dict()
        n_candidates = len(gene_to_fc)
        filtered_genes = [
            gene for gene, fc in gene_to_fc.items()
            if fc > FC_THRESH or fc < (1 / FC_THRESH)
        ]
        print('{}/{} remain after filtering by fold-change.'.format(
            len(filtered_genes), n_candidates))
        if len(filtered_genes) == 0:
            write_header_only()
            return

    # Filter batch effect if specified in options
    batch_effect_genes = None
    present_batch_effect_genes = None
    if options.de_table_f:
        de_table_col = options.de_table_col
        de_table_df = pd.read_csv(options.de_table_f, sep='\t', index_col=0)
        batch_effect_genes = set(
            de_table_df.loc[de_table_df[de_table_col] == 1][de_table_col].index)

        # Output the removed genes
        present_batch_effect_genes = batch_effect_genes & set(filtered_genes)
        print('Removed {} genes: {}'.format(len(present_batch_effect_genes),
                                            present_batch_effect_genes))
        filtered_genes = sorted(set(filtered_genes) - batch_effect_genes)
        if out_kept_f:
            with open(out_kept_f, 'w') as f:
                f.write('\n'.join(filtered_genes))
        print('{}/{} remain after filtering by batch-effect:'.format(
            len(filtered_genes), n_candidates))
        print(filtered_genes)

    if len(filtered_genes) == 0:
        write_header_only()
        return

    # Draw Venn diagram (needs the batch-effect set and an output path)
    if batch_effect_genes is not None and out_venn_f:
        if 'Up' in de_genes_f:
            title = r'DE genes $\bf{higher}$ in COVID-19 ICU patients'
        else:
            title = r'DE genes $\bf{lower}$ in COVID-19 ICU patients'
        fig, ax = plt.subplots(1, 1, figsize=(4.5, 3.5))
        # NOTE(review): the second size is the full batch-effect set, which
        # still contains the genes counted in the third (overlap) entry --
        # confirm against the subset convention of the venn2 in use.
        venn2(subsets=(
            len(filtered_genes),
            len(batch_effect_genes),
            len(present_batch_effect_genes),
        ),
              set_labels=('COVID-19 ICU\nvs. sepsis ARDS',
                          'non-ICU\nvs. sepsis non-ARDS'),
              set_colors=('b', 'y'),
              alpha=0.5,
              ax=ax)
        plt.gca().set_title(title, fontsize=14)
        plt.tight_layout()
        plt.savefig(out_venn_f, format='pdf')

    # Perform gene set enrichment
    gene_list = [x.strip() for x in filtered_genes]
    db_to_gene_sets = {}
    for db_name, db in GENE_SETS.items():
        enr = gp.enrichr(gene_list=gene_list,
                         gene_sets=[db],
                         background=19463,
                         no_plot=True,
                         cutoff=0.05)
        significant = enr.results[
            enr.results["Adjusted P-value"] < GSEA_THRESH]
        # Read the two columns by name; positional access on a labelled row
        # is deprecated in pandas.
        db_to_gene_sets[db_name] = {
            str(term): float(adj_p)
            for term, adj_p in zip(significant['Term'],
                                   significant['Adjusted P-value'])
        }

    # Create final dataframe
    da = [(db, gene_set, pval)
          for db, gene_set_to_pval in db_to_gene_sets.items()
          for gene_set, pval in gene_set_to_pval.items()]
    df = pd.DataFrame(data=da,
                      columns=['collection', 'gene_set', 'adjusted_p_value'])
    df = df.sort_values(by='adjusted_p_value', axis=0)
    print('{} total enriched gene sets.'.format(len(df)))

    # Write output
    print('Writing to {}.'.format(out_f))
    df.to_csv(out_f, index=False, sep='\t')
    print('done')