def get_gene_list(self, samples_stat):
    """Run g:Profiler for every sample that has genes and write the results.

    For each key ``sample`` of ``samples_stat`` the gene collection is read
    from ``samples_stat[sample]['gene']``.  Samples with no genes are
    skipped.  For the others, files are written under
    ``{self.module}/{sample}/``:

    * ``GO_FuncTerm.csv`` / ``KEGG_FuncTerm.csv`` -- rows of the profile
      result whose ``native`` column contains ``GO`` / ``KEGG``; each subset
      is also passed to ``self.plot_go``.
    * ``Entrez_Gene_converted.csv`` -- output of ``gp.convert`` with
      ``target_namespace='ENTREZGENE_ACC'``.
    * ``gene_list.txt`` -- the queried genes, one per line.

    :param samples_stat: mapping of sample name to a mapping that has a
        ``'gene'`` entry.
    :return: None
    """
    for sample in samples_stat:
        gene = samples_stat[sample]['gene']
        if len(gene) == 0:
            # Nothing to query for this sample.
            continue

        gp = GProfiler(user_agent='ExampleTool', return_dataframe=True)
        enriched = gp.profile(organism='hsapiens', query=gene)
        prefix = '{module}/{sample}/'.format(module=self.module,
                                             sample=sample)

        # Same treatment for both term families: filter, dump, plot.
        for label in ('GO', 'KEGG'):
            subset = enriched[enriched['native'].str.contains(label)]
            subset.to_csv(prefix + label + '_FuncTerm.csv',
                          header=True, index=False, sep=',')
            self.plot_go(subset, sample, label)

        converted = gp.convert(organism='hsapiens', query=gene,
                               target_namespace='ENTREZGENE_ACC')
        converted.to_csv(prefix + 'Entrez_Gene_converted.csv',
                         header=True, index=False, sep=',')

        with open(prefix + 'gene_list.txt', 'wt') as handle:
            handle.write('\n'.join(gene))
def run_gprofiler(inputfile, theargs):
    """
    Query g:Profiler with the genes from ``inputfile`` and write the name of
    the best-ranked term to standard output.

    The text returned by ``read_inputfile`` is stripped of surrounding commas
    and newlines and split on ``,``.  Result rows get a ``Jaccard`` column
    computed from ``precision`` and ``recall`` and are ranked by Jaccard
    (descending), then ``p_value`` (ascending).  If no rows come back,
    ``No terms found`` is written to standard error instead.

    :param inputfile: passed unchanged to ``read_inputfile``
    :param theargs: object providing ``organism`` and ``maxpval`` attributes
    :return: 0 in every case
    """
    raw_genes = read_inputfile(inputfile)
    gp = GProfiler(return_dataframe=True)
    gene_list = raw_genes.strip(',').strip('\n').split(',')

    hits = gp.profile(query=gene_list,
                      organism=theargs.organism,
                      user_threshold=theargs.maxpval)
    if hits.shape[0] == 0:
        sys.stderr.write('No terms found\n')
        return 0

    # Jaccard index expressed through precision and recall:
    # 1 / (1/precision + 1/recall - 1)
    inv_precision = 1.0 / hits['precision']
    inv_recall = 1.0 / hits['recall']
    hits['Jaccard'] = 1.0 / (inv_precision + inv_recall - 1)

    ranked = hits.sort_values(
        ['Jaccard', 'p_value'], ascending=[False, True]
    ).reset_index(drop=True)

    sys.stdout.write(ranked['name'][0])
    return 0
def run_gprofiler(inputfile, theargs):
    """
    Query g:Profiler with the genes from ``inputfile`` and write the
    best-ranked term to standard output as a JSON object.

    The text returned by ``read_inputfile`` is stripped of surrounding commas
    and newlines and split on ``,``.  Result rows get a ``Jaccard`` column
    computed from ``precision`` and ``recall`` and are ranked by Jaccard
    (descending), then ``p_value`` (ascending).  The JSON object holds the
    ``name``, ``source``, ``p_value``, ``description`` and ``intersections``
    values of the first ranked row.  If no rows come back,
    ``No terms found`` is written to standard error instead.

    :param inputfile: passed unchanged to ``read_inputfile``
    :param theargs: object providing ``organism`` and ``maxpval`` attributes
    :return: 0 in every case
    """
    raw_genes = read_inputfile(inputfile)
    gp = GProfiler(return_dataframe=True)
    gene_list = raw_genes.strip(',').strip('\n').split(',')

    # no_evidences=False: presumably what makes the 'intersections' column
    # read below available -- confirm against the gprofiler docs.
    hits = gp.profile(query=gene_list,
                      organism=theargs.organism,
                      user_threshold=theargs.maxpval,
                      no_evidences=False)
    if hits.shape[0] == 0:
        sys.stderr.write('No terms found\n')
        return 0

    # Jaccard index expressed through precision and recall:
    # 1 / (1/precision + 1/recall - 1)
    inv_precision = 1.0 / hits['precision']
    inv_recall = 1.0 / hits['recall']
    hits['Jaccard'] = 1.0 / (inv_precision + inv_recall - 1)

    ranked = hits.sort_values(
        ['Jaccard', 'p_value'], ascending=[False, True]
    ).reset_index(drop=True)

    reported = ('name', 'source', 'p_value', 'description', 'intersections')
    theres = {column: ranked[column][0] for column in reported}
    json.dump(theres, sys.stdout)
    sys.stdout.flush()
    return 0
def run_gProfiler(comp, org):
    """Profile the identifiers found in ``comp["composite"]`` with g:Profiler.

    Each distinct composite name is split on ``|`` and its second field is
    used as the query identifier.

    :param comp: object indexable by ``"composite"``, yielding strings that
        contain at least one ``|``
    :param org: organism id passed to ``gp.profile`` (example: ``hsapiens``)
    :return: whatever ``gp.profile`` returns; the client is created with
        ``return_dataframe=True``
    """
    # return_dataframe: return pandas dataframe or plain python structures
    gp = GProfiler(return_dataframe=True)

    unique_names = set(comp["composite"])
    list_id = [name.split('|')[1] for name in unique_names]

    # NOTE(review): "GO" and "REACTOME" are not among the source ids listed
    # in the g:Profiler docs (GO:BP/GO:MF/GO:CC, REAC) -- confirm that the
    # service accepts them.
    return gp.profile(organism=org,
                      domain_scope="annotated",
                      sources=["GO", "KEGG", "REACTOME"],
                      query=list_id)
def pathway_enrich_genes(genes, databases):
    """Return the ten most significant g:Profiler terms for ``genes``.

    The query runs against ``mmusculus`` with an FDR threshold of 0.05 and
    ``no_evidences=False``.  The result is indexed by ``native``, ordered by
    ``p_value`` and cut down to the columns at positions 1, 2, 5, 10 and 13.

    Side effect: sets the pandas option ``display.max_colwidth`` to 800.

    :param genes: query passed to ``gp.profile``
    :param databases: passed as ``sources`` to ``gp.profile``
    :return: at most ten rows of the reduced result
    """
    client = GProfiler(return_dataframe=True, user_agent='g:GOSt')
    raw = client.profile(organism='mmusculus',
                         sources=databases,
                         user_threshold=0.05,
                         significance_threshold_method='fdr',
                         query=genes,
                         no_evidences=False)

    by_term = raw.set_index('native')
    ranked = by_term.sort_values('p_value')
    # Positional selection: depends on the column order g:Profiler returns.
    selected = ranked.iloc[:, [1, 2, 5, 10, 13]]

    pd.set_option("display.max_colwidth", 800)
    return selected.iloc[:10, :]
def pathway_enrich(genes, databases):
    """Return the ten most significant g:Profiler terms for ``genes``.

    The query runs against ``mmusculus`` with an FDR threshold of 0.05 and
    ``domain_scope='annotated'``.  The result is indexed by ``native``,
    ordered by ``p_value`` and cut down to the columns at positions 2, 5, 7,
    10 and 1 (in that order).

    Side effect: sets the pandas option ``display.max_colwidth`` to 800.

    :param genes: query passed to ``gp.profile``
    :param databases: passed as ``sources`` to ``gp.profile``
    :return: at most ten rows of the reduced result
    """
    client = GProfiler(return_dataframe=True, user_agent='g:GOSt')
    raw = client.profile(organism='mmusculus',
                         sources=databases,
                         user_threshold=0.05,
                         significance_threshold_method='fdr',
                         domain_scope='annotated',
                         # background= 10000,
                         query=genes)

    by_term = raw.set_index('native')
    ranked = by_term.sort_values('p_value')
    # Positional selection: depends on the column order g:Profiler returns.
    selected = ranked.iloc[:, [2, 5, 7, 10, 1]]

    pd.set_option("display.max_colwidth", 800)
    return selected.iloc[:10, :]
def Functional_profiling(gene_list,
                         organism='hsapiens',
                         sources=[
                             "GO:MF", "GO:CC", "GO:BP", "KEGG", "REAC", "WP",
                             "TF", "MIRNA", "HPA", "CORUM", "HP"
                         ],
                         user_threshold=0.05):
    """Run a g:Profiler functional profile for ``gene_list``.

    The request is made with ``no_iea=True``.

    :param gene_list: query passed to ``gp.profile``
    :param organism: organism id, ``'hsapiens'`` by default
    :param sources: source ids forwarded to ``gp.profile``; the default list
        is only read here, never modified
    :param user_threshold: threshold forwarded to ``gp.profile``
    :return: the result of ``gp.profile``; the client is created with
        ``return_dataframe=True``
    """
    client = GProfiler(return_dataframe=True)
    return client.profile(query=gene_list,
                          organism=organism,
                          user_threshold=user_threshold,
                          no_iea=True,
                          sources=sources)
def execute(self):
    """Run g:Profiler on the markers of every sheet and save the result.

    Reads all sheets of ``self.args["<markers_spreadsheet>"]``, takes the
    first ``--max_genes`` entries of each sheet's ``feature`` column as one
    named query, profiles all queries in a single ``gp.profile`` call and
    writes the result to ``self.args['<output_spreadsheet>']`` without the
    index.
    """
    # sheet_name=None: every sheet is loaded, keyed by sheet name.
    sheets = pd.read_excel(self.args["<markers_spreadsheet>"],
                           sheet_name=None)
    output_spreadsheet = self.args['<output_spreadsheet>']
    organism = self.args["--organism"]
    enrichment_threshold = float(self.args["--enrichment_threshold"])
    max_genes = int(self.args['--max_genes'])

    from gprofiler import GProfiler
    gp = GProfiler(return_dataframe=True)

    query = {
        sheet_name: sheet['feature'].values.tolist()[0:max_genes]
        for sheet_name, sheet in sheets.items()
    }

    result = gp.profile(organism=organism,
                        query=query,
                        user_threshold=enrichment_threshold)
    result.to_excel(output_spreadsheet, index=False)
def pathway_enrich_plot(genes,
                        databases,
                        title,
                        background_genes,
                        name_output,
                        save: bool = False):
    """Plot the ten most significant enriched terms as a horizontal bar graph.

    The genes are profiled with g:Profiler against ``mmusculus`` (FDR
    threshold 0.05).  Bars show ``-log10(p_value)`` and are labelled with the
    term ``name``; the row with index ``REAC:0000000`` is left out when
    present.

    # Inputs:
    # genes - list of genes to be probed
    # databases - which databases to query, more information can be found
    #     here: https://biit.cs.ut.ee/gprofiler/page/apis
    # title - title for the figure
    # background_genes - passed as ``background`` to ``gp.profile``
    # name_output - target path handed to ``plt.savefig`` when ``save`` is set
    # save - whether to save the figure (as PDF)

    Returns the value of ``plt.show()``.
    """
    # Interpretation of differentially expressed genes - g:Profiler
    client = GProfiler(return_dataframe=True, user_agent='g:GOSt')
    raw = client.profile(organism='mmusculus',
                         sources=databases,
                         user_threshold=0.05,
                         significance_threshold_method='fdr',
                         background=background_genes,
                         query=genes)

    ranked = raw.set_index('native').sort_values('p_value')
    # Positional selection: depends on the column order g:Profiler returns.
    table = ranked.iloc[:, [2, 5, 7, 10, 1]]

    # New column with the negative log10 of the p-value.
    table['-log10_p_value'] = table['p_value'].map(
        lambda p: -math.log(p, 10))

    if 'REAC:0000000' in table.index.tolist():
        table = table.drop(labels='REAC:0000000', axis=0)

    plt.rcdefaults()
    fig, ax = plt.subplots()

    top_names = table['name'].head(10)
    top_scores = table['-log10_p_value'].head(10)
    positions = np.arange(len(top_names))

    ax.barh(positions, top_scores, align='center', color='black')
    ax.set_yticks(positions)
    ax.set_yticklabels(top_names)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('-log10 p value')
    ax.set_title(title)

    if save:
        plt.savefig(name_output, format='pdf', bbox_inches="tight")
    return plt.show()
def enrichment_analysis(markers: Dict[str, Dict[str, pd.DataFrame]],
                        max_genes: int = 100,
                        organism: str = 'hsapiens',
                        enrichment_threshold: float = 0.05) -> pd.DataFrame:
    """Perform enrichment analysis using gprofiler (https://biit.cs.ut.ee/gprofiler/gost).

    Parameters
    ----------
    markers: ``Dict[str, Dict[str, pd.DataFrame]``
        Output from markers. For every cluster, the index of the ``'up'`` and
        ``'down'`` tables supplies the gene lists.
    max_genes: ``int``, optional, default: 100
        Maximum number of genes to use in each enrichment query.
    organism: ``str``, optional, default: ``hsapiens``
        Organism. See https://biit.cs.ut.ee/gprofiler/page/organism-list for full list.
    enrichment_threshold: ``float``, optional, default: ``0.05``
        Include enrichment results with corrected p-value less than this threshold.

    Returns
    -------
    ``pd.DataFrame``
        Result of a single ``gp.profile`` call over the queries named
        ``<cluster>-up`` and ``<cluster>-down``; empty gene lists are not
        submitted.
    """
    start = time.perf_counter()

    from gprofiler import GProfiler
    gp = GProfiler(return_dataframe=True)

    query = {}
    for cluster, tables in markers.items():
        # 'up' first, then 'down', so query keys keep their original order.
        for direction in ('up', 'down'):
            gene_names = tables[direction].index.values.tolist()
            if len(gene_names) > 0:
                query[cluster + '-' + direction] = gene_names[0:max_genes]

    result = gp.profile(organism=organism,
                        query=query,
                        user_threshold=enrichment_threshold)

    end = time.perf_counter()
    logger.info(
        "Enrichment analysis is finished. Time spent = {:.2f}s.".format(
            end - start))
    return result
def enrich_and_simplify(sets,
                        intersections=True,
                        sources=('GO:BP', ),
                        organism='hsapiens',
                        reduce_limit=0,
                        **revigo_kwds):
    """Run g:Profiler on ``sets`` and optionally reduce the terms with revigo.

    Parameters
    ----------
    sets
        Query for ``GProfiler.profile``. A dict is passed through unchanged;
        any other iterable is converted to a list.
    intersections
        When true, the request is sent with ``no_evidences=False``.
    sources
        Source ids forwarded to ``GProfiler.profile``.
    organism
        Organism id forwarded to ``GProfiler.profile``.
    reduce_limit
        ``None`` disables the reduction step. Otherwise every query with more
        than ``reduce_limit`` terms is passed to ``revigo`` and merged with
        its output; smaller queries are kept as they are with an
        ``eliminated`` column set to 0.
    **revigo_kwds
        Extra keyword arguments for ``revigo``.

    Returns
    -------
    (df, revs)
        ``df`` is the (possibly reduced) enrichment table and ``revs`` maps
        each reduced query name to the frame returned by ``revigo``.
    """
    from gprofiler import GProfiler

    if not isinstance(sets, dict):
        sets = list(sets)

    gprofiler = GProfiler(user_agent="scanpy", return_dataframe=True)
    gprofiler_kwargs = {'no_evidences': not intersections, 'sources': sources}
    df = gprofiler.profile(sets, organism=organism, **gprofiler_kwargs)

    revs = {}
    # Nothing to reduce: reduction disabled, or no enriched terms at all
    # (pd.concat would raise on an empty list of frames).
    if reduce_limit is None or df.empty:
        return df, revs

    dfs = []
    for q in df['query'].unique():
        df_sub = df[df['query'] == q].copy()
        go = df_sub.native.tolist()
        pvals = df_sub.p_value.tolist()
        if len(go) > reduce_limit:
            r = revigo(go, pvals, **revigo_kwds)
            revs[q] = r
            r = r.rename(columns={
                'term_ID': 'native'
            }).drop(columns='description').assign(query=q)
            dfs.append(df_sub.merge(r))
        else:
            # Keep only THIS query's rows; appending the whole `df` here
            # duplicated every query's rows in the output.
            dfs.append(df_sub.assign(eliminated=0))

    df = pd.concat(dfs, axis=0).reset_index(drop=True)
    return df, revs
def main(args):
    """Profile a gene list against a custom GMT registered with g:Profiler.

    ``args`` must provide:

    * ``filename`` -- text file with one gene per line (blank lines ignored)
    * ``gmt`` -- path of a GMT file to upload, or ``None``
    * ``token`` -- token of a previously uploaded GMT, or ``None``
    * ``output`` -- path of the CSV file to write

    When ``gmt`` is given it is uploaded and the token is taken from the
    response via ``get_token_form_response``; otherwise ``token`` is used.
    The token is passed to ``gp.profile`` as the ``organism``.

    :raises ValueError: if neither ``gmt`` nor ``token`` is supplied
    """
    # Validate up front so a bad invocation fails before any file or
    # network access.
    if args.gmt is None and args.token is None:
        raise ValueError("Please supply either a token or a gmt file")

    with open(args.filename) as gene_file:
        stripped = (line.strip() for line in gene_file)
        genes = [gene for gene in stripped if gene]

    if args.gmt is not None:
        with open(args.gmt) as f:
            response = requests.post(
                'https://biit.cs.ut.ee/gprofiler/api/gost/custom/',
                json={
                    'gmt': f.read(),
                    'name': args.gmt
                })
        token = get_token_form_response(response)
    else:
        token = args.token

    gp = GProfiler(
        user_agent='gprofiler_custom_gmt',  # optional user agent
        return_dataframe=True,  # return pandas dataframe or plain python structures
    )
    res = gp.profile(genes, organism=token)
    res.to_csv(args.output)
# Detect communities with infomap and tabulate gene -> community membership.
community = g.community_infomap()
print("Number of Communities:", len(community))
df = pd.DataFrame({'gene': g.vs['name'], 'community': community.membership})

# Order by size of communities
valuec = df['community'].value_counts()
# value_counts() sorts descending, so the first distinct count is the size of
# the largest community.
biggest = valuec.unique()[0]
# Keep the largest community (or communities) plus every one with >= 10 genes.
values = valuec[(valuec >= biggest) | (valuec >= 10)]
order = values.index.tolist()
df = df[df['community'].isin(order)]
df = df.set_index('community')
df = df.loc[order].reset_index()
# print("community id - Number of Genes")
# print(valuec)

#! Enrichment by GO
gp = GProfiler(return_dataframe=True)
# Collect the per-community frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every call.
enrichment_frames = []
print("Community id - Number of nodes")
for name, group in df.groupby('community', sort=False):
    print(name, group.shape[0])
    s = gp.profile(organism='hsapiens', query=group.gene.tolist())
    s['community'] = name
    enrichment_frames.append(s)
# pd.concat rejects an empty list, so fall back to an empty frame.
enrich_communities = (pd.concat(enrichment_frames)
                      if enrichment_frames else pd.DataFrame())
# print(enrich_components)

#! OUTPUT
enrich_communities.to_csv(oname1, sep="\t", index=False)
df.to_csv(oname2, sep="\t", index=False)
# Interactive-session transcript followed by a small helper.
import gprofiler
from gprofiler import GProfiler
# NOTE(review): the next line is IPython help syntax (`name?`); it is not
# valid in a plain Python module.
GProfiler?
gp = GProfiler(return_dataframe=True)
# NOTE(review): `genes` is only assigned further down in this excerpt; unless
# it was defined earlier in the session this call raises NameError.
gp.profile(organism='mmusculus', query=genes)
genes
# Whitespace-separated Ensembl mouse gene ids, split into a list.
genes = """ENSMUSG00000076488 ENSMUSG00000065231 ENSMUSG00000079120 ENSMUSG00000047222 ENSMUSG00000097494 ENSMUSG00000064419 ENSMUSG00000095668 ENSMUSG00000059606""".split()
gp.profile(organism='mmusculus', query=genes)
import requests
def mygprofiler(namelist, organism='mmusculus'):
    """Run gProfiler using the POST api with a json query body.

    ``namelist`` is converted to a ``list`` when it is not exactly one, then
    sent together with ``organism`` to the g:GOSt ``profile`` endpoint.

    Returns a pandas DataFrame built from the ``'result'`` entry of the JSON
    response.
    """
    if type(namelist) is not list:
        namelist = list(namelist)
    # NOTE(review): the HTTP status is not checked and no timeout is set; an
    # error response would surface as a JSON/KeyError below.
    r = requests.post(
        url='https://biit.cs.ut.ee/gprofiler/api/gost/profile/',
        json={
            'organism':organism,
            'query': namelist,
        }
    )
    # NOTE(review): `pd` is not imported in this excerpt -- presumably pandas
    # was imported earlier in the session.
    df = pd.DataFrame(r.json()['result'])
    return df
def enrich(
    container: Iterable[str],
    *,
    org: str = "hsapiens",
    gprofiler_kwargs: Mapping[str, Any] = {},
) -> pd.DataFrame:
    """\
    Get enrichment for DE results.

    This is a thin convenience wrapper around the very useful gprofiler_.

    This method dispatches on the first argument, leading to the following two
    signatures::

        enrich(container, ...)
        enrich(adata: AnnData, group, key: str, ...)

    Where::

        enrich(adata, group, key, ...) = enrich(adata.uns[key]["names"][group], ...)

    .. _gprofiler: https://pypi.org/project/gprofiler-official/#description

    Parameters
    ----------
    container
        Contains genes you'd like to search.
    adata
        AnnData object whose group will be looked for.
    group
        The group whose genes should be used for enrichment.
    key
        Key in `uns` to find group under.
    {doc_org}
    gprofiler_kwargs
        Keyword arguments to pass to `GProfiler.profile`, see gprofiler_.
        Must not contain `organism`; use `org` instead.

    Returns
    -------
    Dataframe of enrichment results.

    Examples
    --------
    Using `sc.queries.enrich` on a list of genes:

    >>> import scanpy as sc
    >>> sc.queries.enrich(['Klf4', 'Pax5', 'Sox2', 'Nanog'], org="hsapiens")

    Using `sc.queries.enrich` on an :class:`anndata.AnnData` object:

    >>> pbmcs = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.rank_genes_groups(pbmcs, "bulk_labels")
    >>> sc.queries.enrich(pbmcs, "CD34+")
    """
    try:
        from gprofiler import GProfiler
    except ImportError:
        raise ImportError(
            "This method requires the `gprofiler-official` module to be installed."
        )

    client = GProfiler(user_agent="scanpy", return_dataframe=True)

    # Work on a copy so the caller's mapping (or the shared default) is
    # never touched.
    profile_kwargs = copy(gprofiler_kwargs)
    if profile_kwargs.get("organism") is not None:
        raise ValueError(
            "Argument `organism` should be passed directly through `enrich`, "
            "not through `gprofiler_kwargs`"
        )

    return client.profile(list(container), organism=org, **profile_kwargs)
# Gene symbols of the down-regulated set, as stripped strings.
downreg = [symbol.strip() for symbol in downreg['Gene'].astype(str)]

# Split the methylation table by sign: > 0 is "hyper", < 0 is "hypo".
hyper = methyl_genes[methyl_genes['Methylation'] > 0]
hyper = hyper['Gene'].tolist()
hypo = methyl_genes[methyl_genes['Methylation'] < 0]
hypo = hypo['Gene'].tolist()

genelists = {'downreg': downreg, 'upreg': upreg, 'hyper': hyper, 'hypo': hypo}

# Results are kept in an explicit dict instead of being injected through
# locals(): writing to locals() only happens to work at module scope and
# hides the names from readers and linters.
enrichments = {}
for label, gene_names in genelists.items():
    print("Calculating", label, "enrichment...")
    enrichment = gp.profile(gene_names,
                            organism='mmusculus',
                            significance_threshold_method='fdr',
                            measure_underrepresentation=False,
                            sources=['GO:BP'])
    enrichment.sort_values('p_value').to_csv(str(label) + "_enrichment.csv")
    enrichments[label] = enrichment
    print("Done")

# Same names the locals() trick used to create, now bound explicitly.
downreg_enrichment = enrichments['downreg']
upreg_enrichment = enrichments['upreg']
hyper_enrichment = enrichments['hyper']
hypo_enrichment = enrichments['hypo']

# Terms (by 'native' id) enriched in both lists of each pair.
intersection_1 = pd.merge(downreg_enrichment, hyper_enrichment,
                          how='inner', on=['native'])
intersection_2 = pd.merge(upreg_enrichment, hypo_enrichment,
                          how='inner', on=['native'])
# Keep only interactions with a count above 100.
utr3_100 = utr3[utr3['count'] > 100]
cds_100 = cds[cds['count'] > 100]
df = pd.read_table('/Users/nate/Projects/EBV_interactome/bl/discovery_cohort_spearman.tsv', index_col=0)

# mRNAs whose correlation is below -0.27 in either table.
# NOTE(review): the 'corr' columns read here are only computed at the bottom
# of this excerpt -- presumably the statements were run out of order
# (notebook-style); confirm before running top to bottom.
mrnas = set(utr3_100[utr3_100['corr'] < -.27]['mrna']) | set(cds_100[cds_100['corr'] < -.27]['mrna'])
# Select columns with a list: current pandas rejects a set as an indexer.
mm = df[[column for column in df.columns if column in mrnas]]

from gprofiler import GProfiler
gp = GProfiler(user_agent='ExampleTool', return_dataframe=True)
# The 500 index labels with the highest 'CSDE1' values.
# 'GO:MF' replaces the original 'GO:MP', which is not a g:Profiler source id.
a2 = gp.profile(query=list(mm['CSDE1'].sort_values().index[-500:]),
                sources=['GO:BP', 'GO:MF', 'REAC', 'KEGG', 'GO:CC'])

# spearman: correlation of each (mir, mrna) pair; [0] is the coefficient.
cor = [spearmanr(bl.loc[mr], mir.loc[mi])[0]
       for mi, mr in zip(utr3['mir'], utr3['mrna'])]
utr3['corr'] = cor

cor = [spearmanr(bl.loc[mr], mir.loc[mi])[0]
       for mi, mr in zip(cds['mir'], cds['mrna'])]
cds['corr'] = cor
# Let me now find the GO anaotation graphs for the proteins that have SaintExpress score>0.5 and BFDR<0.01 ''' This code asks you if you have one or more than one conditions. In case, you have one, it gives you only one conditional horizontal bar graphs. If not, it would compare the bar graphs. To have 2 conditions, you need to run this code 2 times with different outputs! ''' if ask_user == "YES": if number_of_conditions == 1: # Only one condition! getting GO annotation profiles of proteins that have >0.5 saint score and <0.01 BFDR score gp = GProfiler(return_dataframe=True) profiler = gp.profile(organism='hsapiens', query=gene_names) BP_profiler = profiler[profiler["source"] == "GO:BP"] CC_profiler = profiler[profiler["source"] == "GO:CC"] MF_profiler = profiler[profiler["source"] == "GO:MF"] BP_profiled = BP_profiler.sort_values(by=["p_value"]) CC_profiled = CC_profiler.sort_values(by=["p_value"]) MF_profiled = MF_profiler.sort_values(by=["p_value"]) location_BP = BP_profiled["name"].to_list()[0:10] p_BP = BP_profiled["p_value"].to_list()[0:10] logged_p_BP = [] for i in p_BP: x = -log(i, 10) logged_p_BP.append(x)
def run_gProfiler_gOST(gene_symbols_list: List[str],
                       organism: str = 'hsapiens') -> pd.DataFrame:
    """Profile ``gene_symbols_list`` with g:Profiler.

    :param gene_symbols_list: query passed to ``GProfiler.profile``
    :param organism: organism id, ``'hsapiens'`` by default
    :return: the result of ``GProfiler.profile``; the client is created with
        ``return_dataframe=True``
    """
    client = GProfiler(return_dataframe=True)
    return client.profile(organism=organism, query=gene_symbols_list)