assoc = read_associations(assoc_fn) methods = args.method.split(",") if args.fdr: methods.append("fdr") obo_dag = GODag(obo_file=args.obo) propagate_counts = not args.no_propagate_counts g = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=propagate_counts, alpha=args.alpha, methods=methods) results = g.run_study(study) if args.outfile is None: g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval) else: # Users can print to both tab-separated file and xlsx file in one run. outfiles = args.outfile.split(",") prt_if = None # Print all values if args.pval is not None: # Only print out when uncorrected p-value < this value. prt_if = lambda nt: nt.p_uncorrected < args.pval for outfile in outfiles: if outfile.endswith(".xlsx"): g.wr_xlsx(outfile, results, prt_if=prt_if) else: g.wr_tsv(outfile, results, prt_if=prt_if) # Copyright (C) 2010-2016, H Tang et al., All rights reserved.
"background population. Please check.\n".format(overlap)) assoc = read_associations(assoc_fn) methods = args.method.split(",") obo_dag = GODag(obo_file=args.obo) propagate_counts = not args.no_propagate_counts g = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=propagate_counts, alpha=args.alpha, pvalcalc=args.pvalcalc, methods=methods) results = g.run_study(study) if args.outfile is None: g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval) else: # Users can print to both tab-separated file and xlsx file in one run. outfiles = args.outfile.split(",") prt_if = None # Print all values if args.pval is not None: # Only print out when uncorrected p-value < this value. prt_if = lambda nt: nt.p_uncorrected < args.pval for outfile in outfiles: if outfile.endswith(".xlsx"): g.wr_xlsx(outfile, results, prt_if=prt_if) else: g.wr_tsv(outfile, results, prt_if=prt_if) # Copyright (C) 2010-2016, H Tang et al., All rights reserved.
assoc = read_associations(assoc_fn) methods = args.method.split(",") obo_dag = GODag(obo_file=args.obo) propagate_counts = not args.no_propagate_counts g = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=propagate_counts, alpha=args.alpha, pvalcalc=args.pvalcalc, methods=methods) results = g.run_study(study) if args.outfile is None: g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval) else: # Users can print to both tab-separated file and xlsx file in one run. outfiles = args.outfile.split(",") if args.pval is not None: # Only print results when uncorrected p-value < this value.A num_orig = len(results) results = [r for r in results if r.p_uncorrected <= args.pval] sys.stdout.write("{N:7,} of {M:,} results have uncorrected P-values <= {PVAL}=pval\n".format( N=len(results), M=num_orig, PVAL=args.pval)) for outfile in outfiles: if outfile.endswith(".xlsx"): g.wr_xlsx(outfile, results, indent=args.indent) else: g.wr_tsv(outfile, results, indent=args.indent) # Copyright (C) 2010-2018, H Tang et al. All rights reserved.
def goe(
        genelist,
        go_file,
        goa_file,
        bg=None,
        nmin=5,
        conversion=None,
        evidence_set={
            'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'HTP', 'HDA', 'HMP', 'HGI',
            'IBA', 'IBD', 'IKR', 'IRD', 'ISS', 'ISO', 'ISA', 'ISM'
        }):
    r"""Finds GO enrichment with goatools (0.7.11 tested).

    **WARNING**\ : This method is inexact for multi-maps in gene name
    conversion. However, it has a negligible effect in top GO component
    removal in single-cell co-expression.

    Parameters
    ------------
    genelist: list of str
        Genes to search for enrichment. Must be a non-empty list and, when
        **bg** is given together with **conversion**, a subset of **bg**.
    go_file: str
        File path for GO DAG (downloadable at
        http://geneontology.org/docs/download-ontology/).
    goa_file: str
        File path for GO associations. See parameter **conversion**.
    bg: list of str
        Background genes. If None, every gene present in the GO association
        file is used as background.
    nmin: int
        Minimum number of principal genes required in GO. Values below 1 are
        treated as 1.
    conversion: tuple
        Conversion of `gene ID system
        <https://docs.mygene.info/en/latest/doc/data.html>`_ from gene list
        to the GO annotation, as ``(name_from, name_to, species)``.

        * name_from: Gene naming system of genelist. For gene names, use
          'symbol,alias'.
        * name_to: Gene naming system of goa_file. Examples:

          * Human: use 'uniprot.Swiss-Prot' (for GO annotations downloaded
            from http://geneontology.org/gene-associations/goa_human.gaf.gz).
          * Mouse: use 'MGI' (for GO annotations downloaded from
            http://current.geneontology.org/annotations/mgi.gaf.gz).

        * species: Species for gene name conversion. Examples: 'human',
          'mouse'.

    evidence_set: set of str
        `GO evidences <http://geneontology.org/docs/guide-go-evidence-codes/>`_
        to include. Defaults to non-expression based results to avoid
        circular reasoning bias. The default set is shared between calls and
        is only passed through; do not mutate it.

    Returns
    ----------
    goe: pandas.DataFrame
        GO enrichment, sorted by uncorrected P-value.
    gotop: str
        Top enriched GO ID
    genes: list of str or None
        Intersection list of genes in gotop and also bg. None if bg is None.

    Raises
    ----------
    ValueError
        If the enrichment table is empty, or if no GO term has odds ratio
        above 1 with at least **nmin** study genes.
    """
    from tempfile import TemporaryDirectory
    from os import linesep
    from os.path import join
    from goatools.go_enrichment import GOEnrichmentStudy
    from goatools.obo_parser import GODag
    from goatools.associations import read_gaf
    from collections import defaultdict
    import itertools
    from biothings_client import get_client
    import pandas as pd
    import logging

    assert isinstance(genelist, list) and len(genelist) > 0, \
        'genelist must be a non-empty list'
    if nmin < 1:
        nmin = 1
    # Keep the caller's background: bg itself is rebound to converted IDs.
    bg0 = bg

    # Convert gene names
    if conversion is not None:
        assert len(conversion) == 3, \
            'conversion must be (name_from, name_to, species)'
        name_from, name_to, species = conversion
        name_to_path = name_to.split('.')
        mg = get_client('gene')
        query = set(genelist)
        if bg is not None:
            bg_set = set(bg)
            assert len(query - bg_set) == 0, 'genelist must be a subset of bg'
            query |= bg_set
        hits = mg.querymany(list(query),
                            scopes=name_from,
                            fields=name_to,
                            species=species)
        # Drop hits lacking the query, the score or the requested field.
        required = {'query', '_score', name_to_path[0]}
        hits = [x for x in hits if required.issubset(x)]
        # Ascending score order: for a query with several hits the
        # highest-scoring one is inserted last and therefore wins.
        hits.sort(key=lambda x: x['_score'])
        convert = {x['query']: x for x in hits}
        # Walk down the dotted field path (e.g. 'uniprot.Swiss-Prot').
        for key in name_to_path:
            convert = {k: v[key] for k, v in convert.items() if key in v}
        # Multi-mapped names keep only their first target ID (see WARNING).
        convert = {
            k: v if isinstance(v, str) else v[0]
            for k, v in convert.items()
        }
        genelist2 = list({convert[x] for x in genelist if x in convert})
        if bg is not None:
            bg = list({convert[x] for x in bg if x in convert})
        # Reverse maps from converted ID back to the original names:
        # converti covers genelist only, convertia covers every queried name.
        genes_in = set(genelist)
        converti = defaultdict(list)
        convertia = defaultdict(list)
        for name, target in convert.items():
            convertia[target].append(name)
            if name in genes_in:
                converti[target].append(name)
        converti = dict(converti)
        convertia = dict(convertia)
    else:
        genelist2 = genelist

    # Load GO DAG and association files
    logging.debug('Reading GO DAG file ' + go_file)
    godag = GODag(go_file)
    logging.debug('Reading GO association file ' + goa_file)
    goa = read_gaf(goa_file, evidence_set=evidence_set)
    if bg is None:
        bg = list(goa.keys())

    # Compute enrichment
    study = GOEnrichmentStudy(bg, goa, godag)
    ans = study.run_study(genelist2)

    # Format output: round-trip through the TSV writer to get a flat table.
    # A private temporary directory is used instead of reopening an open
    # NamedTemporaryFile by name, which is not possible on every platform.
    with TemporaryDirectory() as tmpdir:
        tsv_path = join(tmpdir, 'goe.tsv')
        study.wr_tsv(tsv_path, ans)
        # newline='' disables newline translation so that splitting on
        # os.linesep sees exactly what was written to disk.
        with open(tsv_path, newline='') as f:
            text = f.read()
    rows = [x.split('\t') for x in text.split(linesep)]
    # Drop the empty trailing record produced by the final line separator.
    if len(rows[-1]) < 2:
        rows = rows[:-1]
    if len(rows) == 0 or len(rows[0]) == 0:
        raise ValueError('No enrichment found. Check your input ID type.')
    # The header's first field is written with a leading comment marker.
    rows[0][0] = rows[0][0].strip('# ')
    ans = pd.DataFrame(rows[1:], columns=rows[0])
    ans.drop(['NS', 'enrichment', 'study_count', 'p_sidak', 'p_holm'],
             axis=1,
             inplace=True)
    for xj in ['p_uncorrected', 'p_bonferroni']:
        ans[xj] = pd.to_numeric(ans[xj], errors='raise')
    ans['depth'] = pd.to_numeric(ans['depth'],
                                 errors='raise',
                                 downcast='unsigned')
    # Odds ratio column and sort column
    ans['odds_ratio'] = toratio(ans['ratio_in_study']) / toratio(
        ans['ratio_in_pop'])
    # Explicit copy: the columns assigned below must not target a view of
    # the previous frame (avoids pandas' chained-assignment warning).
    ans = ans[[
        'name', 'depth', 'p_uncorrected', 'p_bonferroni', 'odds_ratio',
        'ratio_in_study', 'ratio_in_pop', 'GO', 'study_items'
    ]].copy()
    ans['study_items'] = ans['study_items'].apply(
        lambda x: x.replace(' ', ''))
    # Convert back study_items
    if conversion is not None:
        ans['study_items'] = ans['study_items'].apply(
            lambda x: ','.join(
                itertools.chain.from_iterable(
                    converti[y] for y in x.split(',')))
            if len(x) > 0 else x)
    ans.sort_values('p_uncorrected', inplace=True)

    # Get top enriched GO by P-value
    has_enough = ans['ratio_in_study'].apply(
        lambda x: int(x.split('/')[0]) >= nmin)
    gotop = ans[(ans['odds_ratio'] > 1) & has_enough]
    if len(gotop) == 0:
        raise ValueError('No GO enrichment found for given criteria.')
    gotop = str(gotop.iloc[0]['GO'])

    if bg0 is not None:
        # Children GOs
        gos = set([gotop] + list(godag.query_term(gotop).get_all_children()))
        # Look for genes annotated with the top GO or any of its children
        genes = [x for x in goa if any(y in gos for y in goa[x])]
        if conversion is not None:
            genes = set(
                itertools.chain.from_iterable(
                    convertia[x] for x in genes if x in convertia))
        else:
            genes = set(genes)
        # Report in the order of the caller's background list.
        genes = [x for x in bg0 if x in genes]
    else:
        genes = None
    return (ans, gotop, genes)
def enrich(gene2go: str,
           study: str,
           obo: str,
           population: str = None,
           geneid2symbol: str = None,
           correct='fdr_bh',
           alpha=0.05,
           top=20,
           goea_out=None,
           dag_out=None,
           dpi=300,
           show_gene_limit=6,
           only_plot_sig=False):
    """
    Go enrichment based on goatools

    Writes the enrichment table to ``goea_out`` and one DAG figure per GO
    namespace (BP, MF, CC) next to ``dag_out``. A namespace without any term
    to plot is reported on stdout and skipped.

    :param gene2go: a file with two columns: gene_id and go_term_id
    :param study: a file with at least one column, first column contains
        gene id, second columns is regulation direction. Blank lines are
        ignored.
    :param obo: go-basic file download from GeneOntology
    :param population: a file with each row contains one gene; default to
        use all genes in gene2go file as population
    :param geneid2symbol: file with two columns: gene_id and gene_symbol,
        used for DAG plot
    :param correct: pvalue adjustment method, passed to ``multipletests``.
        The value ``3`` (int or str) is accepted as an alias of `fdr_bh`.
        Method used for testing and adjustment of pvalues. Can be either
        the full name or initial letters. Available methods are:
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg (non-negative)
        - `fdr_by` : Benjamini/Yekutieli (negative)
        - `fdr_tsbh` : two stage fdr correction (non-negative)
        - `fdr_tsbky` : two stage fdr correction (non-negative)
    :param alpha: cutoff applied to the corrected pvalue, default 0.05
    :param top: maximum number of go terms to plot per namespace; the first
        ``top`` rows of the result table are used
    :param goea_out: output enrichment result file, default to
        ``study + '.goea.xls'``
    :param dag_out: dag figure file, default to ``study + '.goea.dag.svg'``;
        the namespace is inserted before the file extension
    :param dpi: resolution of image, no effect for svg
    :param show_gene_limit: the max number of gene in a node to show
    :param only_plot_sig: only plot dag for significantly enriched terms
    :return: None
    """
    import os

    if str(correct) == '3':
        correct = 'fdr_bh'

    if geneid2symbol:
        with open(geneid2symbol) as handle:
            geneid2symbol = dict(
                line.split()[:2] for line in handle if line.strip())
    else:
        geneid2symbol = dict()

    obo = GODag(obo, optional_attrs=['relationship', 'is_a'])
    gene2go = read_associations(gene2go)

    # Read the study file once; blank lines carry no gene and are skipped,
    # matching how the population file is parsed below.
    with open(study) as handle:
        study_lines = [line.strip() for line in handle]
    study_lines = [line for line in study_lines if line]
    study_genes = [line.split()[0] for line in study_lines]
    try:
        # gene -> regulation direction (second column)
        reg_dict = dict(line.split()[:2] for line in study_lines)
    except ValueError:
        # dict() rejects rows without a second column: fall back to an
        # empty direction for every row.
        reg_dict = {line: '' for line in study_lines}

    if not population:
        population = gene2go.keys()
    else:
        with open(population) as handle:
            population = [
                line.split()[0] for line in handle if line.strip()
            ]

    goea_obj = GOEnrichmentStudy(population,
                                 gene2go,
                                 obo,
                                 propagate_counts=False,
                                 alpha=alpha,
                                 methods=('fdr_bh', ))

    def keep_if(r):
        # Only keep terms hit by at least one study gene.
        return r.ratio_in_study[0] != 0

    goea_results_all = goea_obj.run_study(study_genes, keep_if=keep_if)
    goea_out = goea_out or study + '.goea.xls'
    goea_obj.wr_tsv(goea_out, goea_results_all)

    def func(y):
        # 'g1, g2' -> 'g1|direction|symbol;g2|direction|symbol'; direction
        # and symbol are appended only when known for the gene.
        results = []
        for gene in (x.strip() for x in y.split(',')):
            tmp = [gene]
            if gene in reg_dict:
                tmp.append(reg_dict[gene])
            if gene in geneid2symbol:
                tmp.append(geneid2symbol[gene])
            results.append('|'.join(tmp))
        return ';'.join(results)

    table = pd.read_table(goea_out, header=0, index_col=0)
    # Re-correct the p-values with the requested method and overwrite the
    # column written by goatools.
    fdr = multipletests(table['p_uncorrected'], method=correct)[1]
    table['p_fdr_bh'] = fdr
    # Update goea_results_all as well so the plots below use the
    # re-corrected values.
    # NOTE(review): relies on the TSV rows being in the same order as
    # goea_results_all - confirm against the goatools writer.
    for record, p_adjusted in zip(goea_results_all, fdr):
        record.p_fdr_bh = p_adjusted
    table.columns = [
        x if x != 'p_fdr_bh' else 'p_corrected' for x in table.columns
    ]
    # The 'enrichment' column is repurposed as a significance flag:
    # 'e' when the corrected p-value passes alpha, 'p' otherwise.
    table['enrichment'] = [
        'e' if x <= alpha else 'p' for x in table['p_corrected']
    ]
    table['study_items'] = table.loc[:, 'study_items'].map(func)
    table.to_csv(goea_out, header=True, index=True, sep='\t')

    # -------------------plot dag------------------------
    dag_out = dag_out or study + '.goea.dag.svg'
    # Split on the real extension so the namespace tag lands before it
    # whatever the extension length is.
    dag_base, dag_ext = os.path.splitext(dag_out)
    for each in ['BP', 'MF', 'CC']:
        if only_plot_sig:
            goea_results_sig = table[table['enrichment'] == 'e']
        else:
            goea_results_sig = table.copy()
        goea_results_sig = goea_results_sig[goea_results_sig['NS'] == each]
        if not goea_results_sig.shape[0]:
            print(f"No significant term to plot for {each} ")
            # Skip this namespace only; the remaining ones are still drawn.
            continue
        if goea_results_sig.shape[0] >= top:
            goea_results_sig = goea_results_sig.iloc[:top]
        goid_subset = list(goea_results_sig.index)
        plot_gos(
            dag_base + '.' + each + dag_ext,
            # Source GO ids. A node missing from the analysis results would
            # be drawn pale green, but that cannot happen here because the
            # ids come from the results themselves.
            goid_subset,
            obo,
            # Use the p-values for coloring.
            goea_results=goea_results_all,
            # Print study gene symbols, not gene ids.
            id2symbol=geneid2symbol,
            # Maximum number of gene symbols shown on a GO term.
            study_items=show_gene_limit,
            # Print 3 genes per line.
            items_p_line=3,
            dpi=0 if dag_out.endswith('svg') else dpi,
        )
obo_dag = GODag(obo_file=args.obo) propagate_counts = not args.no_propagate_counts g = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=propagate_counts, alpha=args.alpha, pvalcalc=args.pvalcalc, methods=methods) results = g.run_study(study) if args.outfile is None: g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval) else: # Users can print to both tab-separated file and xlsx file in one run. outfiles = args.outfile.split(",") prt_if = None # Print all values if args.pval is not None: # Only print out when uncorrected p-value < this value. prt_if = lambda nt: nt.p_uncorrected < args.pval for outfile in outfiles: if outfile.endswith(".xlsx"): g.wr_xlsx(outfile, results, prt_if=prt_if, indent=args.indent) else: g.wr_tsv(outfile, results, prt_if=prt_if, indent=args.indent) # Copyright (C) 2010-2017, H Tang et al. All rights reserved.
"background population. Please check.\n".format(overlap)) assoc = read_associations(assoc_fn) methods = args.method.split(",") obo_dag = GODag(obo_file=args.obo) propagate_counts = not args.no_propagate_counts g = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=propagate_counts, alpha=args.alpha, pvalcalc=args.pvalcalc, methods=methods) results = g.run_study(study) if args.outfile is None: g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval) else: # Users can print to both tab-separated file and xlsx file in one run. outfiles = args.outfile.split(",") prt_if = None # Print all values if args.pval is not None: # Only print out when uncorrected p-value < this value. prt_if = lambda nt: nt.p_uncorrected < args.pval for outfile in outfiles: if outfile.endswith(".xlsx"): g.wr_xlsx(outfile, results, prt_if=prt_if, indent=args.indent) else: g.wr_tsv(outfile, results, prt_if=prt_if, indent=args.indent) # Copyright (C) 2010-2016, H Tang et al. All rights reserved.