def run_study(self, study, **kws):
    """Run a Gene Ontology Enrichment Analysis (GOEA) on the study ids.

    Keyword arguments (each one falls back to the attribute of the same
    name on this object when it is not supplied):
        methods -- multiple-test correction methods, wrapped in Methods()
        alpha   -- alpha handed to the multiple-test correction step
        log     -- stream for the results summary; None suppresses it
        keep_if -- optional predicate; only records for which it is true
                   are returned, e.g.
                   >>> keep_if = lambda nt: nt.p_fdr_bh < 0.05
                   >>> goea_results = goeaobj.run_study(ids, keep_if=keep_if)

    Returns a list of result records (an empty list when no uncorrected
    p-values were produced).
    """
    # Resolve per-call overrides, touching the stored defaults only when needed
    if 'methods' in kws:
        methods = Methods(kws['methods'])
    else:
        methods = self.methods
    if 'alpha' in kws:
        alpha = kws['alpha']
    else:
        alpha = self.alpha
    if 'log' in kws:
        log = kws['log']
    else:
        log = self.log

    # Uncorrected p-values come first; nothing more to do without them
    records = self.get_pval_uncorr(study, log)
    if not records:
        return []

    if log is not None:
        summary = "\n ".join(self.get_results_msg(records, study))
        log.write(" {MSG}\n".format(MSG=summary))

    # The multiple-test corrections update the records in place
    self._run_multitest_corr(records, methods, alpha, study, log)

    # Attach the GO term (name, level) to every record
    for record in records:
        record.set_goterm(self.obo_dag)

    # Optional user filter, e.g. to keep only the significant GO terms
    if 'keep_if' in kws:
        records = list(filter(kws['keep_if'], records))

    def _sort_key(record):
        """Order by namespace (NS), then enrichment, then uncorrected p-value."""
        return [record.NS, record.enrichment, record.p_uncorrected]

    records.sort(key=_sort_key)
    return records
def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws):
    """Set up a GOEA object and precompute the population's GO term items.

    Args:
        pop: collection of population ids; stored de-duplicated as a set.
        assoc: associations, handed to update_association() and get_terms().
        obo_dag: GO DAG object, handed to update_association() and get_terms().
        propagate_counts: when true, update_association() is run on assoc.
        alpha: default alpha stored for later studies.
        methods: multiple-test method names; defaults to
            ["bonferroni", "sidak", "holm"] when None.
        **kws: 'name' (default 'GOEA'), 'log' (default sys.stdout),
            'relationships', 'remove_goids'; the full dict is also passed
            on to FisherFactory().
    """
    self.name = kws.get('name', 'GOEA')
    print('\nLoad {OBJNAME} Gene Ontology Analysis ...'.format(
        OBJNAME=self.name))
    self.log = kws.get('log', sys.stdout)
    # Dispatch table: multiple-test source name -> bound runner method
    self._run_multitest = {
        'local': self._run_multitest_local,
        'statsmodels': self._run_multitest_statsmodels,
    }
    self.pop = set(pop)
    # Count the unique ids actually stored in self.pop. Using len(pop) here
    # would let duplicate entries in a list inflate the population size.
    self.pop_n = len(self.pop)
    self.assoc = assoc
    self.obo_dag = obo_dag
    self.alpha = alpha
    if methods is None:
        methods = ["bonferroni", "sidak", "holm"]
    self.methods = Methods(methods)
    self.pval_obj = FisherFactory(**kws).pval_obj

    if propagate_counts:
        update_association(assoc, obo_dag, kws.get('relationships', None))
    # Optionally drop user-specified ("broad") GO ids before counting.
    # NOTE(review): self.assoc was bound before this filtering, so it keeps
    # referring to the unfiltered association object -- confirm that is intended.
    broad_goids = get_goids_to_remove(kws.get('remove_goids'))
    if broad_goids:
        assoc = self._remove_assc_goids(assoc, broad_goids)
    self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log)
def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws):
    """Store the population, associations and DAG; precompute population terms.

    Args:
        pop: population ids; stored as given, its length kept in pop_n.
        assoc: associations, handed to get_terms() (and to
            obo_dag.update_association() when propagate_counts is true).
        obo_dag: GO DAG object.
        propagate_counts: when true, obo_dag.update_association(assoc) is run.
        alpha: default alpha stored for later studies.
        methods: multiple-test method names; defaults to
            ["bonferroni", "sidak", "holm"] when None.
        **kws: 'log' selects the log stream (default sys.stdout); the full
            dict is also passed on to FisherFactory().
    """
    if 'log' in kws:
        self.log = kws['log']
    else:
        self.log = sys.stdout

    # The runners are looked up on self at call time, not when the table is built
    def _via_local(iargs):
        return self._run_multitest_local(iargs)

    def _via_statsmodels(iargs):
        return self._run_multitest_statsmodels(iargs)

    self._run_multitest = {
        'local': _via_local,
        'statsmodels': _via_statsmodels,
    }

    self.pop = pop
    self.pop_n = len(pop)
    self.assoc = assoc
    self.obo_dag = obo_dag
    self.alpha = alpha

    default_methods = ["bonferroni", "sidak", "holm"]
    self.methods = Methods(default_methods if methods is None else methods)
    self.pval_obj = FisherFactory(**kws).pval_obj

    if propagate_counts:
        sys.stderr.write("Propagating term counts to parents ..\n")
        obo_dag.update_association(assoc)

    self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log)
def run_study(self, study, **kws):
    """Run a Gene Ontology Enrichment Analysis (GOEA) on the study ids.

    Keyword arguments:
        name    -- label for the study in the log header (default 'current')
        methods -- multiple-test correction methods, wrapped in Methods();
                   defaults to self.methods
        alpha   -- alpha for the multiple-test correction; defaults to
                   self.alpha
        keep_if -- optional predicate; only records for which it is true
                   are returned, e.g.
                   >>> keep_if = lambda nt: nt.p_fdr_bh < 0.05
                   >>> goea_results = goeaobj.run_study(ids, keep_if=keep_if)
    The log stream is whatever self._get_log_or_prt(kws) returns.

    Returns a list of result records; an empty list for an empty study or
    when no uncorrected p-values were produced.
    """
    study_name = kws.get('name', 'current')
    log = self._get_log_or_prt(kws)
    if log:
        header = '\nRun {OBJNAME} Gene Ontology Analysis: {STU} study set of {N} IDs ...'
        log.write(header.format(OBJNAME=self.name, N=len(study), STU=study_name))
    if not study:
        return []

    # Per-call overrides, touching the stored defaults only when needed
    if 'methods' in kws:
        methods = Methods(kws['methods'])
    else:
        methods = self.methods
    if 'alpha' in kws:
        alpha = kws['alpha']
    else:
        alpha = self.alpha

    # Uncorrected p-values come first; nothing more to do without them
    records = self.get_pval_uncorr(study, log)
    if not records:
        return []

    if log is not None:
        summary = "\n ".join(self.get_results_msg(records, study))
        log.write(" {MSG}\n".format(MSG=summary))

    # The multiple-test corrections update the records in place
    self._run_multitest_corr(records, methods, alpha, study, log)

    # Attach the GO term (name, level) to every record
    for record in records:
        record.set_goterm(self.obo_dag)

    # Optional user filter, e.g. to keep only the significant GO terms
    if 'keep_if' in kws:
        records = list(filter(kws['keep_if'], records))

    def _sort_key(record):
        """Order by enrichment, then namespace (NS), then uncorrected p-value."""
        return [record.enrichment, record.NS, record.p_uncorrected]

    records.sort(key=_sort_key)
    return records
def test_init_methods():
    """Check the default Methods state, adding one method, and re-initializing."""
    mobj = Methods()

    def _info(source, method, fieldname):
        """Build the namedtuple that mobj.methods is expected to contain."""
        return mobj.NtMethodInfo(source=source, method=method, fieldname=fieldname)

    # A freshly built object: field-name table, help message, default method
    assert mobj._srcmethod2fieldname == get_exp_fieldnames()
    assert mobj.getmsg_valid_methods() == get_expstr_fieldnames()
    assert mobj.methods == [_info('local', 'bonferroni', 'bonferroni')]

    # Adding a statsmodels method appends to the existing list
    mobj._add_method_src('statsmodels', 'fdr_bh')
    assert mobj.methods == [
        _info('local', 'bonferroni', 'bonferroni'),
        _info('statsmodels', 'fdr_bh', 'fdr_bh'),
    ]

    # Re-initializing with every 'sm_'-prefixed statsmodels name replaces the list
    statsmodels_names = mobj.all_methods[1][1]  # statsmodels
    sm_methods = ['sm_{}'.format(name) for name in statsmodels_names]
    mobj._init_methods(sm_methods)
    expected_names = (
        'bonferroni',
        'sidak',
        'holm-sidak',
        'holm',
        'simes-hochberg',
        'hommel',
        'fdr_bh',
        'fdr_by',
        'fdr_tsbh',
        'fdr_tsbky',
        'fdr_gbs',
    )
    assert mobj.methods == [
        _info('statsmodels', name, 'sm_' + name) for name in expected_names
    ]
def _init_args(self):
    """Build the enrichment command-line parser and return the parsed arguments.

    With no command-line arguments the help text is printed and the process
    exits with status 1. Otherwise self._prt_evidence_codes() is given the
    set of raw arguments, sys.argv is parsed, and the resulting namespace is
    passed to self._check_input_files() before being returned.
    """
    # NOTE(review): __doc__ is passed positionally, which argparse takes as
    # `prog` rather than `description` -- confirm this is intended.
    parser = argparse.ArgumentParser(
        __doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    add('filenames', nargs=3, type=str,
        help='data/study data/population data/association')
    add('--annofmt', type=str, default=None,
        choices=['gene2go', 'gaf', 'gpad', 'id2gos'],
        help=('Annotation file format. '
              'Not needed if type can be determined using filename'))
    add('--taxid', type=int, default=9606,
        help="When using NCBI's gene2go annotation file, specify desired taxid")
    add('--alpha', type=float, default=0.05,
        help='Test-wise alpha for multiple testing')
    add('--pval', type=float, default=.05,
        help='Only print results with uncorrected p-value < PVAL.')
    add('--pval_field', type=str,
        help='Only print results when PVAL_FIELD < PVAL.')
    add('--outfile', type=str, default=None,
        help='Write enrichment results into xlsx or tsv file')
    add('--ns', type=str, default='BP,MF,CC',
        help='Limit GOEA to specified branch categories. '
             'BP=Biological Process; '
             'MF=Molecular Function; '
             'CC=Cellular Component')
    add('--id2sym', type=str, default=None,
        help='ASCII file containing one geneid and its symbol per line')
    add('--sections', type=str, default=None,
        help=('Use sections file for printing grouped GOEA results. '
              'Example SECTIONS values:\n'
              'goatools.test_data.sections.gjoneska_pfenning \n'
              'goatools/test_data/sections/gjoneska_pfenning.py \n'
              'data/gjoneska_pfenning/sections_in.txt\n'))
    add('--outfile_detail', type=str,
        help=('Write enrichment results into a text file \n'
              'containing the following information: \n'
              '1) GOEA GO terms, grouped into sections \n\n'
              '2) List of genes and ASCII art showing section membership \n'
              '3) Detailed list of each gene and GO terms w/their P-values \n'))
    add('--compare', dest='compare', action='store_true', default=False,
        help="the population file as a comparison group. if this "
             "flag is specified, the population is used as the study "
             "plus the `population/comparison`")
    add('--ratio', dest='ratio', type=float, default=None,
        help="only show values where the difference between study "
             "and population ratios is greater than this. useful for "
             "excluding GO categories with small differences, but "
             "containing large numbers of genes. should be a value "
             "between 1 and 2. ")
    add('--indent', dest='indent', action='store_true', default=False,
        help="indent GO terms")
    add('--obo', type=str, default="go-basic.obo",
        help="Specifies location and name of the obo file")
    add('--no_propagate_counts', action='store_true', default=False,
        help="Do not propagate counts to parent terms")
    add('--method', type=str, default="bonferroni,sidak,holm,fdr_bh",
        help=Methods().getmsg_valid_methods())
    add('--pvalcalc', type=str, default="fisher",
        help=str(FisherFactory()))
    add('--min_overlap', type=float, default=0.7,
        help="Check that a minimum amount of study genes are in the population")
    add('--goslim', type=str, default='goslim_generic.obo',
        help="The GO slim file is used when grouping GO terms.")
    add('--ev_inc', type=str,
        help="Include specified evidence codes and groups separated by commas")
    add('--ev_exc', type=str,
        help="Exclude specified evidence codes and groups separated by commas")
    # NOTE(review): store_false makes these two parse to True unless the flag
    # is given; presumably the flags are acted on by _prt_evidence_codes()
    # from the raw argv below -- confirm.
    add('--ev_help', dest='ev_help', action='store_false',
        help="Print all Evidence codes, with descriptions")
    add('--ev_help_short', dest='ev_help_short', action='store_false',
        help="Print all Evidence codes")

    if len(sys.argv) == 1:
        # No arguments at all: show the help text, then exit with status 1
        parser.print_help()
        sys.exit(True)
    self._prt_evidence_codes(set(sys.argv[1:]))
    args = parser.parse_args()  # Namespace object from argparse
    self._check_input_files(args, parser)
    return args
"containing large numbers of genes. should be a value " "between 1 and 2. ") p.add_argument('--fdr', dest='fdr', default=False, action='store_true', help="Calculate the false discovery rate (alt. to the " "Bonferroni but slower)") p.add_argument('--indent', dest='indent', default=False, action='store_true', help="indent GO terms") p.add_argument('--obo', default="go-basic.obo", type=str, help="Specifies location and name of the obo file") p.add_argument('--no_propagate_counts', default=False, action='store_true', help="Do not propagate counts to parent terms") p.add_argument('--outfile', default=None, type=str, help="Write enrichment results into xlsx or tsv file") p.add_argument('--method', default="bonferroni,sidak,holm", type=str, help=Methods().getmsg_valid_methods()) args = p.parse_args() check_input_files(args, p) min_ratio = args.ratio if min_ratio is not None: assert 1 <= min_ratio <= 2 study_fn, pop_fn, assoc_fn = args.filenames study, pop = read_geneset(study_fn, pop_fn, compare=args.compare) print("Study: {0} vs. Population {1}".format(len(study), len(pop)), file=sys.stderr) if not args.compare: # sanity check if len(pop) < len(study): exit("\nERROR: The study file contains more elements than the population file. "
def _init_args(self):
    """Build the enrichment command-line parser and return the parsed arguments.

    With no command-line arguments the help text is printed and the process
    exits with status 1. Otherwise sys.argv is parsed and the resulting
    namespace is passed to self._check_input_files() before being returned.
    """
    # NOTE(review): __doc__ is passed positionally, which argparse takes as
    # `prog` rather than `description` -- confirm this is intended.
    parser = argparse.ArgumentParser(
        __doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    add('filenames', nargs=3, type=str,
        help='data/study data/population data/association')
    add('--alpha', type=float, default=0.05,
        help='Test-wise alpha for multiple testing')
    add('--pval', type=float, default=.05,
        help='Only print results with uncorrected p-value < PVAL.')
    add('--pval_field', type=str,
        help='Only print results when PVAL_FIELD < PVAL.')
    add('--outfile', type=str, default=None,
        help='Write enrichment results into xlsx or tsv file')
    add('--sections', type=str, default=None,
        help=('Use sections file for printing grouped GOEA results. '
              'Example SECTIONS values:\n'
              'goatools.test_data.sections.gjoneska_pfenning \n'
              'goatools/test_data/sections/gjoneska_pfenning.py \n'
              'data/gjoneska_pfenning/sections_in.txt\n'))
    add('--outfile_detail', type=str,
        help=('Write enrichment results into a text file \n'
              'containing the following information: \n'
              '1) GOEA GO terms, grouped into sections \n\n'
              '2) List of genes and ASCII art showing section membership \n'
              '3) Detailed list of each gene and GO terms w/their P-values \n'))
    add('--compare', dest='compare', action='store_true', default=False,
        help="the population file as a comparison group. if this "
             "flag is specified, the population is used as the study "
             "plus the `population/comparison`")
    add('--ratio', dest='ratio', type=float, default=None,
        help="only show values where the difference between study "
             "and population ratios is greater than this. useful for "
             "excluding GO categories with small differences, but "
             "containing large numbers of genes. should be a value "
             "between 1 and 2. ")
    add('--indent', dest='indent', action='store_true', default=False,
        help="indent GO terms")
    add('--obo', type=str, default="go-basic.obo",
        help="Specifies location and name of the obo file")
    add('--no_propagate_counts', action='store_true', default=False,
        help="Do not propagate counts to parent terms")
    add('--method', type=str, default="bonferroni,sidak,holm,fdr_bh",
        help=Methods().getmsg_valid_methods())
    add('--pvalcalc', type=str, default="fisher",
        help=str(FisherFactory()))
    add('--min_overlap', type=float, default=0.7,
        help="Check that a minimum amount of study genes are in the population")
    add('--goslim', type=str, default='goslim_generic.obo',
        help="The GO slim file is used when grouping GO terms.")

    if len(sys.argv) == 1:
        # No arguments at all: show the help text, then exit with status 1
        parser.print_help()
        sys.exit(True)
    args = parser.parse_args()  # Namespace object from argparse
    self._check_input_files(args, parser)
    return args
def get_arg_parser():
    """Build the enrichment command-line parser and return the parsed arguments.

    Despite the name, the return value is the argparse namespace, not the
    parser. With no command-line arguments the help text is printed and the
    process exits with status 1. Otherwise sys.argv is parsed and the
    namespace is passed to _check_input_files() before being returned.
    """
    # NOTE(review): __doc__ is passed positionally, which argparse takes as
    # `prog` rather than `description` -- confirm this is intended.
    parser = argparse.ArgumentParser(
        __doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    add('filenames', nargs=3, type=str,
        help='data/study data/population data/association')
    add('--alpha', type=float, default=0.05,
        help="Test-wise alpha for multiple testing ")
    add('--pval', type=float, default=.05,
        help="Only print out when uncorrected p-value < this value.")
    add('--compare', dest='compare', action='store_true', default=False,
        help="the population file as a comparison group. if this "
             "flag is specified, the population is used as the study "
             "plus the `population/comparison`")
    add('--ratio', dest='ratio', type=float, default=None,
        help="only show values where the difference between study "
             "and population ratios is greater than this. useful for "
             "excluding GO categories with small differences, but "
             "containing large numbers of genes. should be a value "
             "between 1 and 2. ")
    add('--indent', dest='indent', action='store_true', default=False,
        help="indent GO terms")
    add('--obo', type=str, default="go-basic.obo",
        help="Specifies location and name of the obo file")
    add('--no_propagate_counts', action='store_true', default=False,
        help="Do not propagate counts to parent terms")
    add('--outfile', type=str, default=None,
        help="Write enrichment results into xlsx or tsv file")
    add('--method', type=str, default="bonferroni,sidak,holm,fdr_bh",
        help=Methods().getmsg_valid_methods())
    add('--pvalcalc', type=str, default="fisher",
        help=str(FisherFactory()))
    add('--min_overlap', type=float, default=0.7,
        help="Check that a minimum amount of study genes are in the population")

    if len(sys.argv) == 1:
        # No arguments at all: show the help text, then exit with status 1
        parser.print_help()
        sys.exit(True)
    args = parser.parse_args()  # Namespace object from argparse
    _check_input_files(args, parser)
    return args
help=('Write enrichment results into a text file \n' 'containing the following information: \n' '1) GOEA GO terms, grouped into sections \n\n' '2) List of genes and ASCII art showing section membership \n' '3) Detailed list of each gene and GO terms w/their P-values \n'), abbrev='D') @plac.opt('ratio', type=float, help="only show values where the difference between study " "and population ratios is greater than this. useful for " "excluding GO categories with small differences, but " "containing large numbers of genes. should be a value " "between 1 and 2. ") @plac.opt('relationships', abbrev='R', help=('Propagate counts up user-specified relationships ( comma separated ), which include: ' '{RELS}').format(RELS=' '.join(RELATIONSHIP_SET))) @plac.opt('method', type=str, help=Methods().getmsg_valid_methods()) @plac.opt('pvalcalc', type=str, help=str(FisherFactory()), abbrev='calc') @plac.opt('min_overlap', type=float, help="Check that a minimum amount of study genes are in the population", abbrev='M') @plac.opt('goslim', type=str, help="The GO slim file is used when grouping GO terms.") def run(name='human', taxid=9606, download=False, alpha=0.05, pval=.05, field='p_uncorrected', outfile='result.tsv', ns='BP,MF,CC', id2sym=None, detail='', sections=None, compare=False, ratio=None, prtstd=False, indent=False, noprop=False, relationship=False, relationships='', plot=False, enrich=False, method="bonferroni,sidak,holm,fdr_bh", pvalcalc="fisher", min_overlap=0.7, goslim='goslim_generic.obo', inc='', exc='', *study): # Construct arguments to pass down to GO. go_params = dict(alpha=alpha, pval=pval, pval_field=field, outfile=outfile,