def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws):
    """Set up a Gene Ontology enrichment analysis for one population.

    Args:
        pop: Collection of population item ids. Stored deduplicated in
            ``self.pop``; ``self.pop_n`` is the number of unique ids.
        assoc: Association data handed to ``update_association`` and
            ``get_terms``. NOTE(review): ``update_association``'s return
            value is discarded, so it presumably updates ``assoc`` in
            place -- confirm against that helper.
        obo_dag: GO DAG object handed to ``update_association`` and
            ``get_terms``.
        propagate_counts: When true, call ``update_association`` before
            the population terms are collected.
        alpha: Stored as ``self.alpha``.
        methods: Multiple-test method names given to ``Methods``;
            defaults to ``["bonferroni", "sidak", "holm"]``.
        **kws: ``name`` (default ``'GOEA'``), ``log`` (default
            ``sys.stdout``) and ``relationships`` are read here; the whole
            dict is also forwarded to ``FisherFactory``.
    """
    self.name = kws.get('name', 'GOEA')
    print('\nLoad {OBJNAME} Gene Ontology Analysis ...'.format(OBJNAME=self.name))
    self.log = kws.get('log', sys.stdout)
    # Dispatch table: multiple-test source name -> bound runner method.
    self._run_multitest = {
        'local': self._run_multitest_local,
        'statsmodels': self._run_multitest_statsmodels,
    }
    self.pop = set(pop)
    # Count the deduplicated population: the per-term item collections are
    # unique ids, so sizing the population from the raw input would inflate
    # pop_n whenever `pop` contains repeated ids.
    self.pop_n = len(self.pop)
    self.assoc = assoc
    self.obo_dag = obo_dag
    self.alpha = alpha
    if methods is None:
        methods = ["bonferroni", "sidak", "holm"]
    self.methods = Methods(methods)
    self.pval_obj = FisherFactory(**kws).pval_obj

    if propagate_counts:
        update_association(assoc, obo_dag, kws.get('relationships', None))
    ## BROAD broad_goids = get_goids_to_remove(kws.get('remove_goids'))
    ## BROAD if broad_goids:
    ## BROAD     assoc = self._remove_assc_goids(assoc, broad_goids)
    self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log)
def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws):
    """Initialise the enrichment study state for a population.

    Stores the population, association and DAG on the instance, builds
    the multiple-test method set and the p-value calculator, optionally
    propagates term counts to parents, and finally collects the
    population's GO terms into ``self.go2popitems``.

    ``kws`` may carry a ``log`` stream (``sys.stdout`` when absent); the
    complete ``kws`` dict is forwarded to ``FisherFactory``.
    """
    self.log = kws.get('log', sys.stdout)

    # Name of the multiple-test source -> callable that runs it.
    dispatch = {}
    dispatch['local'] = lambda iargs: self._run_multitest_local(iargs)
    dispatch['statsmodels'] = lambda iargs: self._run_multitest_statsmodels(iargs)
    self._run_multitest = dispatch

    self.pop = pop
    self.pop_n = len(pop)
    self.assoc = assoc
    self.obo_dag = obo_dag
    self.alpha = alpha

    method_names = ["bonferroni", "sidak", "holm"] if methods is None else methods
    self.methods = Methods(method_names)

    fisher_factory = FisherFactory(**kws)
    self.pval_obj = fisher_factory.pval_obj

    if propagate_counts:
        sys.stderr.write("Propagating term counts to parents ..\n")
        obo_dag.update_association(assoc)

    self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log)
def calc_qval(study_n, pop_n, pop, assoc, term_pop, obo_dag, T=500):
    """Generate p-value distribution for FDR based on resampling.

    Draws ``T`` random studies of ``study_n`` items from ``pop`` and, for
    each one, records the smallest uncorrected p-value over all terms
    counted in that random study.

    Args:
        study_n: Number of items to draw per resample.
        pop_n: Population size passed to the p-value calculation.
        pop: Population items to sample from. Any iterable is accepted;
            anything that is not a list/tuple is copied into a tuple once.
        assoc: Association data forwarded to ``count_terms``.
        term_pop: Mapping from term to its population count.
        obo_dag: GO DAG forwarded to ``count_terms``.
        T: Number of resampling rounds.

    Returns:
        A list of ``T`` values, the smallest p-value seen in each round
        (``1`` when no term produced a p-value below 1).
    """
    from goatools.pvalcalc import FisherFactory
    from goatools.ratio import count_terms

    sys.stderr.write("Generate p-value distribution for FDR "
                     "based on resampling (this might take a while)\n")

    # random.sample() only accepts sequences on Python 3.11+ (sets raise
    # TypeError). Older versions converted sets with tuple() on every call;
    # doing it once here keeps the same element order and hoists the copy
    # out of the loop.
    if not isinstance(pop, (list, tuple)):
        pop = tuple(pop)

    distribution = []
    calc_pvalue = FisherFactory().pval_obj.calc_pvalue
    for i in range(T):
        new_study = random.sample(pop, study_n)
        new_term_study = count_terms(new_study, assoc, obo_dag)

        smallest_p = 1
        for term, study_count in new_term_study.items():
            pop_count = term_pop[term]
            p_uncorrected = calc_pvalue(study_count, study_n, pop_count, pop_n)
            if p_uncorrected < smallest_p:
                smallest_p = p_uncorrected

        distribution.append(smallest_p)
        # Progress report every tenth round.
        if i % 10 == 0:
            sys.stderr.write("Sample {0} / {1}: "
                             "p-value {2}\n".format(i, T, smallest_p))
    return distribution
def _init_args(self):
    """Build the enrichment command-line parser, parse argv and return the args.

    With no command-line arguments the help text is printed and the
    process exits with a true (non-zero) status. Otherwise the raw
    arguments are first handed to ``self._prt_evidence_codes``, then
    parsed, and the resulting namespace is checked by
    ``self._check_input_files`` before being returned.
    """
    # NOTE(review): __doc__ is passed positionally, which argparse binds to
    # `prog` -- presumably intentional; confirm before changing.
    parser = argparse.ArgumentParser(
        __doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    help_sections = (
        'Use sections file for printing grouped GOEA results. '
        'Example SECTIONS values:\n'
        'goatools.test_data.sections.gjoneska_pfenning \n'
        'goatools/test_data/sections/gjoneska_pfenning.py \n'
        'data/gjoneska_pfenning/sections_in.txt\n')
    help_detail = (
        'Write enrichment results into a text file \n'
        'containing the following information: \n'
        '1) GOEA GO terms, grouped into sections \n\n'
        '2) List of genes and ASCII art showing section membership \n'
        '3) Detailed list of each gene and GO terms w/their P-values \n')
    help_compare = (
        "the population file as a comparison group. if this "
        "flag is specified, the population is used as the study "
        "plus the `population/comparison`")
    help_ratio = (
        "only show values where the difference between study "
        "and population ratios is greater than this. useful for "
        "excluding GO categories with small differences, but "
        "containing large numbers of genes. should be a value "
        "between 1 and 2. ")
    help_ns = (
        'Limit GOEA to specified branch categories. '
        'BP=Biological Process; '
        'MF=Molecular Function; '
        'CC=Cellular Component')

    # Positional input files.
    add('filenames', type=str, nargs=3,
        help='data/study data/population data/association')

    # Annotation selection.
    add('--annofmt', default=None, type=str,
        choices=['gene2go', 'gaf', 'gpad', 'id2gos'],
        help=('Annotation file format. '
              'Not needed if type can be determined using filename'))
    add('--taxid', default=9606, type=int,
        help="When using NCBI's gene2go annotation file, specify desired taxid")

    # Significance thresholds and output.
    add('--alpha', default=0.05, type=float,
        help='Test-wise alpha for multiple testing')
    add('--pval', default=.05, type=float,
        help='Only print results with uncorrected p-value < PVAL.')
    add('--pval_field', type=str,
        help='Only print results when PVAL_FIELD < PVAL.')
    add('--outfile', default=None, type=str,
        help='Write enrichment results into xlsx or tsv file')
    add('--ns', default='BP,MF,CC', type=str, help=help_ns)
    add('--id2sym', default=None, type=str,
        help='ASCII file containing one geneid and its symbol per line')
    add('--sections', default=None, type=str, help=help_sections)
    add('--outfile_detail', type=str, help=help_detail)

    # Study/population handling and result filtering.
    add('--compare', dest='compare', default=False, action='store_true',
        help=help_compare)
    add('--ratio', dest='ratio', type=float, default=None, help=help_ratio)
    add('--indent', dest='indent', default=False, action='store_true',
        help="indent GO terms")
    add('--obo', default="go-basic.obo", type=str,
        help="Specifies location and name of the obo file")
    add('--no_propagate_counts', default=False, action='store_true',
        help="Do not propagate counts to parent terms")

    # Statistics configuration.
    add('--method', default="bonferroni,sidak,holm,fdr_bh", type=str,
        help=Methods().getmsg_valid_methods())
    add('--pvalcalc', default="fisher", type=str,
        help=str(FisherFactory()))
    add('--min_overlap', default=0.7, type=float,
        help="Check that a minimum amount of study genes are in the population")
    add('--goslim', default='goslim_generic.obo', type=str,
        help="The GO slim file is used when grouping GO terms.")

    # Evidence-code options.
    add('--ev_inc', type=str,
        help="Include specified evidence codes and groups separated by commas")
    add('--ev_exc', type=str,
        help="Exclude specified evidence codes and groups separated by commas")
    add('--ev_help', dest='ev_help', action='store_false',
        help="Print all Evidence codes, with descriptions")
    add('--ev_help_short', dest='ev_help_short', action='store_false',
        help="Print all Evidence codes")

    if len(sys.argv) == 1:
        # print_help() returns None, so the exit argument is True.
        sys.exit(not parser.print_help())

    self._prt_evidence_codes(set(sys.argv[1:]))
    parsed = parser.parse_args()  # argparse Namespace
    self._check_input_files(parsed, parser)
    return parsed
"and population ratios is greater than this. useful for " "excluding GO categories with small differences, but " "containing large numbers of genes. should be a value " "between 1 and 2. ") p.add_argument('--indent', dest='indent', default=False, action='store_true', help="indent GO terms") p.add_argument('--obo', default="go-basic.obo", type=str, help="Specifies location and name of the obo file") p.add_argument('--no_propagate_counts', default=False, action='store_true', help="Do not propagate counts to parent terms") p.add_argument('--outfile', default=None, type=str, help="Write enrichment results into xlsx or tsv file") p.add_argument('--method', default="bonferroni,sidak,holm,fdr_bh", type=str, help=Methods().getmsg_valid_methods()) p.add_argument('--pvalcalc', default="fisher", type=str, help=str(FisherFactory())) if len(sys.argv) == 1: sys.exit(not p.print_help()) args = p.parse_args() check_input_files(args, p) min_ratio = args.ratio if min_ratio is not None: assert 1 <= min_ratio <= 2 study_fn, pop_fn, assoc_fn = args.filenames study, pop = read_geneset(study_fn, pop_fn, compare=args.compare) sys.stderr.write("Study: {0} vs. Population {1}\n".format( len(study), len(pop)))
def _init_args(self):
    """Create the enrichment argument parser and return the parsed namespace.

    If the script was started without arguments, the help text is printed
    and the process exits with a true (non-zero) status. The parsed
    namespace is passed through ``self._check_input_files`` before it is
    returned.
    """
    # NOTE(review): __doc__ is passed positionally, which argparse binds to
    # `prog` -- presumably intentional; confirm before changing.
    parser = argparse.ArgumentParser(
        __doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        'filenames',
        nargs=3,
        type=str,
        help='data/study data/population data/association',
    )
    parser.add_argument(
        '--alpha',
        type=float,
        default=0.05,
        help='Test-wise alpha for multiple testing',
    )
    parser.add_argument(
        '--pval',
        type=float,
        default=.05,
        help='Only print results with uncorrected p-value < PVAL.',
    )
    parser.add_argument(
        '--pval_field',
        type=str,
        help='Only print results when PVAL_FIELD < PVAL.',
    )
    parser.add_argument(
        '--outfile',
        type=str,
        default=None,
        help='Write enrichment results into xlsx or tsv file',
    )
    parser.add_argument(
        '--sections',
        type=str,
        default=None,
        help=('Use sections file for printing grouped GOEA results. '
              'Example SECTIONS values:\n'
              'goatools.test_data.sections.gjoneska_pfenning \n'
              'goatools/test_data/sections/gjoneska_pfenning.py \n'
              'data/gjoneska_pfenning/sections_in.txt\n'),
    )
    parser.add_argument(
        '--outfile_detail',
        type=str,
        help=('Write enrichment results into a text file \n'
              'containing the following information: \n'
              '1) GOEA GO terms, grouped into sections \n\n'
              '2) List of genes and ASCII art showing section membership \n'
              '3) Detailed list of each gene and GO terms w/their P-values \n'),
    )
    parser.add_argument(
        '--compare',
        dest='compare',
        action='store_true',
        default=False,
        help="the population file as a comparison group. if this "
             "flag is specified, the population is used as the study "
             "plus the `population/comparison`",
    )
    parser.add_argument(
        '--ratio',
        dest='ratio',
        type=float,
        default=None,
        help="only show values where the difference between study "
             "and population ratios is greater than this. useful for "
             "excluding GO categories with small differences, but "
             "containing large numbers of genes. should be a value "
             "between 1 and 2. ",
    )
    parser.add_argument(
        '--indent',
        dest='indent',
        action='store_true',
        default=False,
        help="indent GO terms",
    )
    parser.add_argument(
        '--obo',
        type=str,
        default="go-basic.obo",
        help="Specifies location and name of the obo file",
    )
    parser.add_argument(
        '--no_propagate_counts',
        action='store_true',
        default=False,
        help="Do not propagate counts to parent terms",
    )
    parser.add_argument(
        '--method',
        type=str,
        default="bonferroni,sidak,holm,fdr_bh",
        help=Methods().getmsg_valid_methods(),
    )
    parser.add_argument(
        '--pvalcalc',
        type=str,
        default="fisher",
        help=str(FisherFactory()),
    )
    parser.add_argument(
        '--min_overlap',
        type=float,
        default=0.7,
        help="Check that a minimum amount of study genes are in the population",
    )
    parser.add_argument(
        '--goslim',
        type=str,
        default='goslim_generic.obo',
        help="The GO slim file is used when grouping GO terms.",
    )

    no_cli_arguments = len(sys.argv) == 1
    if no_cli_arguments:
        # print_help() returns None, so the exit argument is True.
        sys.exit(not parser.print_help())

    namespace = parser.parse_args()
    self._check_input_files(namespace, parser)
    return namespace
def get_arg_parser():
    """Define the enrichment command-line options and return the parsed args.

    Prints the help text and exits with a true (non-zero) status when the
    script is run without arguments. Otherwise parses ``sys.argv``, runs
    ``_check_input_files`` on the result and returns the namespace.
    """
    # NOTE(review): __doc__ is passed positionally, which argparse binds to
    # `prog` -- presumably intentional; confirm before changing.
    parser = argparse.ArgumentParser(
        __doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_arg = parser.add_argument

    compare_help = ("the population file as a comparison group. if this "
                    "flag is specified, the population is used as the study "
                    "plus the `population/comparison`")
    ratio_help = ("only show values where the difference between study "
                  "and population ratios is greater than this. useful for "
                  "excluding GO categories with small differences, but "
                  "containing large numbers of genes. should be a value "
                  "between 1 and 2. ")

    # Required positional: the three input files.
    add_arg('filenames', type=str, nargs=3,
            help='data/study data/population data/association')

    # Thresholds.
    add_arg('--alpha', default=0.05, type=float,
            help="Test-wise alpha for multiple testing ")
    add_arg('--pval', default=.05, type=float,
            help="Only print out when uncorrected p-value < this value.")

    # Study/population handling and result filtering.
    add_arg('--compare', dest='compare', default=False, action='store_true',
            help=compare_help)
    add_arg('--ratio', dest='ratio', type=float, default=None,
            help=ratio_help)
    add_arg('--indent', dest='indent', default=False, action='store_true',
            help="indent GO terms")

    # Ontology input and output destination.
    add_arg('--obo', default="go-basic.obo", type=str,
            help="Specifies location and name of the obo file")
    add_arg('--no_propagate_counts', default=False, action='store_true',
            help="Do not propagate counts to parent terms")
    add_arg('--outfile', default=None, type=str,
            help="Write enrichment results into xlsx or tsv file")

    # Statistics configuration.
    add_arg('--method', default="bonferroni,sidak,holm,fdr_bh", type=str,
            help=Methods().getmsg_valid_methods())
    add_arg('--pvalcalc', default="fisher", type=str,
            help=str(FisherFactory()))
    add_arg('--min_overlap', default=0.7, type=float,
            help="Check that a minimum amount of study genes are in the population")

    if len(sys.argv) == 1:
        # print_help() returns None, so the exit argument is True.
        sys.exit(not parser.print_help())

    parsed_args = parser.parse_args()  # argparse Namespace
    _check_input_files(parsed_args, parser)
    return parsed_args
'containing the following information: \n' '1) GOEA GO terms, grouped into sections \n\n' '2) List of genes and ASCII art showing section membership \n' '3) Detailed list of each gene and GO terms w/their P-values \n'), abbrev='D') @plac.opt('ratio', type=float, help="only show values where the difference between study " "and population ratios is greater than this. useful for " "excluding GO categories with small differences, but " "containing large numbers of genes. should be a value " "between 1 and 2. ") @plac.opt('relationships', abbrev='R', help=('Propagate counts up user-specified relationships ( comma separated ), which include: ' '{RELS}').format(RELS=' '.join(RELATIONSHIP_SET))) @plac.opt('method', type=str, help=Methods().getmsg_valid_methods()) @plac.opt('pvalcalc', type=str, help=str(FisherFactory()), abbrev='calc') @plac.opt('min_overlap', type=float, help="Check that a minimum amount of study genes are in the population", abbrev='M') @plac.opt('goslim', type=str, help="The GO slim file is used when grouping GO terms.") def run(name='human', taxid=9606, download=False, alpha=0.05, pval=.05, field='p_uncorrected', outfile='result.tsv', ns='BP,MF,CC', id2sym=None, detail='', sections=None, compare=False, ratio=None, prtstd=False, indent=False, noprop=False, relationship=False, relationships='', plot=False, enrich=False, method="bonferroni,sidak,holm,fdr_bh", pvalcalc="fisher", min_overlap=0.7, goslim='goslim_generic.obo', inc='', exc='', *study): # Construct arguments to pass down to GO. go_params = dict(alpha=alpha, pval=pval, pval_field=field, outfile=outfile, ns=ns, id2sym=id2sym, outfile_detail=detail,