Esempio n. 1
0
    def __init__(self,
                 pop,
                 assoc,
                 obo_dag,
                 propagate_counts=True,
                 alpha=.05,
                 methods=None,
                 **kws):
        """Set up a Gene Ontology enrichment analysis for one population.

        Args:
            pop: Collection of population item ids. It is stored
                de-duplicated as a set in ``self.pop``; it must be
                re-iterable because it is also handed to ``get_terms``.
            assoc: Associations passed to ``update_association`` (when
                ``propagate_counts`` is true) and to ``get_terms``.
            obo_dag: GO DAG object passed to the same two helpers.
            propagate_counts: When true, ``update_association`` is called
                before the population terms are collected. Its return value
                is ignored, so it presumably updates ``assoc`` in place --
                TODO confirm.
            alpha: Stored as ``self.alpha``.
            methods: Multiple-test correction names wrapped in ``Methods``;
                defaults to ``["bonferroni", "sidak", "holm"]``.
            **kws: Optional settings. Read here: ``name`` (default
                ``'GOEA'``), ``log`` (default ``sys.stdout``) and
                ``relationships`` (default ``None``). The whole mapping is
                also forwarded to ``FisherFactory``.
        """
        self.name = kws.get('name', 'GOEA')
        print('\nLoad {OBJNAME} Gene Ontology Analysis ...'.format(
            OBJNAME=self.name))
        self.log = kws.get('log', sys.stdout)
        # Dispatch table: multiple-test source name -> bound runner method.
        self._run_multitest = {
            'local': self._run_multitest_local,
            'statsmodels': self._run_multitest_statsmodels
        }
        self.pop = set(pop)
        # Size of the de-duplicated population, so that pop_n always agrees
        # with self.pop even when the caller's input contains duplicates.
        self.pop_n = len(self.pop)
        self.assoc = assoc
        self.obo_dag = obo_dag
        self.alpha = alpha
        if methods is None:
            methods = ["bonferroni", "sidak", "holm"]
        self.methods = Methods(methods)
        self.pval_obj = FisherFactory(**kws).pval_obj

        if propagate_counts:
            update_association(assoc, obo_dag, kws.get('relationships', None))
        ## BROAD broad_goids = get_goids_to_remove(kws.get('remove_goids'))
        ## BROAD if broad_goids:
        ## BROAD     assoc = self._remove_assc_goids(assoc, broad_goids)
        self.go2popitems = get_terms("population", pop, assoc, obo_dag,
                                     self.log)
Esempio n. 2
0
    def __init__(self, pop, assoc, obo_dag,
                 propagate_counts=True, alpha=.05, methods=None, **kws):
        """Initialise the enrichment study from a population and associations.

        Args:
            pop: Population collection; stored as given in ``self.pop`` and
                its length in ``self.pop_n``.
            assoc: Associations, passed to ``obo_dag.update_association``
                (when propagating) and to ``get_terms``.
            obo_dag: GO DAG object; must provide ``update_association``.
            propagate_counts: When true, announce on stderr and call
                ``obo_dag.update_association(assoc)`` before collecting
                the population terms.
            alpha: Stored as ``self.alpha``.
            methods: Multiple-test correction names wrapped in ``Methods``;
                defaults to ``["bonferroni", "sidak", "holm"]``.
            **kws: Optional settings; ``log`` (default ``sys.stdout``) is
                read here and the whole mapping is forwarded to
                ``FisherFactory``.
        """
        self.log = kws.get('log', sys.stdout)
        # Dispatch table: multiple-test source name -> runner callable.
        self._run_multitest = dict(
            local=lambda iargs: self._run_multitest_local(iargs),
            statsmodels=lambda iargs: self._run_multitest_statsmodels(iargs),
        )
        self.pop = pop
        self.pop_n = len(pop)
        self.assoc, self.obo_dag, self.alpha = assoc, obo_dag, alpha
        correction_names = (["bonferroni", "sidak", "holm"]
                            if methods is None else methods)
        self.methods = Methods(correction_names)
        self.pval_obj = FisherFactory(**kws).pval_obj

        if propagate_counts:
            sys.stderr.write("Propagating term counts to parents ..\n")
            obo_dag.update_association(assoc)
        self.go2popitems = get_terms(
            "population", pop, assoc, obo_dag, self.log)
Esempio n. 3
0
def calc_qval(study_n, pop_n,
              pop, assoc, term_pop, obo_dag, T=500):
    """Generate p-value distribution for FDR based on resampling.

    Draws ``T`` random studies of ``study_n`` items from ``pop`` and, for
    each one, records the smallest uncorrected p-value over all terms
    counted in that random study.

    Args:
        study_n: Number of items drawn per resampled study; also passed to
            the p-value calculation as the study size.
        pop_n: Population size passed to the p-value calculation.
        pop: Population to sample from. Any iterable is accepted; it is
            materialized once as a tuple.
        assoc: Associations forwarded to ``count_terms``.
        term_pop: Mapping indexed by term that yields the population count
            for that term.
        obo_dag: GO DAG object forwarded to ``count_terms``.
        T: Number of resampling rounds.

    Returns:
        A list of ``T`` values, the smallest p-value seen in each round
        (``1`` for a round in which no term was counted).

    Raises:
        ValueError: From ``random.sample`` when ``study_n`` exceeds the
            population size.
        KeyError: When a resampled term is missing from ``term_pop``
            (if ``term_pop`` is a plain dict).
    """
    from goatools.pvalcalc import FisherFactory
    from goatools.ratio import count_terms
    sys.stderr.write("Generate p-value distribution for FDR "
                     "based on resampling (this might take a while)\n")
    calc_pvalue = FisherFactory().pval_obj.calc_pvalue
    # random.sample() needs a sequence: passing a set is deprecated since
    # Python 3.9 and raises TypeError since 3.11. Converting once up front
    # also avoids redoing the conversion on every round.
    population = tuple(pop)
    distribution = []
    for i in range(T):
        new_study = random.sample(population, study_n)
        new_term_study = count_terms(new_study, assoc, obo_dag)

        smallest_p = 1
        for term, study_count in new_term_study.items():
            p_uncorrected = calc_pvalue(study_count,
                                        study_n,
                                        term_pop[term],
                                        pop_n)
            if p_uncorrected < smallest_p:
                smallest_p = p_uncorrected

        distribution.append(smallest_p)
        # Progress report on every tenth round.
        if i % 10 == 0:
            sys.stderr.write("Sample {0} / {1}: "
                             "p-value {2}\n".format(i, T, smallest_p))
    return distribution
Esempio n. 4
0
    def _init_args(self):
        """Build the enrichment command-line parser and return parsed args.

        When the program is started without any command-line argument the
        help text is printed and the process exits. Otherwise the raw
        arguments are first handed to ``self._prt_evidence_codes``, then
        ``sys.argv`` is parsed and the resulting namespace is passed to
        ``self._check_input_files`` before being returned.
        """
        # NOTE(review): ``__doc__`` is passed positionally, which argparse
        # binds to ``prog`` rather than ``description`` -- confirm intended.
        parser = argparse.ArgumentParser(
            __doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        add = parser.add_argument

        # Positional input files.
        add('filenames', nargs=3, type=str,
            help='data/study data/population data/association')

        # Annotation source.
        add('--annofmt', type=str,
            choices=['gene2go', 'gaf', 'gpad', 'id2gos'],
            help=('Annotation file format. '
                  'Not needed if type can be determined using filename'))
        add('--taxid', type=int, default=9606,
            help=(
                "When using NCBI's gene2go annotation file, specify desired taxid"))

        # Significance thresholds and result filtering.
        add('--alpha', type=float, default=0.05,
            help='Test-wise alpha for multiple testing')
        add('--pval', type=float, default=.05,
            help='Only print results with uncorrected p-value < PVAL.')
        add('--pval_field', type=str,
            help='Only print results when PVAL_FIELD < PVAL.')

        # Output and grouping.
        add('--outfile', type=str,
            help='Write enrichment results into xlsx or tsv file')
        add('--ns', type=str, default='BP,MF,CC',
            help=('Limit GOEA to specified branch categories. '
                  'BP=Biological Process; '
                  'MF=Molecular Function; '
                  'CC=Cellular Component'))
        add('--id2sym', type=str,
            help='ASCII file containing one geneid and its symbol per line')
        add('--sections', type=str,
            help=('Use sections file for printing grouped GOEA results. '
                  'Example SECTIONS values:\n'
                  'goatools.test_data.sections.gjoneska_pfenning \n'
                  'goatools/test_data/sections/gjoneska_pfenning.py \n'
                  'data/gjoneska_pfenning/sections_in.txt\n'))
        add('--outfile_detail', type=str,
            help=('Write enrichment results into a text file \n'
                  'containing the following information: \n'
                  '1) GOEA GO terms, grouped into sections \n\n'
                  '2) List of genes and ASCII art showing section membership \n'
                  '3) Detailed list of each gene and GO terms w/their P-values \n'))

        # Study/population handling.
        add('--compare', action='store_true',
            help=("the population file as a comparison group. if this "
                  "flag is specified, the population is used as the study "
                  "plus the `population/comparison`"))
        add('--ratio', type=float,
            help=("only show values where the difference between study "
                  "and population ratios is greater than this. useful for "
                  "excluding GO categories with small differences, but "
                  "containing large numbers of genes. should be a value "
                  "between 1 and 2. "))
        add('--indent', action='store_true', help="indent GO terms")
        add('--obo', type=str, default="go-basic.obo",
            help="Specifies location and name of the obo file")
        add('--no_propagate_counts', action='store_true',
            help="Do not propagate counts to parent terms")

        # Statistics.
        add('--method', type=str, default="bonferroni,sidak,holm,fdr_bh",
            help=Methods().getmsg_valid_methods())
        add('--pvalcalc', type=str, default="fisher",
            help=str(FisherFactory()))
        add('--min_overlap', type=float, default=0.7,
            help=(
                "Check that a minimum amount of study genes are in the population"))
        add('--goslim', type=str, default='goslim_generic.obo',
            help="The GO slim file is used when grouping GO terms.")

        # Evidence codes.
        add('--ev_inc', type=str,
            help=(
                "Include specified evidence codes and groups separated by commas"))
        add('--ev_exc', type=str,
            help=(
                "Exclude specified evidence codes and groups separated by commas"))
        # These two flags use store_false, so their parsed value is True
        # unless the flag is given.
        add('--ev_help', action='store_false',
            help="Print all Evidence codes, with descriptions")
        add('--ev_help_short', action='store_false',
            help="Print all Evidence codes")

        # No arguments at all: show the help and exit. print_help() returns
        # None, so the value handed to sys.exit() is True.
        if len(sys.argv) == 1:
            sys.exit(not parser.print_help())
        self._prt_evidence_codes(set(sys.argv[1:]))
        parsed = parser.parse_args()  # argparse Namespace
        self._check_input_files(parsed, parser)
        return parsed
Esempio n. 5
0
                   "and population ratios is greater than this. useful for "
                   "excluding GO categories with small differences, but "
                   "containing large numbers of genes. should be a value "
                   "between 1 and 2. ")
    p.add_argument('--indent', dest='indent', default=False,
                   action='store_true', help="indent GO terms")
    p.add_argument('--obo', default="go-basic.obo", type=str,
                   help="Specifies location and name of the obo file")
    p.add_argument('--no_propagate_counts', default=False, action='store_true',
                   help="Do not propagate counts to parent terms")
    p.add_argument('--outfile', default=None, type=str,
                   help="Write enrichment results into xlsx or tsv file")
    p.add_argument('--method', default="bonferroni,sidak,holm,fdr_bh", type=str,
                   help=Methods().getmsg_valid_methods())
    p.add_argument('--pvalcalc', default="fisher", type=str,
                   help=str(FisherFactory()))

    if len(sys.argv) == 1:
        sys.exit(not p.print_help())

    args = p.parse_args()
    check_input_files(args, p)

    min_ratio = args.ratio
    if min_ratio is not None:
        assert 1 <= min_ratio <= 2

    study_fn, pop_fn, assoc_fn = args.filenames
    study, pop = read_geneset(study_fn, pop_fn, compare=args.compare)
    sys.stderr.write("Study: {0} vs. Population {1}\n".format(
        len(study), len(pop)))
Esempio n. 6
0
    def _init_args(self):
        """Build the enrichment command-line parser and return parsed args.

        When the program is started without any command-line argument the
        help text is printed and the process exits. Otherwise ``sys.argv``
        is parsed and the resulting namespace is passed to
        ``self._check_input_files`` before being returned.
        """
        # NOTE(review): ``__doc__`` is passed positionally, which argparse
        # binds to ``prog`` rather than ``description`` -- confirm intended.
        parser = argparse.ArgumentParser(
            __doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        add = parser.add_argument

        # Positional input files.
        add('filenames', nargs=3, type=str,
            help='data/study data/population data/association')

        # Significance thresholds and result filtering.
        add('--alpha', type=float, default=0.05,
            help='Test-wise alpha for multiple testing')
        add('--pval', type=float, default=.05,
            help='Only print results with uncorrected p-value < PVAL.')
        add('--pval_field', type=str,
            help='Only print results when PVAL_FIELD < PVAL.')

        # Output and grouping.
        add('--outfile', type=str,
            help='Write enrichment results into xlsx or tsv file')
        add('--sections', type=str,
            help=('Use sections file for printing grouped GOEA results. '
                  'Example SECTIONS values:\n'
                  'goatools.test_data.sections.gjoneska_pfenning \n'
                  'goatools/test_data/sections/gjoneska_pfenning.py \n'
                  'data/gjoneska_pfenning/sections_in.txt\n'))
        add('--outfile_detail', type=str,
            help=('Write enrichment results into a text file \n'
                  'containing the following information: \n'
                  '1) GOEA GO terms, grouped into sections \n\n'
                  '2) List of genes and ASCII art showing section membership \n'
                  '3) Detailed list of each gene and GO terms w/their P-values \n'))

        # Study/population handling.
        add('--compare', action='store_true',
            help=("the population file as a comparison group. if this "
                  "flag is specified, the population is used as the study "
                  "plus the `population/comparison`"))
        add('--ratio', type=float,
            help=("only show values where the difference between study "
                  "and population ratios is greater than this. useful for "
                  "excluding GO categories with small differences, but "
                  "containing large numbers of genes. should be a value "
                  "between 1 and 2. "))
        add('--indent', action='store_true', help="indent GO terms")
        add('--obo', type=str, default="go-basic.obo",
            help="Specifies location and name of the obo file")
        add('--no_propagate_counts', action='store_true',
            help="Do not propagate counts to parent terms")

        # Statistics.
        add('--method', type=str, default="bonferroni,sidak,holm,fdr_bh",
            help=Methods().getmsg_valid_methods())
        add('--pvalcalc', type=str, default="fisher",
            help=str(FisherFactory()))
        add('--min_overlap', type=float, default=0.7,
            help=(
                "Check that a minimum amount of study genes are in the population"))
        add('--goslim', type=str, default='goslim_generic.obo',
            help="The GO slim file is used when grouping GO terms.")

        # No arguments at all: show the help and exit. print_help() returns
        # None, so the value handed to sys.exit() is True.
        if len(sys.argv) == 1:
            sys.exit(not parser.print_help())

        parsed = parser.parse_args()  # argparse Namespace
        self._check_input_files(parsed, parser)
        return parsed
Esempio n. 7
0
def get_arg_parser():
    """Build the enrichment command-line parser and return the parsed args.

    Despite its name, this returns the parsed ``argparse`` namespace, not
    the parser. When the program is started without any command-line
    argument the help text is printed and the process exits. Otherwise
    ``sys.argv`` is parsed and the namespace is passed to
    ``_check_input_files`` before being returned.
    """
    # NOTE(review): ``__doc__`` is passed positionally, which argparse binds
    # to ``prog`` rather than ``description`` -- confirm intended.
    parser = argparse.ArgumentParser(
        __doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    # Positional input files.
    add('filenames', nargs=3, type=str,
        help='data/study data/population data/association')

    # Significance thresholds.
    add('--alpha', type=float, default=0.05,
        help="Test-wise alpha for multiple testing ")
    add('--pval', type=float, default=.05,
        help="Only print out when uncorrected p-value < this value.")

    # Study/population handling.
    add('--compare', action='store_true',
        help=("the population file as a comparison group. if this "
              "flag is specified, the population is used as the study "
              "plus the `population/comparison`"))
    add('--ratio', type=float,
        help=("only show values where the difference between study "
              "and population ratios is greater than this. useful for "
              "excluding GO categories with small differences, but "
              "containing large numbers of genes. should be a value "
              "between 1 and 2. "))
    add('--indent', action='store_true', help="indent GO terms")
    add('--obo', type=str, default="go-basic.obo",
        help="Specifies location and name of the obo file")
    add('--no_propagate_counts', action='store_true',
        help="Do not propagate counts to parent terms")
    add('--outfile', type=str,
        help="Write enrichment results into xlsx or tsv file")

    # Statistics.
    add('--method', type=str, default="bonferroni,sidak,holm,fdr_bh",
        help=Methods().getmsg_valid_methods())
    add('--pvalcalc', type=str, default="fisher",
        help=str(FisherFactory()))
    add('--min_overlap', type=float, default=0.7,
        help="Check that a minimum amount of study genes are in the population")

    # No arguments at all: show the help and exit. print_help() returns
    # None, so the value handed to sys.exit() is True.
    if len(sys.argv) == 1:
        sys.exit(not parser.print_help())

    parsed = parser.parse_args()  # argparse Namespace
    _check_input_files(parsed, parser)
    return parsed
Esempio n. 8
0
                'containing the following information: \n'
                '1) GOEA GO terms, grouped into sections \n\n'
                '2) List of genes and ASCII art showing section membership \n'
                '3) Detailed list of each gene and GO terms w/their P-values \n'),
          abbrev='D')
@plac.opt('ratio', type=float,
          help="only show values where the difference between study "
               "and population ratios is greater than this. useful for "
               "excluding GO categories with small differences, but "
               "containing large numbers of genes. should be a value "
               "between 1 and 2. ")
@plac.opt('relationships', abbrev='R',
          help=('Propagate counts up user-specified relationships ( comma separated ), which include: '
                '{RELS}').format(RELS=' '.join(RELATIONSHIP_SET)))
@plac.opt('method', type=str, help=Methods().getmsg_valid_methods())
@plac.opt('pvalcalc', type=str, help=str(FisherFactory()), abbrev='calc')
@plac.opt('min_overlap', type=float,
          help="Check that a minimum amount of study genes are in the population",
          abbrev='M')
@plac.opt('goslim', type=str, help="The GO slim file is used when grouping GO terms.")
def run(name='human', taxid=9606, download=False,
        alpha=0.05, pval=.05, field='p_uncorrected', outfile='result.tsv',
        ns='BP,MF,CC', id2sym=None, detail='', sections=None,
        compare=False, ratio=None, prtstd=False, indent=False,
        noprop=False, relationship=False, relationships='', plot=False,
        enrich=False, method="bonferroni,sidak,holm,fdr_bh", pvalcalc="fisher",
        min_overlap=0.7, goslim='goslim_generic.obo', inc='', exc='',
        *study):
    # Construct arguments to pass down to GO.
    go_params = dict(alpha=alpha, pval=pval, pval_field=field, outfile=outfile,
                     ns=ns, id2sym=id2sym, outfile_detail=detail,