Exemple #1
0
    def run_study(self, study, **kws):
        """Run a Gene Ontology Enrichment Analysis (GOEA) on the study ids.

        Recognized keyword arguments:
            methods: correction methods, wrapped in ``Methods``; defaults to
                ``self.methods``.
            alpha: value handed to the multiple-test correction; defaults to
                ``self.alpha``.
            log: stream that receives the results summary when it is not
                ``None``; defaults to ``self.log``.
            keep_if: predicate applied to every result record; only records
                for which it returns a true value are returned.

        Returns the list of result records (GOEnrichmentRecord objects) sorted
        by ``NS``, then ``enrichment``, then ``p_uncorrected``.  An empty list
        is returned when no uncorrected p-values were produced.
        """
        # Resolve the per-call overrides before doing any work.
        if 'methods' in kws:
            methods = Methods(kws['methods'])
        else:
            methods = self.methods
        alpha = kws.get('alpha', self.alpha)
        log = kws.get('log', self.log)

        # Uncorrected p-values come first; nothing else to do without them.
        records = self.get_pval_uncorr(study, log)
        if not records:
            return []

        if log is not None:
            summary = "\n  ".join(self.get_results_msg(records, study))
            log.write("  {MSG}\n".format(MSG=summary))

        # Multiple-test corrections update the records in place.
        self._run_multitest_corr(records, methods, alpha, study, log)

        # Attach the GO term (name and level) to every record.
        for record in records:
            record.set_goterm(self.obo_dag)

        # 'keep_if' can be used to keep only significant GO terms. Example:
        #     keep_if = lambda nt: nt.p_fdr_bh < 0.05 # if results are significant
        #     goea_results = goeaobj.run_study(geneids_study, keep_if=keep_if)
        if 'keep_if' in kws:
            is_kept = kws['keep_if']
            records = [record for record in records if is_kept(record)]

        def _sort_key(record):
            """Order by namespace, then enrichment, then uncorrected p-value."""
            return [record.NS, record.enrichment, record.p_uncorrected]

        records.sort(key=_sort_key)
        return records
Exemple #2
0
    def __init__(self,
                 pop,
                 assoc,
                 obo_dag,
                 propagate_counts=True,
                 alpha=.05,
                 methods=None,
                 **kws):
        """Set up a GO enrichment analysis for one population.

        Args:
            pop: collection of population ids; stored de-duplicated as a set.
            assoc: association data handed to ``update_association`` and
                ``get_terms``.
            obo_dag: GO DAG object, stored and handed to the same helpers.
            propagate_counts: when true, ``update_association`` is called on
                ``assoc`` before the population terms are collected.
            alpha: default alpha stored on the object.
            methods: names of the multiple-test correction methods; defaults
                to bonferroni, sidak and holm.
            **kws: ``name`` (label used in messages, default ``'GOEA'``),
                ``log`` (output stream, default ``sys.stdout``) and
                ``relationships`` are read here; the complete dict is also
                forwarded to ``FisherFactory``.
        """
        self.name = kws.get('name', 'GOEA')
        print('\nLoad {OBJNAME} Gene Ontology Analysis ...'.format(
            OBJNAME=self.name))
        self.log = kws.get('log', sys.stdout)
        self._run_multitest = {
            'local': self._run_multitest_local,
            'statsmodels': self._run_multitest_statsmodels
        }
        self.pop = set(pop)
        # Count the de-duplicated population: len(pop) would over-count when
        # the caller passes a list containing repeated ids, leaving pop_n out
        # of step with the set stored in self.pop.
        self.pop_n = len(self.pop)
        self.assoc = assoc
        self.obo_dag = obo_dag
        self.alpha = alpha
        if methods is None:
            methods = ["bonferroni", "sidak", "holm"]
        self.methods = Methods(methods)
        self.pval_obj = FisherFactory(**kws).pval_obj

        if propagate_counts:
            update_association(assoc, obo_dag, kws.get('relationships', None))
        ## BROAD broad_goids = get_goids_to_remove(kws.get('remove_goids'))
        ## BROAD if broad_goids:
        ## BROAD     assoc = self._remove_assc_goids(assoc, broad_goids)
        self.go2popitems = get_terms("population", pop, assoc, obo_dag,
                                     self.log)
Exemple #3
0
    def __init__(self,
                 pop,
                 assoc,
                 obo_dag,
                 propagate_counts=True,
                 alpha=.05,
                 methods=None,
                 **kws):
        """Store the population, associations and GO DAG for later studies.

        ``kws`` may carry a ``log`` stream (default ``sys.stdout``); the whole
        dict is also forwarded to ``FisherFactory``.  When ``methods`` is not
        given, bonferroni, sidak and holm are used.  With ``propagate_counts``
        enabled, ``obo_dag.update_association(assoc)`` runs before the
        population terms are collected.
        """
        # Plain inputs are stored unchanged.
        self.pop = pop
        self.pop_n = len(pop)
        self.assoc = assoc
        self.obo_dag = obo_dag
        self.alpha = alpha
        self.log = kws.get('log', sys.stdout)

        # Dispatch table: correction source -> runner for that source.
        self._run_multitest = {
            'local': lambda iargs: self._run_multitest_local(iargs),
            'statsmodels': lambda iargs: self._run_multitest_statsmodels(iargs),
        }

        default_methods = ["bonferroni", "sidak", "holm"]
        self.methods = Methods(default_methods if methods is None else methods)
        self.pval_obj = FisherFactory(**kws).pval_obj

        if propagate_counts:
            sys.stderr.write("Propagating term counts to parents ..\n")
            obo_dag.update_association(assoc)
        self.go2popitems = get_terms(
            "population", pop, assoc, obo_dag, self.log)
Exemple #4
0
    def run_study(self, study, **kws):
        """Run a Gene Ontology Enrichment Analysis (GOEA) on the study ids.

        Recognized keyword arguments:
            name: label for the study set in the log banner (default
                ``'current'``).
            methods: correction methods, wrapped in ``Methods``; defaults to
                ``self.methods``.
            alpha: value handed to the multiple-test correction; defaults to
                ``self.alpha``.
            keep_if: predicate applied to every result record; only records
                for which it returns a true value are returned.
        The log stream is obtained from ``self._get_log_or_prt(kws)``.

        Returns the list of result records (GOEnrichmentRecord objects) sorted
        by ``enrichment``, then ``NS``, then ``p_uncorrected``.  An empty list
        is returned for an empty study or when no uncorrected p-values were
        produced.
        """
        study_name = kws.get('name', 'current')
        log = self._get_log_or_prt(kws)
        if log:
            banner = '\nRun {OBJNAME} Gene Ontology Analysis: {STU} study set of {N} IDs ...'
            log.write(banner.format(
                OBJNAME=self.name, N=len(study), STU=study_name))
        if not study:
            return []

        # Resolve the per-call overrides.
        if 'methods' in kws:
            methods = Methods(kws['methods'])
        else:
            methods = self.methods
        alpha = kws.get('alpha', self.alpha)

        # Uncorrected p-values come first; nothing else to do without them.
        records = self.get_pval_uncorr(study, log)
        if not records:
            return []

        if log is not None:
            summary = "\n  ".join(self.get_results_msg(records, study))
            log.write("  {MSG}\n".format(MSG=summary))

        # Multiple-test corrections update the records in place.
        self._run_multitest_corr(records, methods, alpha, study, log)

        # Attach the GO term (name and level) to every record.
        for record in records:
            record.set_goterm(self.obo_dag)

        # 'keep_if' can be used to keep only significant GO terms. Example:
        #     keep_if = lambda nt: nt.p_fdr_bh < 0.05 # if results are significant
        #     goea_results = goeaobj.run_study(geneids_study, keep_if=keep_if)
        if 'keep_if' in kws:
            is_kept = kws['keep_if']
            records = [record for record in records if is_kept(record)]

        def _sort_key(record):
            """Order by enrichment, then namespace, then uncorrected p-value."""
            return [record.enrichment, record.NS, record.p_uncorrected]

        records.sort(key=_sort_key)
        return records
def test_init_methods():
    """Check the methods held by a ``Methods`` object at each setup stage.

    Covers the default construction, adding one statsmodels method, and
    re-initializing with every statsmodels method using the ``sm_`` prefix.
    """
    mobj = Methods()
    assert mobj._srcmethod2fieldname == get_exp_fieldnames()
    assert mobj.getmsg_valid_methods() == get_expstr_fieldnames()

    # A freshly built object holds only the local bonferroni method.
    make_nt = mobj.NtMethodInfo
    local_bonferroni = make_nt(
        source='local', method='bonferroni', fieldname='bonferroni')
    assert mobj.methods == [local_bonferroni]

    # Adding a statsmodels method appends it after the default.
    mobj._add_method_src('statsmodels', 'fdr_bh')
    statsmodels_fdr_bh = make_nt(
        source='statsmodels', method='fdr_bh', fieldname='fdr_bh')
    assert mobj.methods == [local_bonferroni, statsmodels_fdr_bh]

    # Re-initialize with all statsmodels methods, each prefixed with 'sm_'.
    sm_methods = ['sm_{}'.format(m) for m in mobj.all_methods[1][1]] # statsmodels
    mobj._init_methods(sm_methods)
    expected_names = [
        'bonferroni',
        'sidak',
        'holm-sidak',
        'holm',
        'simes-hochberg',
        'hommel',
        'fdr_bh',
        'fdr_by',
        'fdr_tsbh',
        'fdr_tsbky',
        'fdr_gbs',
    ]
    assert mobj.methods == [
        make_nt(source='statsmodels', method=name, fieldname='sm_' + name)
        for name in expected_names]
Exemple #6
0
    def _init_args(self):
        """Define the enrichment command-line options and return the parsed args.

        When the script is started without any arguments the help text is
        printed and the process exits.  Otherwise the command-line words are
        passed to ``self._prt_evidence_codes``, ``sys.argv`` is parsed, and the
        namespace is passed through ``self._check_input_files`` before being
        returned.
        """
        parser = argparse.ArgumentParser(
            __doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        add = parser.add_argument

        # Long help texts, kept apart from the option definitions below.
        sections_help = ('Use sections file for printing grouped GOEA results. '
                         'Example SECTIONS values:\n'
                         'goatools.test_data.sections.gjoneska_pfenning \n'
                         'goatools/test_data/sections/gjoneska_pfenning.py \n'
                         'data/gjoneska_pfenning/sections_in.txt\n')
        detail_help = (
            'Write enrichment results into a text file \n'
            'containing the following information: \n'
            '1) GOEA GO terms, grouped into sections \n\n'
            '2) List of genes and ASCII art showing section membership \n'
            '3) Detailed list of each gene and GO terms w/their P-values \n')
        compare_help = ("the population file as a comparison group. if this "
                        "flag is specified, the population is used as the study "
                        "plus the `population/comparison`")
        ratio_help = ("only show values where the difference between study "
                      "and population ratios is greater than this. useful for "
                      "excluding GO categories with small differences, but "
                      "containing large numbers of genes. should be a value "
                      "between 1 and 2. ")
        ns_help = ('Limit GOEA to specified branch categories. '
                   'BP=Biological Process; '
                   'MF=Molecular Function; '
                   'CC=Cellular Component')

        # Input files and annotation format
        add('filenames', type=str, nargs=3,
            help='data/study data/population data/association')
        add('--annofmt', default=None, type=str,
            help=('Annotation file format. '
                  'Not needed if type can be determined using filename'),
            choices=['gene2go', 'gaf', 'gpad', 'id2gos'])
        add('--taxid', default=9606, type=int,
            help="When using NCBI's gene2go annotation file, specify desired taxid")

        # Significance thresholds and result output
        add('--alpha', default=0.05, type=float,
            help='Test-wise alpha for multiple testing')
        add('--pval', default=.05, type=float,
            help='Only print results with uncorrected p-value < PVAL.')
        add('--pval_field', type=str,
            help='Only print results when PVAL_FIELD < PVAL.')
        add('--outfile', default=None, type=str,
            help='Write enrichment results into xlsx or tsv file')
        add('--ns', default='BP,MF,CC', type=str, help=ns_help)
        add('--id2sym', default=None, type=str,
            help='ASCII file containing one geneid and its symbol per line')
        add('--sections', default=None, type=str, help=sections_help)
        add('--outfile_detail', type=str, help=detail_help)

        # Study/population handling and display
        add('--compare', dest='compare', default=False, action='store_true',
            help=compare_help)
        add('--ratio', dest='ratio', type=float, default=None, help=ratio_help)
        add('--indent', dest='indent', default=False, action='store_true',
            help="indent GO terms")
        add('--obo', default="go-basic.obo", type=str,
            help="Specifies location and name of the obo file")
        add('--no_propagate_counts', default=False, action='store_true',
            help="Do not propagate counts to parent terms")

        # Statistics
        add('--method', default="bonferroni,sidak,holm,fdr_bh", type=str,
            help=Methods().getmsg_valid_methods())
        add('--pvalcalc', default="fisher", type=str,
            help=str(FisherFactory()))
        add('--min_overlap', default=0.7, type=float,
            help="Check that a minimum amount of study genes are in the population")
        add('--goslim', default='goslim_generic.obo', type=str,
            help="The GO slim file is used when grouping GO terms.")

        # Evidence codes
        add('--ev_inc', type=str,
            help="Include specified evidence codes and groups separated by commas")
        add('--ev_exc', type=str,
            help="Exclude specified evidence codes and groups separated by commas")
        add('--ev_help', dest='ev_help', action='store_false',
            help="Print all Evidence codes, with descriptions")
        add('--ev_help_short', dest='ev_help_short', action='store_false',
            help="Print all Evidence codes")

        if len(sys.argv) == 1:
            # print_help() returns None, so this is sys.exit(True).
            sys.exit(not parser.print_help())
        self._prt_evidence_codes(set(sys.argv[1:]))
        args = parser.parse_args()  # Namespace object from argparse
        self._check_input_files(args, parser)
        return args
Exemple #7
0
                 "containing large numbers of genes. should be a value "
                 "between 1 and 2. ")
    p.add_argument('--fdr', dest='fdr', default=False,
                 action='store_true',
                 help="Calculate the false discovery rate (alt. to the "
                 "Bonferroni but slower)")
    p.add_argument('--indent', dest='indent', default=False,
                 action='store_true', help="indent GO terms")
    p.add_argument('--obo', default="go-basic.obo", type=str,
                 help="Specifies location and name of the obo file")
    p.add_argument('--no_propagate_counts', default=False, action='store_true',
                 help="Do not propagate counts to parent terms")
    p.add_argument('--outfile', default=None, type=str,
                 help="Write enrichment results into xlsx or tsv file")
    p.add_argument('--method', default="bonferroni,sidak,holm", type=str,
                 help=Methods().getmsg_valid_methods())

    args = p.parse_args()
    check_input_files(args, p)

    min_ratio = args.ratio
    if min_ratio is not None:
        assert 1 <= min_ratio <= 2

    study_fn, pop_fn, assoc_fn = args.filenames
    study, pop = read_geneset(study_fn, pop_fn, compare=args.compare)
    print("Study: {0} vs. Population {1}".format(len(study), len(pop)), file=sys.stderr)

    if not args.compare:  # sanity check
        if len(pop) < len(study):
            exit("\nERROR: The study file contains more elements than the population file. "
Exemple #8
0
    def _init_args(self):
        """Define the enrichment command-line options and return the parsed args.

        When the script is started without any arguments the help text is
        printed and the process exits.  Otherwise ``sys.argv`` is parsed and
        the namespace is passed through ``self._check_input_files`` before
        being returned.
        """
        parser = argparse.ArgumentParser(
            __doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        add = parser.add_argument

        # Long help texts, kept apart from the option definitions below.
        sections_help = ('Use sections file for printing grouped GOEA results. '
                         'Example SECTIONS values:\n'
                         'goatools.test_data.sections.gjoneska_pfenning \n'
                         'goatools/test_data/sections/gjoneska_pfenning.py \n'
                         'data/gjoneska_pfenning/sections_in.txt\n')
        detail_help = (
            'Write enrichment results into a text file \n'
            'containing the following information: \n'
            '1) GOEA GO terms, grouped into sections \n\n'
            '2) List of genes and ASCII art showing section membership \n'
            '3) Detailed list of each gene and GO terms w/their P-values \n')
        compare_help = ("the population file as a comparison group. if this "
                        "flag is specified, the population is used as the study "
                        "plus the `population/comparison`")
        ratio_help = ("only show values where the difference between study "
                      "and population ratios is greater than this. useful for "
                      "excluding GO categories with small differences, but "
                      "containing large numbers of genes. should be a value "
                      "between 1 and 2. ")

        # Input files
        add('filenames', type=str, nargs=3,
            help='data/study data/population data/association')

        # Significance thresholds and result output
        add('--alpha', default=0.05, type=float,
            help='Test-wise alpha for multiple testing')
        add('--pval', default=.05, type=float,
            help='Only print results with uncorrected p-value < PVAL.')
        add('--pval_field', type=str,
            help='Only print results when PVAL_FIELD < PVAL.')
        add('--outfile', default=None, type=str,
            help='Write enrichment results into xlsx or tsv file')
        add('--sections', default=None, type=str, help=sections_help)
        add('--outfile_detail', type=str, help=detail_help)

        # Study/population handling and display
        add('--compare', dest='compare', default=False, action='store_true',
            help=compare_help)
        add('--ratio', dest='ratio', type=float, default=None, help=ratio_help)
        add('--indent', dest='indent', default=False, action='store_true',
            help="indent GO terms")
        add('--obo', default="go-basic.obo", type=str,
            help="Specifies location and name of the obo file")
        add('--no_propagate_counts', default=False, action='store_true',
            help="Do not propagate counts to parent terms")

        # Statistics
        add('--method', default="bonferroni,sidak,holm,fdr_bh", type=str,
            help=Methods().getmsg_valid_methods())
        add('--pvalcalc', default="fisher", type=str,
            help=str(FisherFactory()))
        add('--min_overlap', default=0.7, type=float,
            help="Check that a minimum amount of study genes are in the population")
        add('--goslim', default='goslim_generic.obo', type=str,
            help="The GO slim file is used when grouping GO terms.")

        if len(sys.argv) == 1:
            # print_help() returns None, so this is sys.exit(True).
            sys.exit(not parser.print_help())

        args = parser.parse_args()  # Namespace object from argparse
        self._check_input_files(args, parser)
        return args
def get_arg_parser():
    """Define the enrichment command-line options and return the parsed args.

    Despite its name, this returns the argparse namespace rather than the
    parser.  When the script is started without any arguments the help text
    is printed and the process exits.  Otherwise ``sys.argv`` is parsed and
    the namespace is passed through ``_check_input_files`` before being
    returned.
    """
    parser = argparse.ArgumentParser(
        __doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    # Long help texts, kept apart from the option definitions below.
    compare_help = ("the population file as a comparison group. if this "
                    "flag is specified, the population is used as the study "
                    "plus the `population/comparison`")
    ratio_help = ("only show values where the difference between study "
                  "and population ratios is greater than this. useful for "
                  "excluding GO categories with small differences, but "
                  "containing large numbers of genes. should be a value "
                  "between 1 and 2. ")

    # Input files
    add('filenames', type=str, nargs=3,
        help='data/study data/population data/association')

    # Significance thresholds
    add('--alpha', default=0.05, type=float,
        help="Test-wise alpha for multiple testing ")
    add('--pval', default=.05, type=float,
        help="Only print out when uncorrected p-value < this value.")

    # Study/population handling, display and output
    add('--compare', dest='compare', default=False, action='store_true',
        help=compare_help)
    add('--ratio', dest='ratio', type=float, default=None, help=ratio_help)
    add('--indent', dest='indent', default=False, action='store_true',
        help="indent GO terms")
    add('--obo', default="go-basic.obo", type=str,
        help="Specifies location and name of the obo file")
    add('--no_propagate_counts', default=False, action='store_true',
        help="Do not propagate counts to parent terms")
    add('--outfile', default=None, type=str,
        help="Write enrichment results into xlsx or tsv file")

    # Statistics
    add('--method', default="bonferroni,sidak,holm,fdr_bh", type=str,
        help=Methods().getmsg_valid_methods())
    add('--pvalcalc', default="fisher", type=str,
        help=str(FisherFactory()))
    add('--min_overlap', default=0.7, type=float,
        help="Check that a minimum amount of study genes are in the population")

    if len(sys.argv) == 1:
        # print_help() returns None, so this is sys.exit(True).
        sys.exit(not parser.print_help())

    args = parser.parse_args()  # Namespace object from argparse
    _check_input_files(args, parser)
    return args
Exemple #10
0
          help=('Write enrichment results into a text file \n'
                'containing the following information: \n'
                '1) GOEA GO terms, grouped into sections \n\n'
                '2) List of genes and ASCII art showing section membership \n'
                '3) Detailed list of each gene and GO terms w/their P-values \n'),
          abbrev='D')
@plac.opt('ratio', type=float,
          help="only show values where the difference between study "
               "and population ratios is greater than this. useful for "
               "excluding GO categories with small differences, but "
               "containing large numbers of genes. should be a value "
               "between 1 and 2. ")
@plac.opt('relationships', abbrev='R',
          help=('Propagate counts up user-specified relationships ( comma separated ), which include: '
                '{RELS}').format(RELS=' '.join(RELATIONSHIP_SET)))
@plac.opt('method', type=str, help=Methods().getmsg_valid_methods())
@plac.opt('pvalcalc', type=str, help=str(FisherFactory()), abbrev='calc')
@plac.opt('min_overlap', type=float,
          help="Check that a minimum amount of study genes are in the population",
          abbrev='M')
@plac.opt('goslim', type=str, help="The GO slim file is used when grouping GO terms.")
def run(name='human', taxid=9606, download=False,
        alpha=0.05, pval=.05, field='p_uncorrected', outfile='result.tsv',
        ns='BP,MF,CC', id2sym=None, detail='', sections=None,
        compare=False, ratio=None, prtstd=False, indent=False,
        noprop=False, relationship=False, relationships='', plot=False,
        enrich=False, method="bonferroni,sidak,holm,fdr_bh", pvalcalc="fisher",
        min_overlap=0.7, goslim='goslim_generic.obo', inc='', exc='',
        *study):
    # Construct arguments to pass down to GO.
    go_params = dict(alpha=alpha, pval=pval, pval_field=field, outfile=outfile,