def init_goea(log):
    """Read Ontologies and Annotations once.

    Loads the GO DAG from "go-basic.obo", the associations from
    "../data/association" and the population IDs from "../data/population",
    then builds a GOEA object and sets its population.

    Args:
        log: Stream handed to the GOEA constructor.

    Returns:
        The GOEA object with its population set.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Read under a context manager so the file handle is closed promptly
    # instead of being left for the garbage collector.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    # 2. Create the enrichment-analysis object and give it the population
    goeaobj = GOEA(obo_dag, assoc, log)
    goeaobj.set_population(popul_ids)
    return goeaobj
def run_all():
    """Run the enrichment-analysis tests against one shared GOEA object.

    The log destination is the file named by GOEA.get_fout_log(), or
    sys.stdout when that returns None.
    """
    log_name = GOEA.get_fout_log()
    if log_name is None:
        stream = sys.stdout
    else:
        stream = open(log_name, 'w')
    # Ontologies and annotations are loaded a single time and reused.
    shared_goea = init_goea(stream)
    test_bonferronis(stream, shared_goea)
    close_log(log_name, stream)
def test_bonferroni(fout_log=None):
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.

    Results are written to the log stream, to "goea_bonferroni.tsv" and to
    "goea_bonferroni.xlsx".

    Args:
        fout_log: Name of a log file to create. When None, log output goes
            to sys.stdout and no log file is written.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    log = sys.stdout if fout_log is None else open(fout_log, 'w')
    try:
        obo_dag = GODag("go-basic.obo")
        assoc = read_associations("../data/association", no_top=True)
        # Context managers close the input files as soon as they are read.
        with open("../data/population") as ifstrm:
            popul_ids = [line.rstrip() for line in ifstrm]
        with open("../data/study") as ifstrm:
            study_ids = [line.rstrip() for line in ifstrm]
        # 2. Run enrichment analysis
        goea = GOEA(obo_dag, assoc, log)
        goea.set_population(popul_ids)
        goea.set_params(alpha=0.05, method='bonferroni')
        results_nt = goea.find_enrichment(study_ids)

        # ---------------------------------------------------------------------
        # Print results 3 ways: to screen, to tsv (tab-separated file), to xlsx (Excel spreadsheet)
        fout_tsv = "goea_bonferroni.tsv"
        fout_xls = "goea_bonferroni.xlsx"

        # print these in tsv and xlsx
        print_names = ['NS', 'study_cnt', 'pval_uncor', 'bonferroni_star', 'bonferroni',
                       'level', 'depth', 'GO', 'name']
        field_names = print_names + ['bonferroni_sig']  # collect this. Used in prt_if
        # Optional user customizable sort:
        #     Sort by: 1st) BP, MF, CC; 2nd) uncorrected pval, with smallest last.
        sort_by = lambda nt: [nt.NS, -1*nt.pval_uncor]
        # 1. Print results to the log stream using the format in prtfmt.
        # Print format field names are the same names as in the "field_names" variable.
        prtfmt = " ".join(["{NS} {study_cnt:2} {pval_uncor:5.3e}",
                 "{bonferroni_star:1} {bonferroni:5.3e} L{level:02} D{depth:02} {GO} {name}\n"])
        # T/F: Keep the GOEA GO Term result only if the result is significant.
        prt_if = lambda nt: nt.bonferroni_sig
        goea.prt_txt(log, results_nt, field_names, prtfmt, sort_by=sort_by, prt_if=prt_if)

        # 2. Write results to tsv file
        # Optional user defined formatting for specific fields
        fld2fmt = {'bonferroni':'{:8.2e}', 'pval_uncor':'{:8.2e}'}
        # Sort by: 1st) BP, MF, CC; 2nd) By GO depth, deepest GO first.
        sort_by = lambda nt: [nt.NS, -1*nt.depth]
        goea.wr_tsv(fout_tsv, results_nt, field_names,
            prt_if=prt_if, sort_by=sort_by, fld2fmt=fld2fmt, print_names=print_names)

        # 3. Write results to xlsx file
        # Use these headers instead of the print_names for the xlsx header.
        # One header per entry in print_names (9 of each); the 'pval_uncor'
        # header was previously missing, leaving 8 headers for 9 columns.
        hdrs = ['NS', 'Cnt', 'pval_uncor', '*', 'bonferroni', 'L', 'D', 'Term',
                'Ontology Term Name']
        goea.wr_xlsx(fout_xls, results_nt, field_names,
            # optional key-word args (ie, kwargs, kws)
            prt_if=prt_if, sort_by=sort_by, hdrs=hdrs, fld2fmt=fld2fmt, print_names=print_names)
    finally:
        # Close the log file even when the analysis raises; never close sys.stdout.
        if fout_log is not None:
            log.close()
    if fout_log is not None:
        sys.stdout.write("  WROTE: {}\n".format(fout_log))

# Script entry point: run the Bonferroni test, logging to whatever file name
# GOEA.get_fout_log() returns.
if __name__ == "__main__":
    test_bonferroni(fout_log=GOEA.get_fout_log())

# Copyright (C) 2016, DV Klopfenstein, H Tang. All rights reserved.
Example #5
0
def test_fdr_bh(fout_log=None):
    """Do Gene Ontology Enrichment Analysis w/Benjamini-Hochberg multipletest. Print results

    Results are written to the log stream, to "goea_fdr_bh.tsv" and to
    "goea_fdr_bh.xlsx".

    Args:
        fout_log: Name of a log file to create. When None, log output goes
            to sys.stdout and no log file is written.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    log = sys.stdout if fout_log is None else open(fout_log, "w")
    try:
        obo_dag = GODag("go-basic.obo")
        assoc = read_associations("../data/association", no_top=True)
        # Context managers close the input files as soon as they are read.
        with open("../data/population") as ifstrm:
            popul_ids = [line.rstrip() for line in ifstrm]
        with open("../data/study") as ifstrm:
            study_ids = [line.rstrip() for line in ifstrm]
        # 2. Run enrichment analysis
        goea = GOEA(obo_dag, assoc, log)
        goea.set_population(popul_ids)
        goea.set_params(alpha=0.05, method="fdr_bh")
        results_nt = goea.find_enrichment(study_ids)

        # ---------------------------------------------------------------------
        # Print results 3 ways: to screen, to tsv(tab-separated file), to xlsx(Excel spreadsheet)
        fout_tsv = "goea_fdr_bh.tsv"
        fout_xls = "goea_fdr_bh.xlsx"

        field_names = ["NS", "study_cnt", "fdr_bh", "level", "depth", "GO", "name", "fdr_bh_sig"]  # collect these
        print_names = ["NS", "study_cnt", "fdr_bh", "level", "depth", "GO", "name"]  # print these in tsv and xlsx
        # Optional user customizable sort:
        #     Sort by: 1st) BP, MF, CC; 2nd) corrected pval, with smallest first.
        sort_by = lambda nt: [nt.NS, nt.fdr_bh]
        # 1. Print results to screen using format in prtfmt. For example:
        #
        #      BP 22 3.073e-03 L06 D07 GO:0006468 protein phosphorylation
        #      BP  9 1.023e-02 L07 D08 GO:0006511 ubiquitin-dependent protein catabolic process
        #      BP  2 1.023e-02 L05 D09 GO:0019877 diaminopimelate biosynthetic process
        #      ...
        # Print format field names are the same names as in the "field_names" variable.
        prtfmt = "{NS} {study_cnt:2} {fdr_bh:5.3e} L{level:02} D{depth:02} {GO} {name}\n"
        # T/F: Keep the GOEA GO Term result only if the result is significant.
        keep_if = lambda nt: nt.fdr_bh_sig
        goea.prt_txt(log, results_nt, field_names, prtfmt, sort_by=sort_by, keep_if=keep_if)

        # 2. Write results to tsv file
        # Sort by: 1st) BP, MF, CC; 2nd) By GO depth, deepest GO first.
        sort_by = lambda nt: [nt.NS, -1 * nt.depth]
        fld2fmt = {"fdr_bh": "{:8.2e}"}  # Optional user defined formatting for specific fields
        goea.wr_tsv(
            fout_tsv, results_nt, field_names, keep_if=keep_if, sort_by=sort_by, fld2fmt=fld2fmt, print_names=print_names
        )

        # 3. Write results to xlsx file
        # Use these headers instead of the print_names for the xlsx header
        # (one header per entry in print_names: 7 of each).
        hdrs = ["NS", "Cnt", "fdr_bh", "L", "D", "Term", "Ontology Term Name"]
        goea.wr_xlsx(
            fout_xls,
            results_nt,
            field_names,
            # optional key-word args (ie, kwargs, kws)
            keep_if=keep_if,
            sort_by=sort_by,
            hdrs=hdrs,
            fld2fmt=fld2fmt,
            print_names=print_names,
        )
    finally:
        # Close the log file even when the analysis raises; never close sys.stdout.
        if fout_log is not None:
            log.close()
    if fout_log is not None:
        sys.stdout.write("  WROTE: {}\n".format(fout_log))
Example #6
0
    sort_by = lambda nt: [nt.NS, -1 * nt.depth]
    fld2fmt = {"fdr_bh": "{:8.2e}"}  # Optional user defined formatting for specific fields
    goea.wr_tsv(
        fout_tsv, results_nt, field_names, keep_if=keep_if, sort_by=sort_by, fld2fmt=fld2fmt, print_names=print_names
    )

    # 3. Write results to xlsx file
    # Use these headers instead of the print_names for the xlsx header
    hdrs = ["NS", "Cnt", "fdr_bh", "L", "D", "Term", "Ontology Term Name"]
    # TBD Check that header and size of fields printed match
    goea.wr_xlsx(
        fout_xls,
        results_nt,
        field_names,
        # optional key-word args (ie, kwargs, kws)
        keep_if=keep_if,
        sort_by=sort_by,
        hdrs=hdrs,
        fld2fmt=fld2fmt,
        print_names=print_names,
    )
    if fout_log is not None:
        log.close()
        sys.stdout.write("  WROTE: {}\n".format(fout_log))


# Script entry point: run the Benjamini-Hochberg test, logging to whatever
# file name GOEA.get_fout_log() returns.
if __name__ == '__main__':
    test_fdr_bh(fout_log=GOEA.get_fout_log())

# Copyright (C) 2016, DV Klopfenstein, H Tang. All rights reserved.
Example #7
0
def test_fdr_bh(fout_log=None):
    """Do Gene Ontology Enrichment Analysis w/Benjamini-Hochberg multipletest. Print results

    Results are written to the log stream, to "goea_fdr_bh.tsv" and to
    "goea_fdr_bh.xlsx".

    Args:
        fout_log: Name of a log file to create. When None, log output goes
            to sys.stdout and no log file is written.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    log = sys.stdout if fout_log is None else open(fout_log, 'w')
    try:
        obo_dag = GODag("go-basic.obo")
        assoc = read_associations("../data/association", no_top=True)
        # Context managers close the input files as soon as they are read.
        with open("../data/population") as ifstrm:
            popul_ids = [line.rstrip() for line in ifstrm]
        with open("../data/study") as ifstrm:
            study_ids = [line.rstrip() for line in ifstrm]
        # 2. Run enrichment analysis
        goea = GOEA(obo_dag, assoc, log)
        goea.set_population(popul_ids)
        goea.set_params(alpha=0.05, method='fdr_bh')
        results_nt = goea.find_enrichment(study_ids)

        # ---------------------------------------------------------------------
        # Print results 3 ways: to screen, to tsv(tab-separated file), to xlsx(Excel spreadsheet)
        fout_tsv = "goea_fdr_bh.tsv"
        fout_xls = "goea_fdr_bh.xlsx"

        field_names = [
            'NS', 'study_cnt', 'fdr_bh', 'level', 'depth', 'GO', 'name',
            'fdr_bh_sig'
        ]  # collect these
        print_names = [
            'NS', 'study_cnt', 'fdr_bh', 'level', 'depth', 'GO', 'name'
        ]  # print these in tsv and xlsx
        # Optional user customizable sort:
        #     Sort by: 1st) BP, MF, CC; 2nd) corrected pval, with smallest first.
        sort_by = lambda nt: [nt.NS, nt.fdr_bh]
        # 1. Print results to screen using format in prtfmt. For example:
        #
        #      BP 22 3.073e-03 L06 D07 GO:0006468 protein phosphorylation
        #      BP  9 1.023e-02 L07 D08 GO:0006511 ubiquitin-dependent protein catabolic process
        #      BP  2 1.023e-02 L05 D09 GO:0019877 diaminopimelate biosynthetic process
        #      ...
        # Print format field names are the same names as in the "field_names" variable.
        prtfmt = "{NS} {study_cnt:2} {fdr_bh:5.3e} L{level:02} D{depth:02} {GO} {name}\n"
        # T/F: Keep the GOEA GO Term result only if the result is significant.
        keep_if = lambda nt: nt.fdr_bh_sig
        goea.prt_txt(log,
                     results_nt,
                     field_names,
                     prtfmt,
                     sort_by=sort_by,
                     keep_if=keep_if)

        # 2. Write results to tsv file
        # Sort by: 1st) BP, MF, CC; 2nd) By GO depth, deepest GO first.
        sort_by = lambda nt: [nt.NS, -1 * nt.depth]
        fld2fmt = {
            'fdr_bh': '{:8.2e}'
        }  # Optional user defined formatting for specific fields
        goea.wr_tsv(fout_tsv,
                    results_nt,
                    field_names,
                    keep_if=keep_if,
                    sort_by=sort_by,
                    fld2fmt=fld2fmt,
                    print_names=print_names)

        # 3. Write results to xlsx file
        # Use these headers instead of the print_names for the xlsx header
        # (one header per entry in print_names: 7 of each).
        hdrs = ['NS', 'Cnt', 'fdr_bh', 'L', 'D', 'Term', 'Ontology Term Name']
        goea.wr_xlsx(
            fout_xls,
            results_nt,
            field_names,
            # optional key-word args (ie, kwargs, kws)
            keep_if=keep_if,
            sort_by=sort_by,
            hdrs=hdrs,
            fld2fmt=fld2fmt,
            print_names=print_names)
    finally:
        # Close the log file even when the analysis raises; never close sys.stdout.
        if fout_log is not None:
            log.close()
    if fout_log is not None:
        sys.stdout.write("  WROTE: {}\n".format(fout_log))
Example #8
0
                results_nt,
                field_names,
                keep_if=keep_if,
                sort_by=sort_by,
                fld2fmt=fld2fmt,
                print_names=print_names)

    # 3. Write results to xlsx file
    # Use these headers instead of the print_names for the xlsx header
    hdrs = ['NS', 'Cnt', 'fdr_bh', 'L', 'D', 'Term', 'Ontology Term Name']
    # TBD Check that header and size of fields printed match
    goea.wr_xlsx(
        fout_xls,
        results_nt,
        field_names,
        # optional key-word args (ie, kwargs, kws)
        keep_if=keep_if,
        sort_by=sort_by,
        hdrs=hdrs,
        fld2fmt=fld2fmt,
        print_names=print_names)
    if fout_log is not None:
        log.close()
        sys.stdout.write("  WROTE: {}\n".format(fout_log))


# Script entry point: run the Benjamini-Hochberg test, logging to whatever
# file name GOEA.get_fout_log() returns.
if __name__ == "__main__":
    test_fdr_bh(fout_log=GOEA.get_fout_log())

# Copyright (C) 2016, DV Klopfenstein, H Tang. All rights reserved.