Beispiel #1
0
print('{N} of {M:,} results were significant'.format(N=len(goea_quiet_sig),
                                                     M=len(goea_quiet_all)))

print('Significant results: {E} enriched, {P} purified'.format(
    E=sum(1 for r in goea_quiet_sig if r.enrichment == 'e'),
    P=sum(1 for r in goea_quiet_sig if r.enrichment == 'p')))

ctr = cx.Counter([r.NS for r in goea_quiet_sig])
print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
    TOTAL=len(goea_quiet_sig),
    BP=ctr['BP'],  # biological_process
    MF=ctr['MF'],  # molecular_function
    CC=ctr['CC']))  # cellular_component

#goeaobj.wr_xlsx("CDK1_test.xlsx", goea_quiet_sig)
goeaobj.wr_txt("CDK1_test.txt", goea_quiet_sig)

goid_subset = [
    'GO:0003723',  # MF D04 RNA binding (32 genes)
    'GO:0044822',  # MF D05 poly(A) RNA binding (86 genes)
    'GO:0003729',  # MF D06 mRNA binding (11 genes)
    'GO:0019843',  # MF D05 rRNA binding (6 genes)
    'GO:0003746',  # MF D06 translation elongation factor activity (5 genes)
]
plot_gos(
    "nbt3102_MF_RNA_genecnt.png",
    goid_subset,  # Source GO ids
    obodag,
    goea_results=goea_quiet_all)  # Use pvals for coloring
Beispiel #2
0
def pullGOenrichment(inputFile, project):
    GeneID2nt_hum = genes_NCBI_9606_ProteinCoding.GENEID2NT

    obo_fname = download_go_basic_obo()

    fin_gene2go = download_ncbi_associations()

    obodag = GODag("go-basic.obo")

    # Read NCBI's gene2go. Store annotations in a list of namedtuples
    objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

    # Get namespace2association where:
    #    namespace is:
    #        BP: biological_process
    #        MF: molecular_function
    #        CC: cellular_component
    #    assocation is a dict:
    #        key: NCBI GeneID
    #        value: A set of GO IDs associated with that gene
    ns2assoc = objanno.get_ns2assc()

    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc,
                                                        N=len(id2gos)))

    print(len(GeneID2nt_hum))

    goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),  # List of human protein-coding genes
        ns2assoc,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # defult multipletest correction method

    geneid2symbol = {}
    with open(inputFile, 'r') as infile:
        input_genes = csv.reader(infile)
        for line in input_genes:
            geneid = line[0]
            symbol = line[1]
            if geneid:
                geneid2symbol[int(geneid)] = symbol

    infile.close()

    geneids_study = geneid2symbol.keys()
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    import collections as cx
    ctr = cx.Counter([r.NS for r in goea_results_sig])
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC']))  # cellular_component

    goeaobj.wr_xlsx("Data/go_enrichment" + project + ".csv", goea_results_sig)
    goeaobj.wr_txt("Data/go_enrichment" + project + ".txt", goea_results_sig)