Exemple #1
0
def populate_go_annotations(totalAnnotations,session,engine):
    """
    Load GO annotations into the GoAnnotation table from two sources:
    the gene association file (UniProtKB rows only) and the gene2go file.

    This will take some time.
    The gene association parsing is intended for use with
    http://www.geneontology.org/GO.format.gaf-2_0.shtml

    Arguments:
        totalAnnotations - expected number of annotations, only used to
                           space out and label the progress messages
        session          - database session used to build the id mappers
                           and to look up GO terms by alternate id
        engine           - database engine used for the batched inserts

    Returns:
        (timeStr, addedStr, (ignoredUniprot, ignoredGene)) where the two
        strings are human readable summaries and the ignored counts are the
        number of annotations dropped from each source because their taxon
        had no entry in the taxa mapper.
    """

    timeStart = time.time()
    config = Configure()
    taxaList = config.log['taxa']
    annotationFile = get_annotation_file()
    batchSize = 100000
    ## a set keeps the per-record progress check constant time
    wayPoints = set(int(w) for w in np.linspace(0,totalAnnotations,20))
    annotationCount = 0

    print("...loading mappers")
    termIdMap = goterm_mapper(session)
    taxaIdMap = taxa_mapper(session)
    uniprotIdMap = uniprot_mapper(session)
    print("...populating rows")

    def flush(rows):
        """insert one batch of rows in its own transaction (no-op when empty)"""
        if not rows:
            return
        with engine.begin() as connection:
            connection.execute(GoAnnotation.__table__.insert().
                               values(rows))

    def queue_entry(goId,evidenceCode,pubmedRefs,uniprotId,geneId,taxon,toAdd,mapper):
        """
        resolve the ids of one annotation to database keys and append the
        resulting row to toAdd

        returns 1 if the annotation was dropped because its taxon is missing
        from taxaIdMap and 0 otherwise -- the caller adds this to its own
        counter because incrementing an int argument in here would only
        change a local copy
        """

        ## remove invalid term ids (fall back on the alternate id)
        if goId in termIdMap:
            go_db_id = termIdMap[goId]
        else:
            queryTerm = session.query(GoTerm).filter_by(alternate_id=goId).first()
            if queryTerm is None:
                return 0
            go_db_id = queryTerm.id

        ## remove invalid uniprot ids
        uniprot_db_id = None
        if uniprotId:
            if uniprotId not in mapper:
                return 0
            uniprot_db_id = mapper[uniprotId]

        ## remove invalid gene ids
        gene_db_id = None
        if geneId:
            if geneId not in mapper:
                return 0
            gene_db_id = mapper[geneId]

        ## ignore annotations that have an outdated taxon
        if taxon not in taxaIdMap:
            return 1

        toAdd.append({'go_term_id':go_db_id,'evidence_code':evidenceCode,
                      'pubmed_refs':pubmedRefs,'uniprot_id':uniprot_db_id,
                      'gene_id':gene_db_id,'taxa_id':taxaIdMap[taxon]})
        return 0

    ## add annotations from uniprot annotation file
    ignoredAnnotationsUniprot = 0
    toAdd = []
    print("...getting annotations from gene_association (uniprot)")
    with open(annotationFile,'r') as annotationFid:
        for line in annotationFid:
            record = line.rstrip("\r\n").split("\t")

            ## check that it is a uniprot entry
            ## (this also skips "!" comment lines and blank lines)
            if record[0] != 'UniProtKB':
                continue

            ## skip truncated rows -- columns up to the 15th are expected
            if len(record) < 15:
                continue

            goId = record[4]
            pubmedRefs = record[5]
            evidenceCode = record[6]
            taxon = record[12].replace("taxon:","")

            if taxon not in taxaList:
                continue

            ## parse the uniprot Entry (keep the first pipe separated value)
            uniprotEntry = record[10].split("|")[0]

            ## ignore annotations with multiple species
            if "|" in taxon:
                continue

            ## update progress
            annotationCount += 1
            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            ignoredAnnotationsUniprot += queue_entry(goId,evidenceCode,pubmedRefs,
                                                     uniprotEntry,None,taxon,
                                                     toAdd,uniprotIdMap)

            if len(toAdd) >= batchSize:
                flush(toAdd)
                toAdd = []

    print('committing final changes...')
    print('ignored annotations after uniprot... %s'%(ignoredAnnotationsUniprot))
    flush(toAdd)

    ## release the uniprot mapper before the gene mapper is loaded
    del uniprotIdMap

    ## add annotations from gene2go
    gene2goFile = get_gene2go_file()
    ignoredAnnotationsGene = 0
    print("...getting annotations from gene2go")
    geneIdMap = gene_mapper(session)
    toAdd = []

    with open(gene2goFile,'r') as gene2goFid:
        ## skip the header line (tolerates an empty file)
        next(gene2goFid,None)

        for line in gene2goFid:
            record = line.rstrip("\r\n").split("\t")

            if record[0].startswith("#") or len(record) != 8:
                continue

            taxon = record[0]
            ncbiId = record[1]
            goId = record[2]
            evidenceCode = record[3]
            pubmedRefs = record[6]

            ## NOTE(review): unlike the uniprot loop the counter is advanced
            ## before the taxon filter -- kept as is, presumably to match how
            ## totalAnnotations is computed by the caller; confirm
            annotationCount += 1

            if taxon not in taxaList:
                continue

            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            ignoredAnnotationsGene += queue_entry(goId,evidenceCode,pubmedRefs,
                                                  None,ncbiId,taxon,
                                                  toAdd,geneIdMap)

            if len(toAdd) >= batchSize:
                flush(toAdd)
                toAdd = []

    print('ignored annotations after gene2go... %s'%(ignoredAnnotationsGene))
    print('committing final changes...')
    flush(toAdd)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    addedStr = "...%s unique go annotation entries were added."%annotationCount
    return timeStr,addedStr,(ignoredAnnotationsUniprot,ignoredAnnotationsGene)
#!/usr/bin/python

import time,csv,re,sys
import numpy as np
from htsint.database import get_annotation_file, get_gene2go_file


annotationFile = get_annotation_file()
annotsCount = 0
annotatedIds = {}
annots1,annots2 = 0,0

## scan the gene association file; the context manager guarantees the
## handle is closed even if a malformed row raises
with open(annotationFile,'r') as annotationFid:
    for record in annotationFid:
        record = record.rstrip("\r\n").split("\t")

        ## keep uniprot entries only
        ## (this also skips "!" comment lines and blank lines)
        if record[0] != 'UniProtKB':
            continue

        ## keep the first of any pipe separated values in the 11th column
        uniprotEntry = record[10].split("|")[0]
Exemple #3
0
#!/usr/bin/python

import time, csv, re, sys
import numpy as np
from htsint.database import get_annotation_file, get_gene2go_file

annotationFile = get_annotation_file()
annotationFid = open(annotationFile, 'rU')
annotsCount = 0
annotatedIds = {}
annots1, annots2 = 0, 0

for record in annotationFid:
    record = record[:-1].split("\t")

    if record[0][0] == "!":
        continue
    if record[0] != 'UniProtKB':
        continue

    uniprotEntry = record[10]
    if re.search("\|", uniprotEntry):
        uniprotEntry = re.split("\|", uniprotEntry)[0]

    #for i,r in enumerate(record):
    #    print i,r

    #if annots1 == 300:
    #    sys.exit()

    if re.search("\-1", record[1]):
Exemple #4
0
def populate_go_annotations(totalAnnotations,session,engine):
    """
    Load GO annotations into the GoAnnotation table from two sources:
    the gene association file (UniProtKB rows only) and the gene2go file.

    This will take some time.
    The gene association parsing is intended for use with
    http://www.geneontology.org/GO.format.gaf-2_0.shtml

    Arguments:
        totalAnnotations - expected number of annotations, only used to
                           space out and label the progress messages
        session          - database session used to build the id mappers
                           and to look up GO terms by alternate id
        engine           - database engine used for the batched inserts

    Returns:
        (timeStr, addedStr, (ignoredUniprot, ignoredGene)) where the two
        strings are human readable summaries and the ignored counts are the
        number of annotations dropped from each source because their taxon
        had no entry in the taxa mapper.
    """

    timeStart = time.time()
    config = Configure()
    taxaList = config.log['taxa']
    annotationFile = get_annotation_file()
    batchSize = 100000
    ## a set keeps the per-record progress check constant time
    wayPoints = set(int(w) for w in np.linspace(0,totalAnnotations,20))
    annotationCount = 0

    print("...loading mappers")
    termIdMap = goterm_mapper(session)
    taxaIdMap = taxa_mapper(session)
    uniprotIdMap = uniprot_mapper(session)
    print("...populating rows")

    def flush(rows):
        """insert one batch of rows in its own transaction (no-op when empty)"""
        if not rows:
            return
        with engine.begin() as connection:
            connection.execute(GoAnnotation.__table__.insert().
                               values(rows))

    def queue_entry(goId,evidenceCode,pubmedRefs,uniprotId,geneId,taxon,toAdd,mapper):
        """
        resolve the ids of one annotation to database keys and append the
        resulting row to toAdd

        returns 1 if the annotation was dropped because its taxon is missing
        from taxaIdMap and 0 otherwise -- the caller adds this to its own
        counter because incrementing an int argument in here would only
        change a local copy
        """

        ## remove invalid term ids (fall back on the alternate id)
        if goId in termIdMap:
            go_db_id = termIdMap[goId]
        else:
            queryTerm = session.query(GoTerm).filter_by(alternate_id=goId).first()
            if queryTerm is None:
                return 0
            go_db_id = queryTerm.id

        ## remove invalid uniprot ids
        uniprot_db_id = None
        if uniprotId:
            if uniprotId not in mapper:
                return 0
            uniprot_db_id = mapper[uniprotId]

        ## remove invalid gene ids
        gene_db_id = None
        if geneId:
            if geneId not in mapper:
                return 0
            gene_db_id = mapper[geneId]

        ## ignore annotations that have an outdated taxon
        if taxon not in taxaIdMap:
            return 1

        toAdd.append({'go_term_id':go_db_id,'evidence_code':evidenceCode,
                      'pubmed_refs':pubmedRefs,'uniprot_id':uniprot_db_id,
                      'gene_id':gene_db_id,'taxa_id':taxaIdMap[taxon]})
        return 0

    ## add annotations from uniprot annotation file
    ignoredAnnotationsUniprot = 0
    toAdd = []
    print("...getting annotations from gene_association (uniprot)")
    with open(annotationFile,'r') as annotationFid:
        for line in annotationFid:
            record = line.rstrip("\r\n").split("\t")

            ## check that it is a uniprot entry
            ## (this also skips "!" comment lines and blank lines)
            if record[0] != 'UniProtKB':
                continue

            ## skip truncated rows -- columns up to the 15th are expected
            if len(record) < 15:
                continue

            goId = record[4]
            pubmedRefs = record[5]
            evidenceCode = record[6]
            taxon = record[12].replace("taxon:","")

            if taxon not in taxaList:
                continue

            ## parse the uniprot Entry (keep the first pipe separated value)
            uniprotEntry = record[10].split("|")[0]

            ## ignore annotations with multiple species
            if "|" in taxon:
                continue

            ## update progress
            annotationCount += 1
            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            ignoredAnnotationsUniprot += queue_entry(goId,evidenceCode,pubmedRefs,
                                                     uniprotEntry,None,taxon,
                                                     toAdd,uniprotIdMap)

            if len(toAdd) >= batchSize:
                flush(toAdd)
                toAdd = []

    print('committing final changes...')
    print('ignored annotations after uniprot... %s'%(ignoredAnnotationsUniprot))
    flush(toAdd)

    ## release the uniprot mapper before the gene mapper is loaded
    del uniprotIdMap

    ## add annotations from gene2go
    gene2goFile = get_gene2go_file()
    ignoredAnnotationsGene = 0
    print("...getting annotations from gene2go")
    geneIdMap = gene_mapper(session)
    toAdd = []

    with open(gene2goFile,'r') as gene2goFid:
        ## skip the header line (tolerates an empty file)
        next(gene2goFid,None)

        for line in gene2goFid:
            record = line.rstrip("\r\n").split("\t")

            if record[0].startswith("#") or len(record) != 8:
                continue

            taxon = record[0]
            ncbiId = record[1]
            goId = record[2]
            evidenceCode = record[3]
            pubmedRefs = record[6]

            ## NOTE(review): unlike the uniprot loop the counter is advanced
            ## before the taxon filter -- kept as is, presumably to match how
            ## totalAnnotations is computed by the caller; confirm
            annotationCount += 1

            if taxon not in taxaList:
                continue

            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            ignoredAnnotationsGene += queue_entry(goId,evidenceCode,pubmedRefs,
                                                  None,ncbiId,taxon,
                                                  toAdd,geneIdMap)

            if len(toAdd) >= batchSize:
                flush(toAdd)
                toAdd = []

    print('ignored annotations after gene2go... %s'%(ignoredAnnotationsGene))
    print('committing final changes...')
    flush(toAdd)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    addedStr = "...%s unique go annotation entries were added."%annotationCount
    return timeStr,addedStr,(ignoredAnnotationsUniprot,ignoredAnnotationsGene)