Ejemplo n.º 1
0
def populate_go_annotations(totalAnnotations,session,engine):
    """
    read the annotation files and insert their rows into the GoAnnotation table
    This will take some time
    This function is intended for use with 
    http://www.geneontology.org/GO.format.gaf-2_0.shtml

    Two tab delimited files are processed in turn: the gene association
    file returned by get_annotation_file() (only 'UniProtKB' rows are used)
    and the gene2go file returned by get_gene2go_file().  Rows are inserted
    in batches, one transaction per batch.

    Rows whose GO id, uniprot entry or gene id cannot be resolved are dropped
    silently.  Rows whose taxon is in the configured taxa list but missing
    from the taxa mapper are dropped and counted.

    totalAnnotations - expected number of annotations (only used for progress output)
    session          - database session used by the mappers and for alternate GO id lookups
    engine           - database engine used for the batched inserts

    returns (timeStr, addedStr, (ignoredAnnotationsUniprot, ignoredAnnotationsGene))
    """

    batchSize = 100000
    timeStart = time.time()
    config = Configure()
    taxaList = config.log['taxa']
    annotationFile = get_annotation_file()
    ## progress is printed whenever the running count hits one of these values
    wayPoints = set(int(w) for w in np.linspace(0,totalAnnotations,20))
    annotationCount = 0

    print("...loading mappers")
    termIdMap = goterm_mapper(session)
    taxaIdMap = taxa_mapper(session)
    uniprotIdMap = uniprot_mapper(session)
    print("...populating rows")

    def queue_entry(goId,evidenceCode,pubmedRefs,uniprotId,geneId,taxon,toAdd,mapper):
        """
        append one annotation row to toAdd when all of its ids resolve

        returns True only when the row was dropped because its taxon is
        missing from taxaIdMap; an int counter passed in as an argument
        could not be updated from here, so the caller does the counting
        """

        ## remove invalid term ids (fall back on the alternate id column)
        if goId in termIdMap:
            go_db_id = termIdMap[goId]
        else:
            queryTerm = session.query(GoTerm).filter_by(alternate_id=goId).first()
            if queryTerm is None:
                return False
            go_db_id = queryTerm.id

        ## remove invalid uniprot ids
        uniprot_db_id = None
        if uniprotId:
            if uniprotId not in mapper:
                return False
            uniprot_db_id = mapper[uniprotId]

        ## remove invalid gene ids
        gene_db_id = None
        if geneId:
            if geneId not in mapper:
                return False
            gene_db_id = mapper[geneId]

        ## ignore annotations that have an outdated taxon
        if taxon not in taxaIdMap:
            return True

        toAdd.append({'go_term_id':go_db_id,'evidence_code':evidenceCode,
                      'pubmed_refs':pubmedRefs,'uniprot_id':uniprot_db_id,
                      'gene_id':gene_db_id,'taxa_id':taxaIdMap[taxon]})
        return False

    def flush_rows(rows):
        """insert the queued rows in a single transaction (no-op when nothing is queued)"""
        if rows:
            with engine.begin() as connection:
                connection.execute(GoAnnotation.__table__.insert().
                                   values(rows))

    ## add annotations from uniprot annotation file
    ignoredAnnotationsUniprot = 0
    toAdd = []
    print("...getting annotations from gene_association (uniprot)")
    ## NOTE: 'rU' (universal newlines) is the python 2 file mode this file was written for
    with open(annotationFile,'rU') as annotationFid:
        for line in annotationFid:
            record = line.rstrip("\n").split("\t")

            ## keep only uniprot entries; this also skips blank lines and '!' comment
            ## lines.  The taxon is read from index 12 so shorter rows are unusable
            if record[0] != 'UniProtKB' or len(record) < 13:
                continue

            goId = record[4]
            pubmedRefs = record[5]
            evidenceCode = record[6]
            ## only the first '|' delimited value of this column is used as the uniprot entry
            uniprotEntry = record[10].split("|")[0]
            taxon = record[12].replace("taxon:","")

            ## skip unwanted taxa and annotations with multiple species
            if taxon not in taxaList or "|" in taxon:
                continue

            ## update progress
            annotationCount += 1
            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            if queue_entry(goId,evidenceCode,pubmedRefs,uniprotEntry,None,taxon,
                           toAdd,uniprotIdMap):
                ignoredAnnotationsUniprot += 1

            if len(toAdd) >= batchSize:
                flush_rows(toAdd)
                toAdd = []

    print('committing final changes...')
    print('ignored annotations after uniprot... %s'%(ignoredAnnotationsUniprot))
    flush_rows(toAdd)

    ## the uniprot mapper is no longer needed, release it before loading the gene mapper
    del uniprotIdMap

    ## add annotations from gene2go
    gene2goFile = get_gene2go_file()
    ignoredAnnotationsGene = 0
    print("...getting annotations from gene2go")
    geneIdMap = gene_mapper(session)
    toAdd = []

    with open(gene2goFile,'rU') as gene2goFid:
        ## skip the header line (tolerate an empty file)
        next(gene2goFid,None)

        for line in gene2goFid:
            record = line.rstrip("\n").split("\t")

            if record[0].startswith("#") or len(record) != 8:
                continue

            taxon = record[0]
            ncbiId = record[1]
            goId = record[2]
            evidenceCode = record[3]
            pubmedRefs = record[6]
            ## NOTE: unlike the uniprot pass, rows are counted before the taxa filter
            annotationCount += 1

            if taxon not in taxaList:
                continue

            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            if queue_entry(goId,evidenceCode,pubmedRefs,None,ncbiId,taxon,
                           toAdd,geneIdMap):
                ignoredAnnotationsGene += 1

            if len(toAdd) >= batchSize:
                flush_rows(toAdd)
                toAdd = []

    print('ignored annotations after gene2go... %s'%(ignoredAnnotationsGene))
    print('committing final changes...')
    flush_rows(toAdd)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    addedStr = "...%s unique go annotation entries were added."%annotationCount
    return timeStr,addedStr,(ignoredAnnotationsUniprot,ignoredAnnotationsGene)
Ejemplo n.º 2
0
def populate_go_annotations(totalAnnotations, session, engine):
    """
    read the annotation files and insert their rows into the GoAnnotation table
    This will take some time
    This function is intended for use with 
    http://www.geneontology.org/GO.format.gaf-2_0.shtml

    Two tab delimited files are processed in turn: the gene association
    file returned by get_annotation_file() (only 'UniProtKB' rows are used)
    and the gene2go file returned by get_gene2go_file().  Rows are inserted
    in batches, one transaction per batch.

    Rows whose GO id, uniprot entry or gene id cannot be resolved are dropped
    silently.  Rows whose taxon is in the configured taxa list but missing
    from the taxa mapper are dropped and counted.

    totalAnnotations - expected number of annotations (only used for progress output)
    session          - database session used by the mappers and for alternate GO id lookups
    engine           - database engine used for the batched inserts

    returns (timeStr, addedStr, (ignoredAnnotationsUniprot, ignoredAnnotationsGene))
    """

    batchSize = 100000
    timeStart = time.time()
    config = Configure()
    taxaList = config.log['taxa']
    annotationFile = get_annotation_file()
    ## progress is printed whenever the running count hits one of these values
    wayPoints = set(int(w) for w in np.linspace(0, totalAnnotations, 20))
    annotationCount = 0

    print("...loading mappers")
    termIdMap = goterm_mapper(session)
    taxaIdMap = taxa_mapper(session)
    uniprotIdMap = uniprot_mapper(session)
    print("...populating rows")

    def queue_entry(goId, evidenceCode, pubmedRefs, uniprotId, geneId, taxon,
                    toAdd, mapper):
        """
        append one annotation row to toAdd when all of its ids resolve

        returns True only when the row was dropped because its taxon is
        missing from taxaIdMap; an int counter passed in as an argument
        could not be updated from here, so the caller does the counting
        """

        ## remove invalid term ids (fall back on the alternate id column)
        if goId in termIdMap:
            go_db_id = termIdMap[goId]
        else:
            queryTerm = session.query(GoTerm).filter_by(
                alternate_id=goId).first()
            if queryTerm is None:
                return False
            go_db_id = queryTerm.id

        ## remove invalid uniprot ids
        uniprot_db_id = None
        if uniprotId:
            if uniprotId not in mapper:
                return False
            uniprot_db_id = mapper[uniprotId]

        ## remove invalid gene ids
        gene_db_id = None
        if geneId:
            if geneId not in mapper:
                return False
            gene_db_id = mapper[geneId]

        ## ignore annotations that have an outdated taxon
        if taxon not in taxaIdMap:
            return True

        toAdd.append({
            'go_term_id': go_db_id,
            'evidence_code': evidenceCode,
            'pubmed_refs': pubmedRefs,
            'uniprot_id': uniprot_db_id,
            'gene_id': gene_db_id,
            'taxa_id': taxaIdMap[taxon]
        })
        return False

    def flush_rows(rows):
        """insert the queued rows in a single transaction (no-op when nothing is queued)"""
        if rows:
            with engine.begin() as connection:
                connection.execute(
                    GoAnnotation.__table__.insert().values(rows))

    ## add annotations from uniprot annotation file
    ignoredAnnotationsUniprot = 0
    toAdd = []
    print("...getting annotations from gene_association (uniprot)")
    ## NOTE: 'rU' (universal newlines) is the python 2 file mode this file was written for
    with open(annotationFile, 'rU') as annotationFid:
        for line in annotationFid:
            record = line.rstrip("\n").split("\t")

            ## keep only uniprot entries; this also skips blank lines and '!' comment
            ## lines.  The taxon is read from index 12 so shorter rows are unusable
            if record[0] != 'UniProtKB' or len(record) < 13:
                continue

            goId = record[4]
            pubmedRefs = record[5]
            evidenceCode = record[6]
            ## only the first '|' delimited value of this column is used as the uniprot entry
            uniprotEntry = record[10].split("|")[0]
            taxon = record[12].replace("taxon:", "")

            ## skip unwanted taxa and annotations with multiple species
            if taxon not in taxaList or "|" in taxon:
                continue

            ## update progress
            annotationCount += 1
            if annotationCount in wayPoints:
                print("\t%s / %s" % (annotationCount, totalAnnotations))

            if queue_entry(goId, evidenceCode, pubmedRefs, uniprotEntry, None,
                           taxon, toAdd, uniprotIdMap):
                ignoredAnnotationsUniprot += 1

            if len(toAdd) >= batchSize:
                flush_rows(toAdd)
                toAdd = []

    print('committing final changes...')
    print('ignored annotations after uniprot... %s' %
          (ignoredAnnotationsUniprot))
    flush_rows(toAdd)

    ## the uniprot mapper is no longer needed, release it before loading the gene mapper
    del uniprotIdMap

    ## add annotations from gene2go
    gene2goFile = get_gene2go_file()
    ignoredAnnotationsGene = 0
    print("...getting annotations from gene2go")
    geneIdMap = gene_mapper(session)
    toAdd = []

    with open(gene2goFile, 'rU') as gene2goFid:
        ## skip the header line (tolerate an empty file)
        next(gene2goFid, None)

        for line in gene2goFid:
            record = line.rstrip("\n").split("\t")

            if record[0].startswith("#") or len(record) != 8:
                continue

            taxon = record[0]
            ncbiId = record[1]
            goId = record[2]
            evidenceCode = record[3]
            pubmedRefs = record[6]
            ## NOTE: unlike the uniprot pass, rows are counted before the taxa filter
            annotationCount += 1

            if taxon not in taxaList:
                continue

            if annotationCount in wayPoints:
                print("\t%s / %s" % (annotationCount, totalAnnotations))

            if queue_entry(goId, evidenceCode, pubmedRefs, None, ncbiId, taxon,
                           toAdd, geneIdMap):
                ignoredAnnotationsGene += 1

            if len(toAdd) >= batchSize:
                flush_rows(toAdd)
                toAdd = []

    print('ignored annotations after gene2go... %s' % (ignoredAnnotationsGene))
    print('committing final changes...')
    flush_rows(toAdd)

    timeStr = "...total time taken: %s" % time.strftime(
        '%H:%M:%S', time.gmtime(time.time() - timeStart))
    addedStr = "...%s unique go annotation entries were added." % annotationCount
    return timeStr, addedStr, (ignoredAnnotationsUniprot,
                               ignoredAnnotationsGene)
Ejemplo n.º 3
0
def populate_uniprot_table(lineCount,session,engine):
    """
    populate the uniprot table with entries from idmappings

    The file returned by get_idmapping_file() is read as tab delimited rows
    of (uniprot accession, id type, value).  Rows are grouped by their
    UniProtKB-ID and inserted into the Uniprot table in blocks, one
    transaction per block.  Only entries whose NCBI taxa id is in the
    configured taxa list are inserted.

    lineCount - expected number of lines (only used for progress output)
    session   - database session handed to the gene and taxa mappers
    engine    - database engine used for the bulk inserts

    returns (timeStr, addedStr); the count in addedStr is the number of
    distinct entries seen, taken before the taxa filter is applied
    """

    blockSize = 100000
    config = Configure()
    taxaList = config.log['taxa']
    timeStart = time.time()
    totalLines,totalRecords = 0,0
    idmappingFile = get_idmapping_file()
    ac2kbMap,toAdd = {},{}
    ## progress is printed whenever the line count hits one of these values
    wayPoints = set(int(w) for w in np.linspace(0,lineCount,20))

    print("getting mappers...")
    geneIdMap = gene_mapper(session)
    taxonIdMap = taxa_mapper(session)
    print("mappers loaded... %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))

    def join_or_none(values):
        """collapse a set of ids into a ';' delimited string (None when the set is empty)"""
        ## sorted so that the stored string does not depend on set ordering
        return ";".join(sorted(values)) if values else None

    def queue_entries(toAdd,geneIdMap,taxonIdMap,engine):
        """convert the collected entries into rows and insert them in one transaction"""

        toCommit = []

        for uniprotKbEntry, entry in toAdd.items():
            ncbiTaxaId = entry['ncbi-taxa-id']
            geneId = entry['gene-id']

            ## ensure we are in appropriate taxa (checked first, nothing below is
            ## needed for entries that are skipped)
            if ncbiTaxaId not in taxaList:
                continue

            ## convert the gene id to a database key (check old names if we cannot find it)
            db_gene_id = None
            if geneId is not None:
                if geneId in geneIdMap:
                    db_gene_id = geneIdMap[geneId]
                else:
                    ## the value may hold several ';' delimited ids, the last known one wins
                    for _gid in geneId.split(";"):
                        _gid = re.sub(r"\s+","",_gid)
                        if _gid in geneIdMap:
                            ## store the database key, not the raw ncbi id
                            db_gene_id = geneIdMap[_gid]

            ## convert the taxa id to a database key
            db_taxa_id = None
            if ncbiTaxaId and ncbiTaxaId in taxonIdMap:
                db_taxa_id = taxonIdMap[ncbiTaxaId]

            ## NOTE: a check comparing the taxa of the linked gene with the taxa of
            ## the entry (and a fallback to the gene's taxa) used to live here but
            ## was disabled

            ## ready the uniprot-ac and refseq rows and queue for commit
            toCommit.append({'uniprot_ac':join_or_none(entry['uniprot-ac']),
                             'uniprot_entry':uniprotKbEntry,
                             'refseq':join_or_none(entry['refseq']),
                             'taxa_id':db_taxa_id,'gene_id':db_gene_id})

        if toCommit:
            with engine.begin() as connection:
                connection.execute(Uniprot.__table__.insert().
                                   values(toCommit))

    ## parse the idmapping file into the db
    ## NOTE: 'rb' is the mode the python 2 csv module expects
    with open(idmappingFile,'rb') as idmappingFid:
        reader = csv.reader(idmappingFid,delimiter="\t")

        for record in reader:

            if len(record) != 3:
                continue

            uniprotKbAc,idType,value = record
            totalLines += 1
            if totalLines in wayPoints:
                print("\t%s / %s"%(totalLines,lineCount))

            if idType not in ('NCBI_TaxID','GeneID','UniProtKB-ID','RefSeq'):
                continue

            ## the first UniProtKB-ID seen for an accession names its entry
            if idType == 'UniProtKB-ID' and uniprotKbAc not in ac2kbMap:
                ac2kbMap[uniprotKbAc] = value

            ## skip the XXXX-1 like uniprot ac
            if uniprotKbAc not in ac2kbMap:
                continue

            ## get current key (an empty entry name cannot be used as a key)
            uniprotKbEntry = ac2kbMap[uniprotKbAc]
            if not uniprotKbEntry:
                continue

            ## make new entry if necessary
            if uniprotKbEntry not in toAdd:

                ## queue entries in blocks
                totalRecords += 1

                if totalRecords % blockSize == 0:
                    queue_entries(toAdd,geneIdMap,taxonIdMap,engine)
                    toAdd,ac2kbMap = {},{}

                ac2kbMap[uniprotKbAc] = uniprotKbEntry
                toAdd[uniprotKbEntry] = {'ncbi-taxa-id':None,
                                         'gene-id':None,
                                         'uniprot-ac':set(),
                                         'refseq':set()}

            ## populate uniprot dictionary
            entry = toAdd[uniprotKbEntry]
            entry['uniprot-ac'].add(uniprotKbAc)

            if value:
                if idType == 'NCBI_TaxID':
                    entry['ncbi-taxa-id'] = value
                elif idType == 'GeneID':
                    entry['gene-id'] = value
                elif idType == 'RefSeq':
                    entry['refseq'].add(value)

    ## queue any remaining
    if toAdd:
        queue_entries(toAdd,geneIdMap,taxonIdMap,engine)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    addedStr = "...%s unique uniprot entries were added."%totalRecords
    return timeStr,addedStr
Ejemplo n.º 4
0
def populate_uniprot_table(lineCount, session, engine):
    """
    populate the uniprot table with entries from idmappings

    The file returned by get_idmapping_file() is read as tab delimited rows
    of (uniprot accession, id type, value).  Rows are grouped by their
    UniProtKB-ID and inserted into the Uniprot table in blocks, one
    transaction per block.  Only entries whose NCBI taxa id is in the
    configured taxa list are inserted.

    lineCount - expected number of lines (only used for progress output)
    session   - database session handed to the gene and taxa mappers
    engine    - database engine used for the bulk inserts

    returns (timeStr, addedStr); the count in addedStr is the number of
    distinct entries seen, taken before the taxa filter is applied
    """

    blockSize = 100000
    config = Configure()
    taxaList = config.log['taxa']
    timeStart = time.time()
    totalLines, totalRecords = 0, 0
    idmappingFile = get_idmapping_file()
    ac2kbMap, toAdd = {}, {}
    ## progress is printed whenever the line count hits one of these values
    wayPoints = set(int(w) for w in np.linspace(0, lineCount, 20))

    print("getting mappers...")
    geneIdMap = gene_mapper(session)
    taxonIdMap = taxa_mapper(session)
    print("mappers loaded... %s" %
          time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))

    def join_or_none(values):
        """collapse a set of ids into a ';' delimited string (None when the set is empty)"""
        ## sorted so that the stored string does not depend on set ordering
        return ";".join(sorted(values)) if values else None

    def queue_entries(toAdd, geneIdMap, taxonIdMap, engine):
        """convert the collected entries into rows and insert them in one transaction"""

        toCommit = []

        for uniprotKbEntry, entry in toAdd.items():
            ncbiTaxaId = entry['ncbi-taxa-id']
            geneId = entry['gene-id']

            ## ensure we are in appropriate taxa (checked first, nothing below is
            ## needed for entries that are skipped)
            if ncbiTaxaId not in taxaList:
                continue

            ## convert the gene id to a database key (check old names if we cannot find it)
            db_gene_id = None
            if geneId is not None:
                if geneId in geneIdMap:
                    db_gene_id = geneIdMap[geneId]
                else:
                    ## the value may hold several ';' delimited ids, the last known one wins
                    for _gid in geneId.split(";"):
                        _gid = re.sub(r"\s+", "", _gid)
                        if _gid in geneIdMap:
                            ## store the database key, not the raw ncbi id
                            db_gene_id = geneIdMap[_gid]

            ## convert the taxa id to a database key
            db_taxa_id = None
            if ncbiTaxaId and ncbiTaxaId in taxonIdMap:
                db_taxa_id = taxonIdMap[ncbiTaxaId]

            ## NOTE: a check comparing the taxa of the linked gene with the taxa of
            ## the entry (and a fallback to the gene's taxa) used to live here but
            ## was disabled

            ## ready the uniprot-ac and refseq rows and queue for commit
            toCommit.append({
                'uniprot_ac': join_or_none(entry['uniprot-ac']),
                'uniprot_entry': uniprotKbEntry,
                'refseq': join_or_none(entry['refseq']),
                'taxa_id': db_taxa_id,
                'gene_id': db_gene_id
            })

        if toCommit:
            with engine.begin() as connection:
                connection.execute(Uniprot.__table__.insert().values(toCommit))

    ## parse the idmapping file into the db
    ## NOTE: 'rb' is the mode the python 2 csv module expects
    with open(idmappingFile, 'rb') as idmappingFid:
        reader = csv.reader(idmappingFid, delimiter="\t")

        for record in reader:

            if len(record) != 3:
                continue

            uniprotKbAc, idType, value = record
            totalLines += 1
            if totalLines in wayPoints:
                print("\t%s / %s" % (totalLines, lineCount))

            if idType not in ('NCBI_TaxID', 'GeneID', 'UniProtKB-ID',
                              'RefSeq'):
                continue

            ## the first UniProtKB-ID seen for an accession names its entry
            if idType == 'UniProtKB-ID' and uniprotKbAc not in ac2kbMap:
                ac2kbMap[uniprotKbAc] = value

            ## skip the XXXX-1 like uniprot ac
            if uniprotKbAc not in ac2kbMap:
                continue

            ## get current key (an empty entry name cannot be used as a key)
            uniprotKbEntry = ac2kbMap[uniprotKbAc]
            if not uniprotKbEntry:
                continue

            ## make new entry if necessary
            if uniprotKbEntry not in toAdd:

                ## queue entries in blocks
                totalRecords += 1

                if totalRecords % blockSize == 0:
                    queue_entries(toAdd, geneIdMap, taxonIdMap, engine)
                    toAdd, ac2kbMap = {}, {}

                ac2kbMap[uniprotKbAc] = uniprotKbEntry
                toAdd[uniprotKbEntry] = {
                    'ncbi-taxa-id': None,
                    'gene-id': None,
                    'uniprot-ac': set(),
                    'refseq': set()
                }

            ## populate uniprot dictionary
            entry = toAdd[uniprotKbEntry]
            entry['uniprot-ac'].add(uniprotKbAc)

            if value:
                if idType == 'NCBI_TaxID':
                    entry['ncbi-taxa-id'] = value
                elif idType == 'GeneID':
                    entry['gene-id'] = value
                elif idType == 'RefSeq':
                    entry['refseq'].add(value)

    ## queue any remaining
    if toAdd:
        queue_entries(toAdd, geneIdMap, taxonIdMap, engine)

    timeStr = "...total time taken: %s" % time.strftime(
        '%H:%M:%S', time.gmtime(time.time() - timeStart))
    addedStr = "...%s unique uniprot entries were added." % totalRecords
    return timeStr, addedStr