def populate_go_annotations(totalAnnotations,session,engine):
    """
    Load the GO annotations from the uniprot gene_association file and from
    gene2go and bulk insert them into the GoAnnotation table.

    This will take some time.  The gene_association file is expected to follow
    http://www.geneontology.org/GO.format.gaf-2_0.shtml

    Arguments:
        totalAnnotations: expected number of annotation records, it is only
            used to space out the progress messages
        session: database session used to build the id mappers and to look
            up go terms by their alternate id
        engine: database engine, every batch of rows is inserted inside its
            own engine.begin() block

    Returns:
        (timeStr, addedStr, (ignoredAnnotationsUniprot, ignoredAnnotationsGene))
        The two strings are report lines.  The tuple holds, for each source,
        the number of annotations that were dropped because their taxon is
        missing from the taxa mapper.
    """

    timeStart = time.time()
    config = Configure()
    taxaList = config.log['taxa']
    batchSize = 100000
    annotationFile = get_annotation_file()

    ## progress is printed whenever the running count hits one of these values
    wayPoints = set(int(w) for w in np.linspace(0,totalAnnotations,20))
    annotationCount = 0

    print("...loading mappers")
    termIdMap = goterm_mapper(session)
    taxaIdMap = taxa_mapper(session)
    uniprotIdMap = uniprot_mapper(session)
    print("...populating rows")

    def queue_entry(goId,evidenceCode,pubmedRefs,uniprotId,geneId,taxon,toAdd,mapper):
        """
        Resolve the foreign keys of one annotation and append the row to toAdd.

        Returns 1 when the annotation is ignored because its taxon is missing
        from taxaIdMap and 0 in every other case.  The caller adds the result
        to its own counter: incrementing an int that was passed in as an
        argument only rebinds the local name and is never seen by the caller.
        """

        ## remove invalid term ids (fall back on the alternate id)
        if goId in termIdMap:
            go_db_id = termIdMap[goId]
        else:
            queryTerm = session.query(GoTerm).filter_by(alternate_id=goId).first()
            if queryTerm is None:
                return 0
            go_db_id = queryTerm.id

        ## remove invalid uniprot ids
        uniprot_db_id = None
        if uniprotId:
            if uniprotId not in mapper:
                return 0
            uniprot_db_id = mapper[uniprotId]

        ## remove invalid gene ids
        gene_db_id = None
        if geneId:
            if geneId not in mapper:
                return 0
            gene_db_id = mapper[geneId]

        ## ignore annotations that have an outdated taxon
        if taxon not in taxaIdMap:
            return 1

        toAdd.append({'go_term_id':go_db_id,'evidence_code':evidenceCode,
                      'pubmed_refs':pubmedRefs,'uniprot_id':uniprot_db_id,
                      'gene_id':gene_db_id,'taxa_id':taxaIdMap[taxon]})
        return 0

    def insert_rows(rows):
        """Insert the queued rows inside one engine.begin() block."""

        ## nothing queued: do not send an insert with an empty list of values
        if not rows:
            return
        with engine.begin() as connection:
            connection.execute(GoAnnotation.__table__.insert().values(rows))

    ## add annotations from uniprot annotation file
    toAdd = []
    ignoredAnnotationsUniprot = 0
    print("...getting annotations from gene_association (uniprot)")

    ## plain 'r' is used because the 'U' flag is rejected by recent python 3;
    ## the with block closes the file even if an insert fails
    with open(annotationFile,'r') as annotationFid:
        for line in annotationFid:
            ## rstrip only removes a newline that is really there
            record = line.rstrip("\n").split("\t")

            ## keep uniprot entries only, this also drops the '!' comment
            ## lines and blank lines
            if record[0] != 'UniProtKB':
                continue

            ## the taxon is read from the 13th column, skip truncated records
            if len(record) < 13:
                continue

            goId = record[4]
            pubmedRefs = record[5]
            evidenceCode = record[6]
            taxon = record[12].replace("taxon:","")

            if taxon not in taxaList:
                continue

            ## parse the uniprot entry (keep the part before the first '|')
            uniprotEntry = record[10].split("|")[0]

            ## ignore annotations with multiple species
            if "|" in taxon:
                continue

            ## update progress
            annotationCount += 1
            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            ignoredAnnotationsUniprot += queue_entry(goId,evidenceCode,pubmedRefs,
                                                     uniprotEntry,None,taxon,
                                                     toAdd,uniprotIdMap)

            if len(toAdd) >= batchSize:
                insert_rows(toAdd)
                toAdd = []

    print('committing final changes...')
    print('ignored annotations after uniprot... %s'%(ignoredAnnotationsUniprot))
    insert_rows(toAdd)

    ## release the uniprot mapper before the gene mapper is loaded
    del uniprotIdMap

    ## add annotations from gene2go
    gene2goFile = get_gene2go_file()
    ignoredAnnotationsGene = 0
    print("...getting annotations from gene2go")
    geneIdMap = gene_mapper(session)
    toAdd = []

    with open(gene2goFile,'r') as gene2goFid:
        ## skip the header line (the default keeps an empty file from raising)
        next(gene2goFid,None)

        for line in gene2goFid:
            record = line.rstrip("\n").split("\t")

            if record[0].startswith("#") or len(record) != 8:
                continue

            taxon = record[0]
            ncbiId = record[1]
            goId = record[2]
            evidenceCode = record[3]
            pubmedRefs = record[6]

            ## every well formed gene2go record is counted, even when its
            ## taxon is filtered out just below
            annotationCount += 1

            if taxon not in taxaList:
                continue

            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            ignoredAnnotationsGene += queue_entry(goId,evidenceCode,pubmedRefs,
                                                  None,ncbiId,taxon,
                                                  toAdd,geneIdMap)

            if len(toAdd) >= batchSize:
                insert_rows(toAdd)
                toAdd = []

    print('ignored annotations after gene2go... %s'%(ignoredAnnotationsGene))
    print('committing final changes...')
    insert_rows(toAdd)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    addedStr = "...%s unique go annotation entries were added."%annotationCount
    return timeStr,addedStr,(ignoredAnnotationsUniprot,ignoredAnnotationsGene)
uniprotEntry = record[10] if re.search("\|",uniprotEntry): uniprotEntry = re.split("\|",uniprotEntry)[0] #for i,r in enumerate(record): # print i,r #if annots1 == 300: # sys.exit() if re.search("\-1",record[1]): print record[1] taxon = re.sub("taxon:","",record[12]) if taxon == "" or re.search("\|",taxon): continue annots1 += 1 gene2goFile = get_gene2go_file() gene2goFid = open(gene2goFile,'rU') header = gene2goFid.next() for record in gene2goFid: annots2 += 1 print("__________________") print("Uniprot annotations: %s"%annots1) print("Gene2go annotations: %s"%annots2) print("Total Annotations: %s"%(annots1 + annots2))
uniprotEntry = record[10] if re.search("\|", uniprotEntry): uniprotEntry = re.split("\|", uniprotEntry)[0] #for i,r in enumerate(record): # print i,r #if annots1 == 300: # sys.exit() if re.search("\-1", record[1]): print record[1] taxon = re.sub("taxon:", "", record[12]) if taxon == "" or re.search("\|", taxon): continue annots1 += 1 gene2goFile = get_gene2go_file() gene2goFid = open(gene2goFile, 'rU') header = gene2goFid.next() for record in gene2goFid: annots2 += 1 print("__________________") print("Uniprot annotations: %s" % annots1) print("Gene2go annotations: %s" % annots2) print("Total Annotations: %s" % (annots1 + annots2))
def populate_go_annotations(totalAnnotations,session,engine):
    """
    Load the GO annotations from the uniprot gene_association file and from
    gene2go and bulk insert them into the GoAnnotation table.

    This will take some time.  The gene_association file is expected to follow
    http://www.geneontology.org/GO.format.gaf-2_0.shtml

    Arguments:
        totalAnnotations: expected number of annotation records, it is only
            used to space out the progress messages
        session: database session used to build the id mappers and to look
            up go terms by their alternate id
        engine: database engine, every batch of rows is inserted inside its
            own engine.begin() block

    Returns:
        (timeStr, addedStr, (ignoredAnnotationsUniprot, ignoredAnnotationsGene))
        The two strings are report lines.  The tuple holds, for each source,
        the number of annotations that were dropped because their taxon is
        missing from the taxa mapper.
    """

    timeStart = time.time()
    config = Configure()
    taxaList = config.log['taxa']
    batchSize = 100000
    annotationFile = get_annotation_file()

    ## progress is printed whenever the running count hits one of these values
    wayPoints = set(int(w) for w in np.linspace(0,totalAnnotations,20))
    annotationCount = 0

    print("...loading mappers")
    termIdMap = goterm_mapper(session)
    taxaIdMap = taxa_mapper(session)
    uniprotIdMap = uniprot_mapper(session)
    print("...populating rows")

    def queue_entry(goId,evidenceCode,pubmedRefs,uniprotId,geneId,taxon,toAdd,mapper):
        """
        Resolve the foreign keys of one annotation and append the row to toAdd.

        Returns 1 when the annotation is ignored because its taxon is missing
        from taxaIdMap and 0 in every other case.  The caller adds the result
        to its own counter: incrementing an int that was passed in as an
        argument only rebinds the local name and is never seen by the caller.
        """

        ## remove invalid term ids (fall back on the alternate id)
        if goId in termIdMap:
            go_db_id = termIdMap[goId]
        else:
            queryTerm = session.query(GoTerm).filter_by(alternate_id=goId).first()
            if queryTerm is None:
                return 0
            go_db_id = queryTerm.id

        ## remove invalid uniprot ids
        uniprot_db_id = None
        if uniprotId:
            if uniprotId not in mapper:
                return 0
            uniprot_db_id = mapper[uniprotId]

        ## remove invalid gene ids
        gene_db_id = None
        if geneId:
            if geneId not in mapper:
                return 0
            gene_db_id = mapper[geneId]

        ## ignore annotations that have an outdated taxon
        if taxon not in taxaIdMap:
            return 1

        toAdd.append({'go_term_id':go_db_id,'evidence_code':evidenceCode,
                      'pubmed_refs':pubmedRefs,'uniprot_id':uniprot_db_id,
                      'gene_id':gene_db_id,'taxa_id':taxaIdMap[taxon]})
        return 0

    def insert_rows(rows):
        """Insert the queued rows inside one engine.begin() block."""

        ## nothing queued: do not send an insert with an empty list of values
        if not rows:
            return
        with engine.begin() as connection:
            connection.execute(GoAnnotation.__table__.insert().values(rows))

    ## add annotations from uniprot annotation file
    toAdd = []
    ignoredAnnotationsUniprot = 0
    print("...getting annotations from gene_association (uniprot)")

    ## plain 'r' is used because the 'U' flag is rejected by recent python 3;
    ## the with block closes the file even if an insert fails
    with open(annotationFile,'r') as annotationFid:
        for line in annotationFid:
            ## rstrip only removes a newline that is really there
            record = line.rstrip("\n").split("\t")

            ## keep uniprot entries only, this also drops the '!' comment
            ## lines and blank lines
            if record[0] != 'UniProtKB':
                continue

            ## the taxon is read from the 13th column, skip truncated records
            if len(record) < 13:
                continue

            goId = record[4]
            pubmedRefs = record[5]
            evidenceCode = record[6]
            taxon = record[12].replace("taxon:","")

            if taxon not in taxaList:
                continue

            ## parse the uniprot entry (keep the part before the first '|')
            uniprotEntry = record[10].split("|")[0]

            ## ignore annotations with multiple species
            if "|" in taxon:
                continue

            ## update progress
            annotationCount += 1
            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            ignoredAnnotationsUniprot += queue_entry(goId,evidenceCode,pubmedRefs,
                                                     uniprotEntry,None,taxon,
                                                     toAdd,uniprotIdMap)

            if len(toAdd) >= batchSize:
                insert_rows(toAdd)
                toAdd = []

    print('committing final changes...')
    print('ignored annotations after uniprot... %s'%(ignoredAnnotationsUniprot))
    insert_rows(toAdd)

    ## release the uniprot mapper before the gene mapper is loaded
    del uniprotIdMap

    ## add annotations from gene2go
    gene2goFile = get_gene2go_file()
    ignoredAnnotationsGene = 0
    print("...getting annotations from gene2go")
    geneIdMap = gene_mapper(session)
    toAdd = []

    with open(gene2goFile,'r') as gene2goFid:
        ## skip the header line with the next() builtin instead of calling
        ## the dunder directly (the default keeps an empty file from raising)
        next(gene2goFid,None)

        for line in gene2goFid:
            record = line.rstrip("\n").split("\t")

            if record[0].startswith("#") or len(record) != 8:
                continue

            taxon = record[0]
            ncbiId = record[1]
            goId = record[2]
            evidenceCode = record[3]
            pubmedRefs = record[6]

            ## every well formed gene2go record is counted, even when its
            ## taxon is filtered out just below
            annotationCount += 1

            if taxon not in taxaList:
                continue

            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            ignoredAnnotationsGene += queue_entry(goId,evidenceCode,pubmedRefs,
                                                  None,ncbiId,taxon,
                                                  toAdd,geneIdMap)

            if len(toAdd) >= batchSize:
                insert_rows(toAdd)
                toAdd = []

    print('ignored annotations after gene2go... %s'%(ignoredAnnotationsGene))
    print('committing final changes...')
    insert_rows(toAdd)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    addedStr = "...%s unique go annotation entries were added."%annotationCount
    return timeStr,addedStr,(ignoredAnnotationsUniprot,ignoredAnnotationsGene)