def populate_go_annotations(totalAnnotations,session,engine):
    """
    Load GO annotations into the GoAnnotation table.

    Annotations are read from two sources, in this order:
      1. the gene_association (uniprot) file, which is intended to follow
         http://www.geneontology.org/GO.format.gaf-2_0.shtml
      2. the gene2go file

    Rows are inserted in batches, so this will take some time.

    Arguments:
        totalAnnotations - expected number of annotations; it is only used
                           to print progress way points
        session          - session used to build the id mappers and to look
                           up GO terms by alternate id
        engine           - engine whose transactions run the bulk inserts

    Returns:
        (timeStr, addedStr, (ignoredAnnotationsUniprot, ignoredAnnotationsGene))
        where the ignored counts are the annotations dropped because their
        taxon was not found in the taxa mapper.
    """

    timeStart = time.time()
    batchSize = 100000
    config = Configure()
    taxaList = config.log['taxa']
    annotationFile = get_annotation_file()
    wayPoints = set(int(w) for w in np.linspace(0,totalAnnotations,20))
    annotationCount = 0

    print("...loading mappers")
    termIdMap = goterm_mapper(session)
    taxaIdMap = taxa_mapper(session)
    uniprotIdMap = uniprot_mapper(session)
    print("...populating rows")

    def queue_entry(goId,evidenceCode,pubmedRefs,uniprotId,geneId,taxon,toAdd,mapper):
        """
        Append one row to toAdd when every id can be resolved.

        Returns True only when the annotation was dropped because of an
        outdated taxon.  The caller keeps the counter: an int passed in and
        incremented here would never be seen outside this function.
        """

        ## remove invalid term ids (alternate ids are tried before giving up)
        if goId in termIdMap:
            go_db_id = termIdMap[goId]
        else:
            queryTerm = session.query(GoTerm).filter_by(alternate_id=goId).first()
            if queryTerm is None:
                return False
            go_db_id = queryTerm.id

        ## remove invalid uniprot ids
        uniprot_db_id = None
        if uniprotId:
            if uniprotId not in mapper:
                return False
            uniprot_db_id = mapper[uniprotId]

        ## remove invalid gene ids
        gene_db_id = None
        if geneId:
            if geneId not in mapper:
                return False
            gene_db_id = mapper[geneId]

        ## ignore annotations that have an outdated taxon
        if taxon not in taxaIdMap:
            return True

        toAdd.append({'go_term_id':go_db_id,'evidence_code':evidenceCode,
                      'pubmed_refs':pubmedRefs,'uniprot_id':uniprot_db_id,
                      'gene_id':gene_db_id,'taxa_id':taxaIdMap[taxon]})
        return False

    def flush_rows(rows):
        """Bulk insert the queued rows in one transaction (no-op when empty)."""
        if not rows:
            return
        with engine.begin() as connection:
            connection.execute(GoAnnotation.__table__.insert().values(rows))

    ## add annotations from uniprot annotation file
    ignoredAnnotationsUniprot = 0
    toAdd = []
    print("...getting annotations from gene_association (uniprot)")

    ## plain 'r' is used because the 'U' flag was removed in Python 3.11
    with open(annotationFile,'r') as annotationFid:
        for record in annotationFid:
            ## strip only line endings so an unterminated last line stays intact
            record = record.rstrip("\r\n").split("\t")

            ## keep uniprot entries only ("!" header lines and blank lines
            ## fail this test too) and skip rows that are too short to parse
            if record[0] != 'UniProtKB' or len(record) < 15:
                continue

            goId = record[4]
            pubmedRefs = record[5]
            evidenceCode = record[6]
            taxon = record[12].replace("taxon:","")

            if taxon not in taxaList:
                continue

            ## parse the uniprot Entry (keep the text before the first pipe)
            uniprotEntry = record[10].split("|")[0]

            ## ignore annotations with multiple species
            if "|" in taxon:
                continue

            ## update progress
            annotationCount += 1
            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            if queue_entry(goId,evidenceCode,pubmedRefs,uniprotEntry,None,taxon,
                           toAdd,uniprotIdMap):
                ignoredAnnotationsUniprot += 1

            if len(toAdd) >= batchSize:
                flush_rows(toAdd)
                toAdd = []

    print('committing final changes...')
    print('ignored annotations after uniprot... %s'%(ignoredAnnotationsUniprot))
    flush_rows(toAdd)

    ## release the uniprot mapper before the gene mapper is loaded
    del uniprotIdMap

    ## add annotations from gene2go
    gene2goFile = get_gene2go_file()
    ignoredAnnotationsGene = 0
    print("...getting annotations from gene2go")
    geneIdMap = gene_mapper(session)
    toAdd = []

    with open(gene2goFile,'r') as gene2goFid:
        ## skip the header line (an empty file is tolerated)
        next(gene2goFid,None)

        for record in gene2goFid:
            record = record.rstrip("\n").split("\t")

            if record[0].startswith("#") or len(record) != 8:
                continue

            taxon = record[0]
            ncbiId = record[1]
            goId = record[2]
            evidenceCode = record[3]
            pubmedRefs = record[6]

            ## NOTE(review): unlike the uniprot pass, rows are counted before
            ## the taxa filter; kept as is because totalAnnotations is
            ## computed outside this function -- confirm the intended count
            annotationCount += 1

            if taxon not in taxaList:
                continue

            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            if queue_entry(goId,evidenceCode,pubmedRefs,None,ncbiId,taxon,
                           toAdd,geneIdMap):
                ignoredAnnotationsGene += 1

            if len(toAdd) >= batchSize:
                flush_rows(toAdd)
                toAdd = []

    print('ignored annotations after gene2go... %s'%(ignoredAnnotationsGene))
    print('committing final changes...')
    flush_rows(toAdd)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    ## NOTE(review): annotationCount counts processed records, including the
    ## ones queue_entry rejected, so the message may overstate what was added
    addedStr = "...%s unique go annotation entries were added."%annotationCount
    return timeStr,addedStr,(ignoredAnnotationsUniprot,ignoredAnnotationsGene)
#!/usr/bin/python

import time
import csv
import re
import sys
import numpy as np
from htsint.database import get_annotation_file, get_gene2go_file

## scan the gene_association (uniprot) file and pull the uniprot entry name
## out of every UniProtKB row
annotationFile = get_annotation_file()
annotsCount = 0
annotatedIds = {}
annots1,annots2 = 0,0

## the with block closes the file even if a row fails to parse; plain 'r' is
## used because the 'U' flag was removed in Python 3.11
with open(annotationFile,'r') as annotationFid:
    for record in annotationFid:
        ## strip only line endings so an unterminated last line stays intact
        record = record.rstrip("\r\n").split("\t")

        ## keep UniProtKB rows only ("!" header lines and blank lines fail
        ## this test too) and skip rows too short to hold the entry column
        if record[0] != 'UniProtKB' or len(record) < 11:
            continue

        ## the entry name is the text before the first pipe
        uniprotEntry = record[10].split("|")[0]

        ## debugging aid kept from the original script
        #for i,r in enumerate(record):
        #    print i,r
        #if annots1 == 300:
        #    sys.exit()
#!/usr/bin/python import time, csv, re, sys import numpy as np from htsint.database import get_annotation_file, get_gene2go_file annotationFile = get_annotation_file() annotationFid = open(annotationFile, 'rU') annotsCount = 0 annotatedIds = {} annots1, annots2 = 0, 0 for record in annotationFid: record = record[:-1].split("\t") if record[0][0] == "!": continue if record[0] != 'UniProtKB': continue uniprotEntry = record[10] if re.search("\|", uniprotEntry): uniprotEntry = re.split("\|", uniprotEntry)[0] #for i,r in enumerate(record): # print i,r #if annots1 == 300: # sys.exit() if re.search("\-1", record[1]):
def populate_go_annotations(totalAnnotations,session,engine):
    """
    Load GO annotations into the GoAnnotation table.

    Annotations are read from two sources, in this order:
      1. the gene_association (uniprot) file, which is intended to follow
         http://www.geneontology.org/GO.format.gaf-2_0.shtml
      2. the gene2go file

    Rows are inserted in batches, so this will take some time.

    Arguments:
        totalAnnotations - expected number of annotations; it is only used
                           to print progress way points
        session          - session used to build the id mappers and to look
                           up GO terms by alternate id
        engine           - engine whose transactions run the bulk inserts

    Returns:
        (timeStr, addedStr, (ignoredAnnotationsUniprot, ignoredAnnotationsGene))
        where the ignored counts are the annotations dropped because their
        taxon was not found in the taxa mapper.
    """

    timeStart = time.time()
    batchSize = 100000
    config = Configure()
    taxaList = config.log['taxa']
    annotationFile = get_annotation_file()
    wayPoints = set(int(w) for w in np.linspace(0,totalAnnotations,20))
    annotationCount = 0

    print("...loading mappers")
    termIdMap = goterm_mapper(session)
    taxaIdMap = taxa_mapper(session)
    uniprotIdMap = uniprot_mapper(session)
    print("...populating rows")

    def queue_entry(goId,evidenceCode,pubmedRefs,uniprotId,geneId,taxon,toAdd,mapper):
        """
        Append one row to toAdd when every id can be resolved.

        Returns True only when the annotation was dropped because of an
        outdated taxon.  The caller keeps the counter: an int passed in and
        incremented here would never be seen outside this function.
        """

        ## remove invalid term ids (alternate ids are tried before giving up)
        if goId in termIdMap:
            go_db_id = termIdMap[goId]
        else:
            queryTerm = session.query(GoTerm).filter_by(alternate_id=goId).first()
            if queryTerm is None:
                return False
            go_db_id = queryTerm.id

        ## remove invalid uniprot ids
        uniprot_db_id = None
        if uniprotId:
            if uniprotId not in mapper:
                return False
            uniprot_db_id = mapper[uniprotId]

        ## remove invalid gene ids
        gene_db_id = None
        if geneId:
            if geneId not in mapper:
                return False
            gene_db_id = mapper[geneId]

        ## ignore annotations that have an outdated taxon
        if taxon not in taxaIdMap:
            return True

        toAdd.append({'go_term_id':go_db_id,'evidence_code':evidenceCode,
                      'pubmed_refs':pubmedRefs,'uniprot_id':uniprot_db_id,
                      'gene_id':gene_db_id,'taxa_id':taxaIdMap[taxon]})
        return False

    def flush_rows(rows):
        """Bulk insert the queued rows in one transaction (no-op when empty)."""
        if not rows:
            return
        with engine.begin() as connection:
            connection.execute(GoAnnotation.__table__.insert().values(rows))

    ## add annotations from uniprot annotation file
    ignoredAnnotationsUniprot = 0
    toAdd = []
    print("...getting annotations from gene_association (uniprot)")

    ## plain 'r' is used because the 'U' flag was removed in Python 3.11
    with open(annotationFile,'r') as annotationFid:
        for record in annotationFid:
            ## strip only line endings so an unterminated last line stays intact
            record = record.rstrip("\r\n").split("\t")

            ## keep uniprot entries only ("!" header lines and blank lines
            ## fail this test too) and skip rows that are too short to parse
            if record[0] != 'UniProtKB' or len(record) < 15:
                continue

            goId = record[4]
            pubmedRefs = record[5]
            evidenceCode = record[6]
            taxon = record[12].replace("taxon:","")

            if taxon not in taxaList:
                continue

            ## parse the uniprot Entry (keep the text before the first pipe)
            uniprotEntry = record[10].split("|")[0]

            ## ignore annotations with multiple species
            if "|" in taxon:
                continue

            ## update progress
            annotationCount += 1
            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            if queue_entry(goId,evidenceCode,pubmedRefs,uniprotEntry,None,taxon,
                           toAdd,uniprotIdMap):
                ignoredAnnotationsUniprot += 1

            if len(toAdd) >= batchSize:
                flush_rows(toAdd)
                toAdd = []

    print('committing final changes...')
    print('ignored annotations after uniprot... %s'%(ignoredAnnotationsUniprot))
    flush_rows(toAdd)

    ## release the uniprot mapper before the gene mapper is loaded
    del uniprotIdMap

    ## add annotations from gene2go
    gene2goFile = get_gene2go_file()
    ignoredAnnotationsGene = 0
    print("...getting annotations from gene2go")
    geneIdMap = gene_mapper(session)
    toAdd = []

    with open(gene2goFile,'r') as gene2goFid:
        ## skip the header line (an empty file is tolerated)
        next(gene2goFid,None)

        for record in gene2goFid:
            record = record.rstrip("\n").split("\t")

            if record[0].startswith("#") or len(record) != 8:
                continue

            taxon = record[0]
            ncbiId = record[1]
            goId = record[2]
            evidenceCode = record[3]
            pubmedRefs = record[6]

            ## NOTE(review): unlike the uniprot pass, rows are counted before
            ## the taxa filter; kept as is because totalAnnotations is
            ## computed outside this function -- confirm the intended count
            annotationCount += 1

            if taxon not in taxaList:
                continue

            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            if queue_entry(goId,evidenceCode,pubmedRefs,None,ncbiId,taxon,
                           toAdd,geneIdMap):
                ignoredAnnotationsGene += 1

            if len(toAdd) >= batchSize:
                flush_rows(toAdd)
                toAdd = []

    print('ignored annotations after gene2go... %s'%(ignoredAnnotationsGene))
    print('committing final changes...')
    flush_rows(toAdd)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    ## NOTE(review): annotationCount counts processed records, including the
    ## ones queue_entry rejected, so the message may overstate what was added
    addedStr = "...%s unique go annotation entries were added."%annotationCount
    return timeStr,addedStr,(ignoredAnnotationsUniprot,ignoredAnnotationsGene)