def populate_go_annotations(totalAnnotations, session, engine):
    """
    read the GO annotation files and bulk insert them into the GoAnnotation table
    This will take some time

    Two tab-delimited sources are read in turn:
      1. the gene association file, of which only rows whose first column is
         'UniProtKB' are used. Intended for use with
         http://www.geneontology.org/GO.format.gaf-2_0.shtml
      2. the gene2go file (rows must have exactly 8 columns)

    totalAnnotations - expected number of annotations (progress output only)
    session          - database session handed to the mapper helpers and used
                       to look up GO terms by their alternate id
    engine           - database engine used for the batched inserts

    returns (timeStr, addedStr, (ignoredAnnotationsUniprot, ignoredAnnotationsGene))
    The two ignored counts are the annotations dropped because their taxon
    has no entry in the taxa mapper.
    NOTE: the number reported in addedStr is the progress counter; it can be
    larger than the number of rows inserted because a counted record may
    still be rejected (unknown GO term, uniprot id, gene id or taxon).
    """

    batchSize = 100000
    timeStart = time.time()
    config = Configure()
    taxaList = config.log['taxa']
    annotationFile = get_annotation_file()
    ## a set gives constant time membership tests inside the read loops
    wayPoints = set(int(w) for w in np.linspace(0, totalAnnotations, 20))
    annotationCount = 0

    print("...loading mappers")
    termIdMap = goterm_mapper(session)
    taxaIdMap = taxa_mapper(session)
    uniprotIdMap = uniprot_mapper(session)
    print("...populating rows")

    def commit_rows(rows):
        ## bulk insert a batch (an empty batch is skipped)
        if rows:
            with engine.begin() as connection:
                connection.execute(GoAnnotation.__table__.insert().values(rows))

    def queue_entry(goId, evidenceCode, pubmedRefs, uniprotId, geneId, taxon, toAdd, mapper):
        ## append one row to toAdd when every identifier can be resolved
        ## returns True only when the record was dropped for an unknown taxon
        ## (an int argument incremented in here would never reach the caller)

        ## remove invalid term ids (fall back to the alternate id)
        if goId in termIdMap:
            go_db_id = termIdMap[goId]
        else:
            queryTerm = session.query(GoTerm).filter_by(alternate_id=goId).first()
            if queryTerm is None:
                return False
            go_db_id = queryTerm.id

        ## remove invalid uniprot ids
        uniprot_db_id = None
        if uniprotId:
            if uniprotId not in mapper:
                return False
            uniprot_db_id = mapper[uniprotId]

        ## remove invalid gene ids
        gene_db_id = None
        if geneId:
            if geneId not in mapper:
                return False
            gene_db_id = mapper[geneId]

        ## ignore annotations that have an outdated taxon
        if taxon not in taxaIdMap:
            return True

        toAdd.append({'go_term_id': go_db_id, 'evidence_code': evidenceCode,
                      'pubmed_refs': pubmedRefs, 'uniprot_id': uniprot_db_id,
                      'gene_id': gene_db_id, 'taxa_id': taxaIdMap[taxon]})
        return False

    ## add annotations from uniprot annotation file
    ignoredAnnotationsUniprot = 0
    toAdd = []
    print("...getting annotations from gene_association (uniprot)")
    with open(annotationFile, 'rU') as annotationFid:
        for line in annotationFid:
            record = line.rstrip("\n").split("\t")

            ## keep uniprot entries only ('!' comment lines and blank lines fail this test)
            if record[0] != 'UniProtKB':
                continue

            ## skip truncated rows (the taxon is the 13th column)
            if len(record) < 13:
                continue

            goId = record[4]
            pubmedRefs = record[5]
            evidenceCode = record[6]
            ## parse the uniprot entry (first of the '|' separated names)
            uniprotEntry = record[10].split("|")[0]
            taxon = record[12].replace("taxon:", "")

            ## ignore other taxa and annotations with multiple species
            if taxon not in taxaList or "|" in taxon:
                continue

            ## update progress
            annotationCount += 1
            if annotationCount in wayPoints:
                print("\t%s / %s" % (annotationCount, totalAnnotations))

            if queue_entry(goId, evidenceCode, pubmedRefs, uniprotEntry, None, taxon,
                           toAdd, uniprotIdMap):
                ignoredAnnotationsUniprot += 1

            if len(toAdd) >= batchSize:
                commit_rows(toAdd)
                toAdd = []

    print('committing final changes...')
    print('ignored annotations after uniprot... %s' % (ignoredAnnotationsUniprot))
    commit_rows(toAdd)

    ## release the uniprot mapper before the gene mapper is loaded
    del uniprotIdMap

    ## add annotations from gene2go
    gene2goFile = get_gene2go_file()
    ignoredAnnotationsGene = 0
    with open(gene2goFile, 'rU') as gene2goFid:
        print("...getting annotations from gene2go")
        ## skip the header line (tolerates an empty file)
        next(gene2goFid, None)
        geneIdMap = gene_mapper(session)
        toAdd = []

        for line in gene2goFid:
            record = line.rstrip("\n").split("\t")

            if record[0].startswith("#") or len(record) != 8:
                continue

            taxon = record[0]
            ncbiId = record[1]
            goId = record[2]
            evidenceCode = record[3]
            pubmedRefs = record[6]

            ## every well formed row counts toward progress, even for other taxa
            annotationCount += 1

            if taxon not in taxaList:
                continue

            if annotationCount in wayPoints:
                print("\t%s / %s" % (annotationCount, totalAnnotations))

            if queue_entry(goId, evidenceCode, pubmedRefs, None, ncbiId, taxon,
                           toAdd, geneIdMap):
                ignoredAnnotationsGene += 1

            if len(toAdd) >= batchSize:
                commit_rows(toAdd)
                toAdd = []

    print('ignored annotations after gene2go... %s' % (ignoredAnnotationsGene))
    print('committing final changes...')
    commit_rows(toAdd)

    timeStr = "...total time taken: %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart))
    addedStr = "...%s unique go annotation entries were added." % annotationCount
    return timeStr, addedStr, (ignoredAnnotationsUniprot, ignoredAnnotationsGene)
def populate_uniprot_table(lineCount, session, engine):
    """
    populate the uniprot table with entries from idmappings

    The idmapping file is read as tab-delimited rows of exactly three columns
    (uniprot accession, id type, value).  Only the id types 'UniProtKB-ID',
    'NCBI_TaxID', 'GeneID' and 'RefSeq' are used.  Rows are grouped by their
    UniProtKB-ID value and inserted into the Uniprot table in blocks.

    lineCount - expected number of lines (progress output only)
    session   - database session handed to the mapper helpers
    engine    - database engine used for the batched inserts

    returns (timeStr, addedStr)
    NOTE: the number reported in addedStr counts the entries collected from
    the file; entries whose taxon is not in config.log['taxa'] are counted
    but not inserted.
    """

    config = Configure()
    taxaList = config.log['taxa']
    timeStart = time.time()
    totalLines, totalRecords = 0, 0
    idmappingFile = get_idmapping_file()
    ac2kbMap, toAdd = {}, {}
    ## a set gives constant time membership tests inside the read loop
    wayPoints = set(int(w) for w in np.linspace(0, lineCount, 20))
    usedIdTypes = ('NCBI_TaxID', 'GeneID', 'UniProtKB-ID', 'RefSeq')

    print("getting mappers...")
    geneIdMap = gene_mapper(session)
    taxonIdMap = taxa_mapper(session)
    print("mappers loaded... %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))

    def join_values(values):
        ## None when empty, otherwise the values joined by ';'
        ## (sorted so the stored string does not depend on set ordering)
        return ";".join(sorted(values)) if values else None

    def lookup_gene_id(rawGeneId):
        ## convert the gene id to a database key (None when it cannot be resolved)
        if rawGeneId is None:
            return None
        if rawGeneId in geneIdMap:
            return geneIdMap[rawGeneId]

        ## the value may hold several ';' separated ids; the last known one wins
        db_gene_id = None
        for _ncid in rawGeneId.split(";"):
            _gid = re.sub(r"\s+", "", _ncid)
            if _gid in geneIdMap:
                ## store the database key, not the raw ncbi id
                db_gene_id = geneIdMap[_gid]
        return db_gene_id

    def queue_entries(entries):
        ## convert the collected entries into rows and bulk insert them
        toCommit = []
        for uniprotKbEntry, entry in entries.items():
            ncbiTaxaId = entry['ncbi-taxa-id']

            ## ensure we are in appropriate taxa
            if ncbiTaxaId not in taxaList:
                continue

            ## convert the taxa id to a database key
            db_taxa_id = None
            if ncbiTaxaId and ncbiTaxaId in taxonIdMap:
                db_taxa_id = taxonIdMap[ncbiTaxaId]

            toCommit.append({'uniprot_ac': join_values(entry['uniprot-ac']),
                             'uniprot_entry': uniprotKbEntry,
                             'refseq': join_values(entry['refseq']),
                             'taxa_id': db_taxa_id,
                             'gene_id': lookup_gene_id(entry['gene-id'])})

        if toCommit:
            with engine.begin() as connection:
                connection.execute(Uniprot.__table__.insert().values(toCommit))

    ## parse the idmapping file into the db
    with open(idmappingFile, 'rb') as idmappingFid:
        reader = csv.reader(idmappingFid, delimiter="\t")
        for record in reader:
            if len(record) != 3:
                continue

            uniprotKbAc, idType, value = record
            totalLines += 1
            if totalLines in wayPoints:
                print("\t%s / %s" % (totalLines, lineCount))

            if idType not in usedIdTypes:
                continue

            ## the first UniProtKB-ID seen for an accession names its entry
            if idType == 'UniProtKB-ID' and uniprotKbAc not in ac2kbMap:
                ac2kbMap[uniprotKbAc] = value

            ## skip the XXXX-1 like uniprot ac
            if uniprotKbAc not in ac2kbMap:
                continue

            ## get current key (an empty entry name cannot be used as a key)
            uniprotKbEntry = ac2kbMap[uniprotKbAc]
            if not uniprotKbEntry:
                continue

            ## make new entry if necessary
            if uniprotKbEntry not in toAdd:

                ## queue entries in blocks
                totalRecords += 1
                if totalRecords % 100000 == 0:
                    queue_entries(toAdd)
                    toAdd, ac2kbMap = {}, {}
                    ac2kbMap[uniprotKbAc] = uniprotKbEntry

                toAdd[uniprotKbEntry] = {'ncbi-taxa-id': None,
                                         'gene-id': None,
                                         'uniprot-ac': set(),
                                         'refseq': set()}

            ## populate uniprot dictionary
            entry = toAdd[uniprotKbEntry]
            entry['uniprot-ac'].add(uniprotKbAc)
            if not value:
                continue
            if idType == 'NCBI_TaxID':
                entry['ncbi-taxa-id'] = value
            elif idType == 'GeneID':
                entry['gene-id'] = value
            elif idType == 'RefSeq':
                entry['refseq'].add(value)

    ## queue any remaining
    if toAdd:
        queue_entries(toAdd)

    timeStr = "...total time taken: %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart))
    addedStr = "...%s unique uniprot entries were added." % totalRecords
    return timeStr, addedStr
def populate_gene_table(geneInfoCount, session, engine):
    """
    populate the gene table from the gene_info data

    Reads the tab-delimited file 'gene_info.db' found in config.log['data'],
    keeps the rows whose taxon (first column) is in config.log['taxa'] and
    inserts them into the Gene table in batches of 200000.  Rows whose taxon
    has no entry in the taxa mapper are dropped just before each insert.

    geneInfoCount - expected number of records (progress output only)
    session       - database session handed to the taxa mapper
    engine        - database engine used for the batched inserts

    returns (timeStr, addedStr)
    NOTE: the number reported in addedStr counts the rows queued; rows that
    are dropped for an unmapped taxon are still counted.
    """

    batchSize = 200000
    config = Configure()
    taxaList = config.log['taxa']
    timeStart = time.time()
    toAdd = []
    totalRecords = 0
    total = geneInfoCount
    ## a set gives constant time membership tests inside the read loop
    wayPoints = set(int(w) for w in np.linspace(0, total, 20))
    geneInfoFile = os.path.join(config.log['data'], "gene_info.db")
    taxaIdMap = taxa_mapper(session)

    def commit_rows(rows):
        ## swap each ncbi taxa id for its database key and bulk insert the batch
        ## rows with an unmapped taxon are left out (single pass, no list.remove)
        mapped = []
        for row in rows:
            if row['taxa_id'] in taxaIdMap:
                row['taxa_id'] = taxaIdMap[row['taxa_id']]
                mapped.append(row)
        if mapped:
            with engine.begin() as connection:
                connection.execute(Gene.__table__.insert().values(mapped))

    with open(geneInfoFile, 'rU') as geneInfoFid:
        ## skip the header line (tolerates an empty file)
        next(geneInfoFid, None)

        for line in geneInfoFid:
            record = line.rstrip("\n").split("\t")

            if record[0].startswith("#"):
                continue

            taxId = record[0]
            if taxId not in taxaList:
                continue

            ## skip truncated rows (the description is the 9th column)
            if len(record) < 9:
                continue

            ## define the table entry
            toAdd.append({'ncbi_id': record[1], 'description': record[8],
                          'symbol': record[2], 'synonyms': record[4],
                          'chromosome': record[6], 'map_location': record[7],
                          'taxa_id': taxId})
            totalRecords += 1

            if len(toAdd) >= batchSize:
                commit_rows(toAdd)
                toAdd = []

            ## show progress
            if totalRecords in wayPoints:
                print("\t%s / %s" % (totalRecords, total))

    print('committing changes...')
    commit_rows(toAdd)

    timeStr = "...total time taken: %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart))
    addedStr = "...%s unique genes were added." % totalRecords
    return timeStr, addedStr
def populate_go_annotations(totalAnnotations, session, engine):
    """
    read the GO annotation files and bulk insert them into the GoAnnotation table
    This will take some time

    Two tab-delimited sources are read in turn:
      1. the gene association file, of which only rows whose first column is
         'UniProtKB' are used. Intended for use with
         http://www.geneontology.org/GO.format.gaf-2_0.shtml
      2. the gene2go file (rows must have exactly 8 columns)

    totalAnnotations - expected number of annotations (progress output only)
    session          - database session handed to the mapper helpers and used
                       to look up GO terms by their alternate id
    engine           - database engine used for the batched inserts

    returns (timeStr, addedStr, (ignoredAnnotationsUniprot, ignoredAnnotationsGene))
    The two ignored counts are the annotations dropped because their taxon
    has no entry in the taxa mapper.
    NOTE: the number reported in addedStr is the progress counter; it can be
    larger than the number of rows inserted because a counted record may
    still be rejected (unknown GO term, uniprot id, gene id or taxon).
    """

    batchSize = 100000
    timeStart = time.time()
    config = Configure()
    taxaList = config.log['taxa']
    annotationFile = get_annotation_file()
    ## a set gives constant time membership tests inside the read loops
    wayPoints = set(int(w) for w in np.linspace(0, totalAnnotations, 20))
    annotationCount = 0

    print("...loading mappers")
    termIdMap = goterm_mapper(session)
    taxaIdMap = taxa_mapper(session)
    uniprotIdMap = uniprot_mapper(session)
    print("...populating rows")

    def commit_rows(rows):
        ## bulk insert a batch (an empty batch is skipped)
        if rows:
            with engine.begin() as connection:
                connection.execute(GoAnnotation.__table__.insert().values(rows))

    def queue_entry(goId, evidenceCode, pubmedRefs, uniprotId, geneId, taxon, toAdd, mapper):
        ## append one row to toAdd when every identifier can be resolved
        ## returns True only when the record was dropped for an unknown taxon
        ## (an int argument incremented in here would never reach the caller)

        ## remove invalid term ids (fall back to the alternate id)
        if goId in termIdMap:
            go_db_id = termIdMap[goId]
        else:
            queryTerm = session.query(GoTerm).filter_by(alternate_id=goId).first()
            if queryTerm is None:
                return False
            go_db_id = queryTerm.id

        ## remove invalid uniprot ids
        uniprot_db_id = None
        if uniprotId:
            if uniprotId not in mapper:
                return False
            uniprot_db_id = mapper[uniprotId]

        ## remove invalid gene ids
        gene_db_id = None
        if geneId:
            if geneId not in mapper:
                return False
            gene_db_id = mapper[geneId]

        ## ignore annotations that have an outdated taxon
        if taxon not in taxaIdMap:
            return True

        toAdd.append({'go_term_id': go_db_id, 'evidence_code': evidenceCode,
                      'pubmed_refs': pubmedRefs, 'uniprot_id': uniprot_db_id,
                      'gene_id': gene_db_id, 'taxa_id': taxaIdMap[taxon]})
        return False

    ## add annotations from uniprot annotation file
    ignoredAnnotationsUniprot = 0
    toAdd = []
    print("...getting annotations from gene_association (uniprot)")
    with open(annotationFile, 'rU') as annotationFid:
        for line in annotationFid:
            record = line.rstrip("\n").split("\t")

            ## keep uniprot entries only ('!' comment lines and blank lines fail this test)
            if record[0] != 'UniProtKB':
                continue

            ## skip truncated rows (the taxon is the 13th column)
            if len(record) < 13:
                continue

            goId = record[4]
            pubmedRefs = record[5]
            evidenceCode = record[6]
            ## parse the uniprot entry (first of the '|' separated names)
            uniprotEntry = record[10].split("|")[0]
            taxon = record[12].replace("taxon:", "")

            ## ignore other taxa and annotations with multiple species
            if taxon not in taxaList or "|" in taxon:
                continue

            ## update progress
            annotationCount += 1
            if annotationCount in wayPoints:
                print("\t%s / %s" % (annotationCount, totalAnnotations))

            if queue_entry(goId, evidenceCode, pubmedRefs, uniprotEntry, None, taxon,
                           toAdd, uniprotIdMap):
                ignoredAnnotationsUniprot += 1

            if len(toAdd) >= batchSize:
                commit_rows(toAdd)
                toAdd = []

    print('committing final changes...')
    print('ignored annotations after uniprot... %s' % (ignoredAnnotationsUniprot))
    commit_rows(toAdd)

    ## release the uniprot mapper before the gene mapper is loaded
    del uniprotIdMap

    ## add annotations from gene2go
    gene2goFile = get_gene2go_file()
    ignoredAnnotationsGene = 0
    with open(gene2goFile, 'rU') as gene2goFid:
        print("...getting annotations from gene2go")
        ## skip the header line (tolerates an empty file)
        next(gene2goFid, None)
        geneIdMap = gene_mapper(session)
        toAdd = []

        for line in gene2goFid:
            record = line.rstrip("\n").split("\t")

            if record[0].startswith("#") or len(record) != 8:
                continue

            taxon = record[0]
            ncbiId = record[1]
            goId = record[2]
            evidenceCode = record[3]
            pubmedRefs = record[6]

            ## every well formed row counts toward progress, even for other taxa
            annotationCount += 1

            if taxon not in taxaList:
                continue

            if annotationCount in wayPoints:
                print("\t%s / %s" % (annotationCount, totalAnnotations))

            if queue_entry(goId, evidenceCode, pubmedRefs, None, ncbiId, taxon,
                           toAdd, geneIdMap):
                ignoredAnnotationsGene += 1

            if len(toAdd) >= batchSize:
                commit_rows(toAdd)
                toAdd = []

    print('ignored annotations after gene2go... %s' % (ignoredAnnotationsGene))
    print('committing final changes...')
    commit_rows(toAdd)

    timeStr = "...total time taken: %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart))
    addedStr = "...%s unique go annotation entries were added." % annotationCount
    return timeStr, addedStr, (ignoredAnnotationsUniprot, ignoredAnnotationsGene)
def populate_uniprot_table(lineCount, session, engine):
    """
    populate the uniprot table with entries from idmappings

    The idmapping file is read as tab-delimited rows of exactly three columns
    (uniprot accession, id type, value).  Only the id types 'UniProtKB-ID',
    'NCBI_TaxID', 'GeneID' and 'RefSeq' are used.  Rows are grouped by their
    UniProtKB-ID value and inserted into the Uniprot table in blocks.

    lineCount - expected number of lines (progress output only)
    session   - database session handed to the mapper helpers
    engine    - database engine used for the batched inserts

    returns (timeStr, addedStr)
    NOTE: the number reported in addedStr counts the entries collected from
    the file; entries whose taxon is not in config.log['taxa'] are counted
    but not inserted.
    """

    config = Configure()
    taxaList = config.log['taxa']
    timeStart = time.time()
    totalLines, totalRecords = 0, 0
    idmappingFile = get_idmapping_file()
    ac2kbMap, toAdd = {}, {}
    ## a set gives constant time membership tests inside the read loop
    wayPoints = set(int(w) for w in np.linspace(0, lineCount, 20))
    usedIdTypes = ('NCBI_TaxID', 'GeneID', 'UniProtKB-ID', 'RefSeq')

    print("getting mappers...")
    geneIdMap = gene_mapper(session)
    taxonIdMap = taxa_mapper(session)
    print("mappers loaded... %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))

    def join_values(values):
        ## None when empty, otherwise the values joined by ';'
        ## (sorted so the stored string does not depend on set ordering)
        return ";".join(sorted(values)) if values else None

    def lookup_gene_id(rawGeneId):
        ## convert the gene id to a database key (None when it cannot be resolved)
        if rawGeneId is None:
            return None
        if rawGeneId in geneIdMap:
            return geneIdMap[rawGeneId]

        ## the value may hold several ';' separated ids; the last known one wins
        db_gene_id = None
        for _ncid in rawGeneId.split(";"):
            _gid = re.sub(r"\s+", "", _ncid)
            if _gid in geneIdMap:
                ## store the database key, not the raw ncbi id
                db_gene_id = geneIdMap[_gid]
        return db_gene_id

    def queue_entries(entries):
        ## convert the collected entries into rows and bulk insert them
        toCommit = []
        for uniprotKbEntry, entry in entries.items():
            ncbiTaxaId = entry['ncbi-taxa-id']

            ## ensure we are in appropriate taxa
            if ncbiTaxaId not in taxaList:
                continue

            ## convert the taxa id to a database key
            db_taxa_id = None
            if ncbiTaxaId and ncbiTaxaId in taxonIdMap:
                db_taxa_id = taxonIdMap[ncbiTaxaId]

            toCommit.append({'uniprot_ac': join_values(entry['uniprot-ac']),
                             'uniprot_entry': uniprotKbEntry,
                             'refseq': join_values(entry['refseq']),
                             'taxa_id': db_taxa_id,
                             'gene_id': lookup_gene_id(entry['gene-id'])})

        if toCommit:
            with engine.begin() as connection:
                connection.execute(Uniprot.__table__.insert().values(toCommit))

    ## parse the idmapping file into the db
    with open(idmappingFile, 'rb') as idmappingFid:
        reader = csv.reader(idmappingFid, delimiter="\t")
        for record in reader:
            if len(record) != 3:
                continue

            uniprotKbAc, idType, value = record
            totalLines += 1
            if totalLines in wayPoints:
                print("\t%s / %s" % (totalLines, lineCount))

            if idType not in usedIdTypes:
                continue

            ## the first UniProtKB-ID seen for an accession names its entry
            if idType == 'UniProtKB-ID' and uniprotKbAc not in ac2kbMap:
                ac2kbMap[uniprotKbAc] = value

            ## skip the XXXX-1 like uniprot ac
            if uniprotKbAc not in ac2kbMap:
                continue

            ## get current key (an empty entry name cannot be used as a key)
            uniprotKbEntry = ac2kbMap[uniprotKbAc]
            if not uniprotKbEntry:
                continue

            ## make new entry if necessary
            if uniprotKbEntry not in toAdd:

                ## queue entries in blocks
                totalRecords += 1
                if totalRecords % 100000 == 0:
                    queue_entries(toAdd)
                    toAdd, ac2kbMap = {}, {}
                    ac2kbMap[uniprotKbAc] = uniprotKbEntry

                toAdd[uniprotKbEntry] = {'ncbi-taxa-id': None,
                                         'gene-id': None,
                                         'uniprot-ac': set(),
                                         'refseq': set()}

            ## populate uniprot dictionary
            entry = toAdd[uniprotKbEntry]
            entry['uniprot-ac'].add(uniprotKbAc)
            if not value:
                continue
            if idType == 'NCBI_TaxID':
                entry['ncbi-taxa-id'] = value
            elif idType == 'GeneID':
                entry['gene-id'] = value
            elif idType == 'RefSeq':
                entry['refseq'].add(value)

    ## queue any remaining
    if toAdd:
        queue_entries(toAdd)

    timeStr = "...total time taken: %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart))
    addedStr = "...%s unique uniprot entries were added." % totalRecords
    return timeStr, addedStr
def populate_gene_table(geneInfoCount, session, engine):
    """
    populate the gene table from the gene_info data

    Reads the tab-delimited file 'gene_info.db' found in config.log['data'],
    keeps the rows whose taxon (first column) is in config.log['taxa'] and
    inserts them into the Gene table in batches of 200000.  Rows whose taxon
    has no entry in the taxa mapper are dropped just before each insert.

    geneInfoCount - expected number of records (progress output only)
    session       - database session handed to the taxa mapper
    engine        - database engine used for the batched inserts

    returns (timeStr, addedStr)
    NOTE: the number reported in addedStr counts the rows queued; rows that
    are dropped for an unmapped taxon are still counted.
    """

    batchSize = 200000
    config = Configure()
    taxaList = config.log['taxa']
    timeStart = time.time()
    toAdd = []
    totalRecords = 0
    total = geneInfoCount
    ## a set gives constant time membership tests inside the read loop
    wayPoints = set(int(w) for w in np.linspace(0, total, 20))
    geneInfoFile = os.path.join(config.log['data'], "gene_info.db")
    taxaIdMap = taxa_mapper(session)

    def commit_rows(rows):
        ## swap each ncbi taxa id for its database key and bulk insert the batch
        ## rows with an unmapped taxon are left out (single pass, no list.remove)
        mapped = []
        for row in rows:
            if row['taxa_id'] in taxaIdMap:
                row['taxa_id'] = taxaIdMap[row['taxa_id']]
                mapped.append(row)
        if mapped:
            with engine.begin() as connection:
                connection.execute(Gene.__table__.insert().values(mapped))

    with open(geneInfoFile, 'rU') as geneInfoFid:
        ## skip the header line (tolerates an empty file)
        next(geneInfoFid, None)

        for line in geneInfoFid:
            record = line.rstrip("\n").split("\t")

            if record[0].startswith("#"):
                continue

            taxId = record[0]
            if taxId not in taxaList:
                continue

            ## skip truncated rows (the description is the 9th column)
            if len(record) < 9:
                continue

            ## define the table entry
            toAdd.append({'ncbi_id': record[1], 'description': record[8],
                          'symbol': record[2], 'synonyms': record[4],
                          'chromosome': record[6], 'map_location': record[7],
                          'taxa_id': taxId})
            totalRecords += 1

            if len(toAdd) >= batchSize:
                commit_rows(toAdd)
                toAdd = []

            ## show progress
            if totalRecords in wayPoints:
                print("\t%s / %s" % (totalRecords, total))

    print('committing changes...')
    commit_rows(toAdd)

    timeStr = "...total time taken: %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart))
    addedStr = "...%s unique genes were added." % totalRecords
    return timeStr, addedStr