for uquery in upQueries]) upEntry2Taxa = dict([(str(uquery['uniprot_entry']), str(uquery['taxa_id'])) for uquery in upQueries]) print("Test 1: %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart))) ## using direct table access method timeStart = time.time() results = conn.execute( Uniprot.__table__.select(Uniprot.uniprot_entry.in_(uniprotEntries))) upEntry2Gene, upEntry2Taxa = {}, {} for row in results: upEntry2Gene[str(row.uniprot_entry)] = str(row.gene_id) upEntry2Taxa[str(row.uniprot_entry)] = str(row.taxa_id) for key, item in upEntry2Gene.iteritems(): print key, item print("Test 2: %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart))) ## using htsint's mapper timeStart = time.time() uMapper = uniprot_mapper(session, uniprotIdList=uniprotEntries, gene=True, taxa=True) print("Test 3: %s" % time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart))) print(len(uMapper.keys())) print(uMapper[uniprotEntries[0]])
"ACTN4_RAT","ACTN4_MOUSE","ACTN1_CHICK","ACTN_DROME","ACTN1_RAT" "ACTN1_MOUSE","ACTN3_MOUSE","SPTCB_DROME","ACTN2_MOUSE","ACTN2_CHICK"] ## using select method timeStart = time.time() s = select([Uniprot.uniprot_entry,Uniprot.taxa_id,Uniprot.gene_id]).where(Uniprot.uniprot_entry.in_(uniprotEntries)) _upQueries = conn.execute(s) upQueries = _upQueries.fetchall() upEntry2Gene = dict([(str(uquery['uniprot_entry']),str(uquery['gene_id'])) for uquery in upQueries]) upEntry2Taxa = dict([(str(uquery['uniprot_entry']),str(uquery['taxa_id'])) for uquery in upQueries]) print("Test 1: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart))) ## using direct table access method timeStart = time.time() results = conn.execute(Uniprot.__table__.select(Uniprot.uniprot_entry.in_(uniprotEntries))) upEntry2Gene, upEntry2Taxa = {},{} for row in results: upEntry2Gene[str(row.uniprot_entry)] = str(row.gene_id) upEntry2Taxa[str(row.uniprot_entry)] = str(row.taxa_id) for key, item in upEntry2Gene.iteritems(): print key, item print("Test 2: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart))) ## using htsint's mapper timeStart = time.time() uMapper = uniprot_mapper(session,uniprotIdList=uniprotEntries,gene=True,taxa=True) print("Test 3: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart))) print(len(uMapper.keys())) print(uMapper[uniprotEntries[0]])
def create_summarized(self, parsedFilePath, summaryFilePath=None, large=False,
                      uniprot=False, species=None, taxaList=None, hit2gene=None):
    """Summarize a parsed BLAST csv file into a simple results csv.

    htsint uses output 5 (XML) and then parses it into a simple csv file.

    parsedFilePath  - path to the parsed BLAST csv file
    summaryFilePath - output path; defaults to '<parsedFilePath>_summary.csv'
    large           - use True if the parsed file has more than a few hundred hits
    uniprot         - use True if the target database used is a Uniprot database
    species         - if None htsint will try to find the species, otherwise the
                      scientific name given here is used for every hit
    taxaList        - assign when BLAST is against a db other than uniprot
    hit2gene        - a dictionary of keys that match the hits with values that
                      match ncbi gene ids

    Returns the path of the summary csv file written.
    Raises Exception when the parsed file is missing or a taxaList is required
    but not supplied.
    """

    def _hit_id(hitIdLong):
        # hit descriptions look like 'xx sp|P12345|NAME_SPEC ...'; take the
        # last non-empty '|'-separated token of the second whitespace field
        parts = hitIdLong.split(" ")[1].split("|")
        return parts[-2] if parts[-1] == '' else parts[-1]

    # fix: original signature used the shared mutable default 'taxaList=[]'
    taxaList = list(taxaList) if taxaList else []

    ## error checking
    if not os.path.exists(parsedFilePath):
        raise Exception("cannot find parsed file")
    if not uniprot and len(taxaList) == 0:
        raise Exception("databases other than Uniprot must have a taxaList")
    if summaryFilePath is None:
        summaryFilePath = re.sub(r"\.csv", "", parsedFilePath) + "_summary.csv"

    timeStart = time.time()

    ## first pass: extract a unique list of hit ids
    hitEntries = set()
    with open(parsedFilePath, 'r') as fidin:
        reader = csv.reader(fidin)
        next(reader)  # skip header
        for linja in reader:
            hitEntries.add(_hit_id(linja[2]))
    hitEntries = list(hitEntries)

    upEntry2Gene, upEntry2Taxa = {}, {}
    taxaId2Ncbi, gene2id = {}, {}
    if uniprot:
        print("batch querying %s UniProt entries in the database... this may take some time" % (len(hitEntries)))
        if large == False:
            results = self.conn.execute(
                Uniprot.__table__.select(Uniprot.uniprot_entry.in_(hitEntries)))
            for row in results:
                upEntry2Gene[str(row.uniprot_entry)] = str(row.gene_id)
                upEntry2Taxa[str(row.uniprot_entry)] = str(row.taxa_id)
        else:
            ## using htsint's mapper for large batches
            uMapper = uniprot_mapper(self.session, uniprotIdList=hitEntries,
                                     gene=True, taxa=True)
            for key, item in uMapper.items():
                upEntry2Gene[str(key)] = str(item['gene_id'])
                upEntry2Taxa[str(key)] = str(item['taxa_id'])

        ## query the taxa just to double check
        taxaList = [tid for tid in set(upEntry2Taxa.values()) if tid != 'None']
        s = select([Taxon.id, Taxon.ncbi_id, Taxon.name]).where(
            Taxon.id.in_([int(tid) for tid in taxaList]))
        taxaQueries = self.conn.execute(s).fetchall()
        taxaId2Ncbi = dict([(str(tquery['id']), str(tquery['ncbi_id']))
                            for tquery in taxaQueries])

        ## create a single dictionary of all gene information
        for taxaDbId in taxaList:
            s = select([Gene.id, Gene.ncbi_id], Gene.taxa_id == taxaDbId)
            _geneQueries = self.conn.execute(s)
            gene2id.update(dict([(str(r['id']), str(r['ncbi_id']))
                                 for r in _geneQueries.fetchall()]))

    ## second pass: read through the file again and write the summary
    with open(parsedFilePath, 'r') as fidin, open(summaryFilePath, 'w') as fidout:
        reader = csv.reader(fidin)
        writer = csv.writer(fidout)
        writer.writerow(["queryId", "hitId", "hitNcbiId", "hitSpecies",
                         "hitSpeciesNcbiId", "e-value"])
        next(reader)  # skip header
        for linja in reader:
            query, hitIdShort, hitIdLong, eScore, bitScore = linja[:5]
            queryId = query.split(" ")[0]
            hitId = _hit_id(hitIdLong)
            hitNcbiId, hitSpeciesNcbiId = '-', '-'

            ## extract species name (UniProt 'OS=' field or bracketed NCBI style)
            if species:
                hitSpecies = species
            elif re.search(r"OS=.+[A-Z]=", hitIdLong):
                hitSpecies = re.findall(r"OS=.+[A-Z]=", hitIdLong)[0][3:-4]
            elif re.search(r"\[.+\]", hitIdLong):
                hitSpecies = re.findall(r"\[.+\]", hitIdLong)[0][1:-1]
            else:
                print("WARNING: cannot find hitSpecies\n%s" % hitIdLong)
                hitSpecies = '-'

            ## clean species name of any trailing 'X=' header field
            if re.findall(r"[A-Z]=", hitSpecies):
                hitSpecies = hitSpecies[:re.search(r"[A-Z]=", hitSpecies).start() - 2]

            ## map the hit to an ncbi gene id
            if uniprot and hitId in upEntry2Gene and str(upEntry2Gene[hitId]) != 'None':
                if upEntry2Gene[hitId] in gene2id and str(gene2id[upEntry2Gene[hitId]]) != 'None':
                    hitNcbiId = gene2id[upEntry2Gene[hitId]]
            if hit2gene and hitId in hit2gene:
                hitNcbiId = hit2gene[hitId]

            ## get taxa id associated with uniprot id
            if uniprot:
                hitSpeciesNcbiId = '-'
                if hitId in upEntry2Taxa and str(upEntry2Taxa[hitId]) != 'None':
                    if upEntry2Taxa[hitId] in taxaId2Ncbi and str(upEntry2Taxa[hitId]) != 'None':
                        hitSpeciesNcbiId = taxaId2Ncbi[upEntry2Taxa[hitId]]
                    else:
                        print('Were in! but taxaId2Ncbi does not have', upEntry2Taxa[hitId])
            elif len(taxaList) == 1:
                hitSpeciesNcbiId = taxaList[0]

            writer.writerow([queryId, hitId, hitNcbiId, hitSpecies,
                             hitSpeciesNcbiId, eScore])

    print("blast summarize: %s" % time.strftime('%H:%M:%S',
                                                time.gmtime(time.time() - timeStart)))
    return summaryFilePath
def create_summarized(self, parsedFilePath, summaryFilePath=None, large=False,
                      uniprot=False, species=None, taxaList=None, hit2gene=None):
    """Summarize a parsed BLAST csv file into a simple results csv.

    htsint uses output 5 (XML) and then parses it into a simple csv file.

    parsedFilePath  - path to the parsed BLAST csv file
    summaryFilePath - output path; defaults to '<parsedFilePath>_summary.csv'
    large           - use True if the parsed file has more than a few hundred hits
    uniprot         - use True if the target database used is a Uniprot database
    species         - if None htsint will try to find the species, otherwise the
                      scientific name given here is used for every hit
    taxaList        - assign when BLAST is against a db other than uniprot
    hit2gene        - a dictionary of keys that match the hits with values that
                      match ncbi gene ids

    Returns the path of the summary csv file written.
    Raises Exception when the parsed file is missing or a taxaList is required
    but not supplied.
    """

    def _hit_id(hitIdLong):
        # hit descriptions look like 'xx sp|P12345|NAME_SPEC ...'; take the
        # last non-empty '|'-separated token of the second whitespace field
        parts = hitIdLong.split(" ")[1].split("|")
        return parts[-2] if parts[-1] == '' else parts[-1]

    # fix: original signature used the shared mutable default 'taxaList=[]'
    taxaList = list(taxaList) if taxaList else []

    ## error checking
    if not os.path.exists(parsedFilePath):
        raise Exception("cannot find parsed file")
    if not uniprot and len(taxaList) == 0:
        raise Exception("databases other than Uniprot must have a taxaList")
    if summaryFilePath is None:
        summaryFilePath = re.sub(r"\.csv", "", parsedFilePath) + "_summary.csv"

    timeStart = time.time()

    ## first pass: extract a unique list of hit ids
    hitEntries = set()
    with open(parsedFilePath, 'r') as fidin:
        reader = csv.reader(fidin)
        next(reader)  # skip header
        for linja in reader:
            hitEntries.add(_hit_id(linja[2]))
    hitEntries = list(hitEntries)

    upEntry2Gene, upEntry2Taxa = {}, {}
    taxaId2Ncbi, gene2id = {}, {}
    if uniprot:
        print("batch querying %s UniProt entries in the database... this may take some time" % (len(hitEntries)))
        if large == False:
            results = self.conn.execute(
                Uniprot.__table__.select(Uniprot.uniprot_entry.in_(hitEntries)))
            for row in results:
                upEntry2Gene[str(row.uniprot_entry)] = str(row.gene_id)
                upEntry2Taxa[str(row.uniprot_entry)] = str(row.taxa_id)
        else:
            ## using htsint's mapper for large batches
            uMapper = uniprot_mapper(self.session, uniprotIdList=hitEntries,
                                     gene=True, taxa=True)
            for key, item in uMapper.items():
                upEntry2Gene[str(key)] = str(item['gene_id'])
                upEntry2Taxa[str(key)] = str(item['taxa_id'])

        ## query the taxa just to double check
        taxaList = [tid for tid in set(upEntry2Taxa.values()) if tid != 'None']
        s = select([Taxon.id, Taxon.ncbi_id, Taxon.name]).where(
            Taxon.id.in_([int(tid) for tid in taxaList]))
        taxaQueries = self.conn.execute(s).fetchall()
        taxaId2Ncbi = dict([(str(tquery['id']), str(tquery['ncbi_id']))
                            for tquery in taxaQueries])

        ## create a single dictionary of all gene information
        for taxaDbId in taxaList:
            s = select([Gene.id, Gene.ncbi_id], Gene.taxa_id == taxaDbId)
            _geneQueries = self.conn.execute(s)
            gene2id.update(dict([(str(r['id']), str(r['ncbi_id']))
                                 for r in _geneQueries.fetchall()]))

    ## second pass: read through the file again and write the summary
    with open(parsedFilePath, 'r') as fidin, open(summaryFilePath, 'w') as fidout:
        reader = csv.reader(fidin)
        writer = csv.writer(fidout)
        writer.writerow(["queryId", "hitId", "hitNcbiId", "hitSpecies",
                         "hitSpeciesNcbiId", "e-value"])
        next(reader)  # skip header
        for linja in reader:
            query, hitIdShort, hitIdLong, eScore, bitScore = linja[:5]
            queryId = query.split(" ")[0]
            hitId = _hit_id(hitIdLong)
            hitNcbiId, hitSpeciesNcbiId = '-', '-'

            ## extract species name (UniProt 'OS=' field or bracketed NCBI style)
            if species:
                hitSpecies = species
            elif re.search(r"OS=.+[A-Z]=", hitIdLong):
                hitSpecies = re.findall(r"OS=.+[A-Z]=", hitIdLong)[0][3:-4]
            elif re.search(r"\[.+\]", hitIdLong):
                hitSpecies = re.findall(r"\[.+\]", hitIdLong)[0][1:-1]
            else:
                print("WARNING: cannot find hitSpecies\n%s" % hitIdLong)
                hitSpecies = '-'

            ## clean species name of any trailing 'X=' header field
            if re.findall(r"[A-Z]=", hitSpecies):
                hitSpecies = hitSpecies[:re.search(r"[A-Z]=", hitSpecies).start() - 2]

            ## map the hit to an ncbi gene id
            if uniprot and hitId in upEntry2Gene and str(upEntry2Gene[hitId]) != 'None':
                if upEntry2Gene[hitId] in gene2id and str(gene2id[upEntry2Gene[hitId]]) != 'None':
                    hitNcbiId = gene2id[upEntry2Gene[hitId]]
            if hit2gene and hitId in hit2gene:
                hitNcbiId = hit2gene[hitId]

            ## get taxa id associated with uniprot id
            if uniprot:
                hitSpeciesNcbiId = '-'
                if hitId in upEntry2Taxa and str(upEntry2Taxa[hitId]) != 'None':
                    if upEntry2Taxa[hitId] in taxaId2Ncbi and str(upEntry2Taxa[hitId]) != 'None':
                        hitSpeciesNcbiId = taxaId2Ncbi[upEntry2Taxa[hitId]]
                    else:
                        print('Were in! but taxaId2Ncbi does not have', upEntry2Taxa[hitId])
            elif len(taxaList) == 1:
                hitSpeciesNcbiId = taxaList[0]

            writer.writerow([queryId, hitId, hitNcbiId, hitSpecies,
                             hitSpeciesNcbiId, eScore])

    print("blast summarize: %s" % time.strftime('%H:%M:%S',
                                                time.gmtime(time.time() - timeStart)))
    return summaryFilePath