Example #1
0
                     for uquery in upQueries])
## Benchmark script (excerpt): times three ways of mapping UniProt entries to
## gene and taxon ids.  ``upQueries``, ``conn``, ``session``, ``uniprotEntries``
## and the first ``timeStart`` are expected to be defined earlier in the script.

## finish the select-method test: map every entry to its taxon id
upEntry2Taxa = {str(uquery['uniprot_entry']): str(uquery['taxa_id'])
                for uquery in upQueries}
print("Test 1: %s" %
      time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))

## using direct table access method
timeStart = time.time()
results = conn.execute(
    Uniprot.__table__.select(Uniprot.uniprot_entry.in_(uniprotEntries)))
upEntry2Gene, upEntry2Taxa = {}, {}
for row in results:
    entry = str(row.uniprot_entry)
    upEntry2Gene[entry] = str(row.gene_id)
    upEntry2Taxa[entry] = str(row.taxa_id)

## ``print key, item`` and ``iteritems()`` only exist in Python 2 and made the
## whole script a SyntaxError on Python 3; the formatted string below prints
## the same "key item" line on both major versions.
for key, item in upEntry2Gene.items():
    print("%s %s" % (key, item))
print("Test 2: %s" %
      time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))

## using htsint's mapper
timeStart = time.time()
uMapper = uniprot_mapper(session,
                         uniprotIdList=uniprotEntries,
                         gene=True,
                         taxa=True)
print("Test 3: %s" %
      time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))
print(len(uMapper))
print(uMapper[uniprotEntries[0]])
Example #2
0
                  "ACTN4_RAT","ACTN4_MOUSE","ACTN1_CHICK","ACTN_DROME","ACTN1_RAT"
                  "ACTN1_MOUSE","ACTN3_MOUSE","SPTCB_DROME","ACTN2_MOUSE","ACTN2_CHICK"]

## Benchmark script (excerpt): times three ways of mapping UniProt entries to
## gene and taxon ids.  ``conn``, ``session`` and ``uniprotEntries`` are
## expected to be defined earlier in the script.
##
## NOTE(review): the uniprotEntries literal that ends just above this block has
## no comma after "ACTN1_RAT", so Python joins it with "ACTN1_MOUSE" into the
## single id "ACTN1_RATACTN1_MOUSE" -- add the comma so both entries are queried.

## using select method
timeStart = time.time()
s = select([Uniprot.uniprot_entry, Uniprot.taxa_id, Uniprot.gene_id]).where(
    Uniprot.uniprot_entry.in_(uniprotEntries))
_upQueries = conn.execute(s)
upQueries = _upQueries.fetchall()
upEntry2Gene = {str(uquery['uniprot_entry']): str(uquery['gene_id'])
                for uquery in upQueries}
upEntry2Taxa = {str(uquery['uniprot_entry']): str(uquery['taxa_id'])
                for uquery in upQueries}
print("Test 1: %s" %
      time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))

## using direct table access method
timeStart = time.time()
results = conn.execute(
    Uniprot.__table__.select(Uniprot.uniprot_entry.in_(uniprotEntries)))
upEntry2Gene, upEntry2Taxa = {}, {}
for row in results:
    entry = str(row.uniprot_entry)
    upEntry2Gene[entry] = str(row.gene_id)
    upEntry2Taxa[entry] = str(row.taxa_id)

## ``print key, item`` and ``iteritems()`` only exist in Python 2 and made the
## whole script a SyntaxError on Python 3; the formatted string below prints
## the same "key item" line on both major versions.
for key, item in upEntry2Gene.items():
    print("%s %s" % (key, item))
print("Test 2: %s" %
      time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))

## using htsint's mapper
timeStart = time.time()
uMapper = uniprot_mapper(session,
                         uniprotIdList=uniprotEntries,
                         gene=True,
                         taxa=True)
print("Test 3: %s" %
      time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))
print(len(uMapper))
print(uMapper[uniprotEntries[0]])
Example #3
0
    def create_summarized(self,
                          parsedFilePath,
                          summaryFilePath=None,
                          large=False,
                          uniprot=False,
                          species=None,
                          taxaList=None,
                          hit2gene=None):
        """
        Summarize a parsed BLAST csv file into a smaller csv file.

        htsint uses output 5 (XML) and then parses it into a simple csv file.
        That parsed file is read twice: once to collect the unique hit ids
        (so the database can be queried in one batch) and once to write one
        summary row per hit with the columns
        queryId, hitId, hitNcbiId, hitSpecies, hitSpeciesNcbiId, e-value.

        parsedFilePath - parsed csv file with a header row; columns 0, 2 and 3
                         are read as query, long hit id and e-value
        summaryFilePath - output path; if None a trailing '.csv' of
                          parsedFilePath is replaced by '_summary.csv'
        large - use True if the parsed file has more than a few hundred hits
                (uniprot_mapper is used instead of a single IN query)
        uniprot - use True if the target database used is a Uniprot database
        species - if None htsint will try to find the species otherwise the scientific name is given
        taxaList - assign when BLAST is against a db other than uniprot
        hit2gene - a dictionary of keys that match the hits with values that match ncbi gene ids

        Returns the path of the summary file.
        Raises Exception when the parsed file does not exist, or when uniprot
        is False and no taxaList was given.
        """

        ## error checking
        if not os.path.exists(parsedFilePath):
            raise Exception("cannot find parsed file")

        ## work on a private copy so a caller's list is never shared or changed
        taxaList = [] if taxaList is None else list(taxaList)
        if not uniprot and not taxaList:
            raise Exception(
                "databases other than Uniprot must have a taxaList")

        if summaryFilePath is None:
            ## anchored so only the file extension is stripped, not a '.csv'
            ## that happens to occur earlier in the path
            summaryFilePath = re.sub(r"\.csv$", "",
                                     parsedFilePath) + "_summary.csv"

        ## compiled once, they are applied to every row of the second pass
        uniprotSpeciesPattern = re.compile(r"OS=.+[A-Z]=")
        bracketSpeciesPattern = re.compile(r"\[.+\]")
        fieldTagPattern = re.compile(r"[A-Z]=")

        def shortHitId(hitIdLong):
            """Last non-empty '|' field of the second space separated token."""
            fields = hitIdLong.split(" ")[1].split("|")
            return fields[-2] if fields[-1] == '' else fields[-1]

        def guessSpecies(hitIdLong):
            """Species parsed from 'OS=... XX=' or '[...]', None if absent."""
            found = uniprotSpeciesPattern.search(hitIdLong)
            if found:
                ## drop the leading 'OS=' and the trailing ' XX=' field tag
                return found.group(0)[3:-4]
            found = bracketSpeciesPattern.search(hitIdLong)
            if found:
                return found.group(0)[1:-1]
            return None

        timeStart = time.time()

        ## read through the blast file and extract a unique list of ids
        ## (newline='' is what the csv module asks for on Python 3)
        with open(parsedFilePath, 'r', newline='') as fidin:
            reader = csv.reader(fidin)
            next(reader, None)  # skip the header, tolerate an empty file
            hitEntries = list({shortHitId(linja[2]) for linja in reader})

        upEntry2Gene, upEntry2Taxa = {}, {}
        taxaId2Ncbi, gene2id = {}, {}
        if uniprot:
            print(
                "batch querying %s UniProt entries in the database... this may take some time"
                % (len(hitEntries)))
            if not large:
                results = self.conn.execute(
                    Uniprot.__table__.select(
                        Uniprot.uniprot_entry.in_(hitEntries)))
                for row in results:
                    entry = str(row.uniprot_entry)
                    upEntry2Gene[entry] = str(row.gene_id)
                    upEntry2Taxa[entry] = str(row.taxa_id)
            else:
                ## using htsint's mapper
                uMapper = uniprot_mapper(self.session,
                                         uniprotIdList=hitEntries,
                                         gene=True,
                                         taxa=True)
                for key, item in uMapper.items():
                    upEntry2Gene[str(key)] = str(item['gene_id'])
                    upEntry2Taxa[str(key)] = str(item['taxa_id'])

            ## query the taxa just to double check; a missing taxa id was
            ## stringified to 'None' above and must not reach int()
            taxaList = list(set(upEntry2Taxa.values()) - {'None'})
            s = select([Taxon.id, Taxon.ncbi_id, Taxon.name
                        ]).where(Taxon.id.in_([int(tid) for tid in taxaList]))
            taxaId2Ncbi = {str(tquery['id']): str(tquery['ncbi_id'])
                           for tquery in self.conn.execute(s).fetchall()}

            ## create a single dictionary of all gene information; it is only
            ## consulted for uniprot hits, so other databases skip the queries
            for taxaDbId in taxaList:
                s = select([Gene.id, Gene.ncbi_id], Gene.taxa_id == taxaDbId)
                for r in self.conn.execute(s).fetchall():
                    gene2id[str(r['id'])] = str(r['ncbi_id'])

        ## read through the file again and write one summary row per hit
        with open(parsedFilePath, 'r', newline='') as fidin, \
                open(summaryFilePath, 'w', newline='') as fidout:
            reader = csv.reader(fidin)
            next(reader, None)
            writer = csv.writer(fidout)
            writer.writerow([
                "queryId", "hitId", "hitNcbiId", "hitSpecies",
                "hitSpeciesNcbiId", "e-value"
            ])

            for linja in reader:
                queryId = linja[0].split(" ")[0]
                hitIdLong = linja[2]
                eScore = linja[3]
                hitId = shortHitId(hitIdLong)
                hitNcbiId, hitSpeciesNcbiId = '-', '-'

                ## extract species id
                hitSpecies = species if species else guessSpecies(hitIdLong)
                if hitSpecies is None:
                    print("WARNING: cannot find hitSpecies\n%s" % hitIdLong)
                    hitSpecies = '-'

                ## clean species name: cut at the first remaining 'XX=' tag
                tag = fieldTagPattern.search(hitSpecies)
                if tag:
                    ## max() keeps the slice end from going negative
                    hitSpecies = hitSpecies[:max(tag.start() - 2, 0)]

                ## database values were stringified, so a missing id is 'None'
                if uniprot:
                    geneDbId = upEntry2Gene.get(hitId, 'None')
                    if geneDbId != 'None' and gene2id.get(geneDbId,
                                                          'None') != 'None':
                        hitNcbiId = gene2id[geneDbId]

                ## an explicit mapping from the caller wins over the database
                if hit2gene and hitId in hit2gene:
                    hitNcbiId = hit2gene[hitId]

                ## get taxa id associated with uniprot id
                if uniprot:
                    taxaDbId = upEntry2Taxa.get(hitId, 'None')
                    if taxaDbId != 'None':
                        if taxaDbId in taxaId2Ncbi:
                            hitSpeciesNcbiId = taxaId2Ncbi[taxaDbId]
                        else:
                            print('Were in! but taxaId2Ncbi does not have',
                                  taxaDbId)
                elif len(taxaList) == 1:
                    hitSpeciesNcbiId = taxaList[0]

                writer.writerow([
                    queryId, hitId, hitNcbiId, hitSpecies, hitSpeciesNcbiId,
                    eScore
                ])

        print("blast summarize: %s" %
              time.strftime('%H:%M:%S', time.gmtime(time.time() - timeStart)))

        return summaryFilePath
Example #4
0
    def create_summarized(self,parsedFilePath,summaryFilePath=None,large=False,uniprot=False,species=None,
                          taxaList=None,hit2gene=None):
        """
        Summarize a parsed BLAST csv file into a smaller csv file.

        htsint uses output 5 (XML) and then parses it into a simple csv file.
        That parsed file is read twice: once to collect the unique hit ids
        (so the database can be queried in one batch) and once to write one
        summary row per hit with the columns
        queryId, hitId, hitNcbiId, hitSpecies, hitSpeciesNcbiId, e-value.

        parsedFilePath - parsed csv file with a header row; columns 0, 2 and 3
                         are read as query, long hit id and e-value
        summaryFilePath - output path; if None a trailing '.csv' of
                          parsedFilePath is replaced by '_summary.csv'
        large - use True if the parsed file has more than a few hundred hits
                (uniprot_mapper is used instead of a single IN query)
        uniprot - use True if the target database used is a Uniprot database
        species - if None htsint will try to find the species otherwise the scientific name is given
        taxaList - assign when BLAST is against a db other than uniprot
        hit2gene - a dictionary of keys that match the hits with values that match ncbi gene ids

        Returns the path of the summary file.
        Raises Exception when the parsed file does not exist, or when uniprot
        is False and no taxaList was given.
        """

        ## error checking
        if not os.path.exists(parsedFilePath):
            raise Exception("cannot find parsed file")

        ## work on a private copy so a caller's list is never shared or changed
        taxaList = [] if taxaList is None else list(taxaList)
        if not uniprot and not taxaList:
            raise Exception("databases other than Uniprot must have a taxaList")

        if summaryFilePath is None:
            ## anchored so only the file extension is stripped, not a '.csv'
            ## that happens to occur earlier in the path
            summaryFilePath = re.sub(r"\.csv$","",parsedFilePath) + "_summary.csv"

        ## compiled once, they are applied to every row of the second pass
        uniprotSpeciesPattern = re.compile(r"OS=.+[A-Z]=")
        bracketSpeciesPattern = re.compile(r"\[.+\]")
        fieldTagPattern = re.compile(r"[A-Z]=")

        def shortHitId(hitIdLong):
            """Last non-empty '|' field of the second space separated token."""
            fields = hitIdLong.split(" ")[1].split("|")
            return fields[-2] if fields[-1] == '' else fields[-1]

        def guessSpecies(hitIdLong):
            """Species parsed from 'OS=... XX=' or '[...]', None if absent."""
            found = uniprotSpeciesPattern.search(hitIdLong)
            if found:
                ## drop the leading 'OS=' and the trailing ' XX=' field tag
                return found.group(0)[3:-4]
            found = bracketSpeciesPattern.search(hitIdLong)
            if found:
                return found.group(0)[1:-1]
            return None

        timeStart = time.time()

        ## read through the blast file and extract a unique list of ids
        ## (newline='' is what the csv module asks for on Python 3)
        with open(parsedFilePath,'r',newline='') as fidin:
            reader = csv.reader(fidin)
            next(reader, None)  # skip the header, tolerate an empty file
            hitEntries = list({shortHitId(linja[2]) for linja in reader})

        upEntry2Gene, upEntry2Taxa = {},{}
        taxaId2Ncbi, gene2id = {},{}
        if uniprot:
            print("batch querying %s UniProt entries in the database... this may take some time"%(len(hitEntries)))
            if not large:
                results = self.conn.execute(Uniprot.__table__.select(Uniprot.uniprot_entry.in_(hitEntries)))
                for row in results:
                    entry = str(row.uniprot_entry)
                    upEntry2Gene[entry] = str(row.gene_id)
                    upEntry2Taxa[entry] = str(row.taxa_id)
            else:
                ## using htsint's mapper
                uMapper = uniprot_mapper(self.session,uniprotIdList=hitEntries,gene=True,taxa=True)
                for key,item in uMapper.items():
                    upEntry2Gene[str(key)] = str(item['gene_id'])
                    upEntry2Taxa[str(key)] = str(item['taxa_id'])

            ## query the taxa just to double check; a missing taxa id was
            ## stringified to 'None' above and must not reach int()
            taxaList = list(set(upEntry2Taxa.values()) - {'None'})
            s = select([Taxon.id,Taxon.ncbi_id,Taxon.name]).where(Taxon.id.in_([int(tid) for tid in taxaList]))
            taxaId2Ncbi = {str(tquery['id']): str(tquery['ncbi_id'])
                           for tquery in self.conn.execute(s).fetchall()}

            ## create a single dictionary of all gene information; it is only
            ## consulted for uniprot hits, so other databases skip the queries
            for taxaDbId in taxaList:
                s = select([Gene.id,Gene.ncbi_id],Gene.taxa_id==taxaDbId)
                for r in self.conn.execute(s).fetchall():
                    gene2id[str(r['id'])] = str(r['ncbi_id'])

        ## read through the file again and write one summary row per hit
        with open(parsedFilePath,'r',newline='') as fidin, \
                open(summaryFilePath,'w',newline='') as fidout:
            reader = csv.reader(fidin)
            next(reader, None)
            writer = csv.writer(fidout)
            writer.writerow(["queryId","hitId","hitNcbiId","hitSpecies","hitSpeciesNcbiId","e-value"])

            for linja in reader:
                queryId = linja[0].split(" ")[0]
                hitIdLong = linja[2]
                eScore = linja[3]
                hitId = shortHitId(hitIdLong)
                hitNcbiId,hitSpeciesNcbiId = '-','-'

                ## extract species id
                hitSpecies = species if species else guessSpecies(hitIdLong)
                if hitSpecies is None:
                    print("WARNING: cannot find hitSpecies\n%s"%hitIdLong)
                    hitSpecies = '-'

                ## clean species name: cut at the first remaining 'XX=' tag
                tag = fieldTagPattern.search(hitSpecies)
                if tag:
                    ## max() keeps the slice end from going negative
                    hitSpecies = hitSpecies[:max(tag.start()-2, 0)]

                ## database values were stringified, so a missing id is 'None'
                if uniprot:
                    geneDbId = upEntry2Gene.get(hitId, 'None')
                    if geneDbId != 'None' and gene2id.get(geneDbId, 'None') != 'None':
                        hitNcbiId = gene2id[geneDbId]

                ## an explicit mapping from the caller wins over the database
                if hit2gene and hitId in hit2gene:
                    hitNcbiId = hit2gene[hitId]

                ## get taxa id associated with uniprot id
                if uniprot:
                    taxaDbId = upEntry2Taxa.get(hitId, 'None')
                    if taxaDbId != 'None':
                        if taxaDbId in taxaId2Ncbi:
                            hitSpeciesNcbiId = taxaId2Ncbi[taxaDbId]
                        else:
                            print('Were in! but taxaId2Ncbi does not have', taxaDbId)
                elif len(taxaList) == 1:
                    hitSpeciesNcbiId = taxaList[0]

                writer.writerow([queryId,hitId,hitNcbiId,hitSpecies,hitSpeciesNcbiId,eScore])

        print("blast summarize: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))

        return summaryFilePath