예제 #1
0
    def __readBins(self, binFiles):
        """Map each bin id to the set of sequence ids read from its file.

        For every entry of binFiles the bin id is obtained with
        binIdFromFilename and the sequence ids with readFastaSeqIds. If two
        files yield the same bin id, the later file replaces the earlier one.
        """
        # each tuple is evaluated left to right, so the bin id is still
        # resolved before the file's sequence ids are read
        return dict(
            (binIdFromFilename(path), set(readFastaSeqIds(path)))
            for path in binFiles
        )
예제 #2
0
    def __readBins(self, binFiles):
        """Collect the sequence ids of every bin file, keyed by bin id.

        Returns a dict whose keys come from binIdFromFilename and whose values
        are sets built from readFastaSeqIds for the same file.
        """
        seqIdsByBin = {}
        for fastaPath in binFiles:
            label = binIdFromFilename(fastaPath)
            members = set(readFastaSeqIds(fastaPath))
            seqIdsByBin[label] = members

        return seqIdsByBin
예제 #3
0
    def unique(self, binFiles):
        """Check if sequences are assigned to multiple bins.

        For every pair of bins that share at least one sequence id, prints the
        two bin ids followed by the shared sequence ids. Prints a single
        confirmation line when no pair of bins shares a sequence.

        binFiles -- iterable of bin file paths; each path is passed to
                    binIdFromFilename and readFastaSeqIds.

        Returns None; all results are written to standard output.
        """

        # read seq ids from all bins; convert to a set once per bin instead of
        # once per pairwise comparison
        binSeqs = {}
        for f in binFiles:
            binId = binIdFromFilename(f)
            binSeqs[binId] = set(readFastaSeqIds(f))

        # check for sequences assigned to multiple bins
        # (list(...) keeps the ids indexable on both Python 2 and Python 3)
        bDuplicates = False
        binIds = list(binSeqs)
        for i, binIdA in enumerate(binIds):
            for binIdB in binIds[i + 1:]:
                seqInter = binSeqs[binIdA] & binSeqs[binIdB]
                if not seqInter:
                    continue

                bDuplicates = True
                # single-argument print(...) emits the same text under the
                # Python 2 print statement and the Python 3 print function
                print('  Sequences shared between %s and %s: ' % (binIdA, binIdB))
                for seqId in seqInter:
                    print('    ' + seqId)
                print('')

        if not bDuplicates:
            print('  No sequences assigned to multiple bins.')
예제 #4
0
파일: ssuFinder.py 프로젝트: ofanoyi/CheckM
    def run(self, contigFile, binFiles, outputDir, evalueThreshold, concatenateThreshold):
        """Identify putative SSU rRNA genes on sequences and report them per bin.

        Creates outputDir if needed, runs the HMM search on contigFile, reads
        the per-domain hit tables, and writes two files into outputDir:
        'ssu_summary.tsv' (one row per reported hit) and 'ssu.fna' (the
        corresponding subsequences). Sequences that do not occur in any of
        binFiles are reported under DefaultValues.UNBINNED.

        contigFile           -- sequence file passed to the HMM search and to readFasta
        binFiles             -- bin files used to map sequence ids to bin ids
        outputDir            -- directory receiving all intermediate and result files
        evalueThreshold      -- forwarded to __hmmSearch and __readHits
        concatenateThreshold -- forwarded to __addHit
        """

        def stripHitSuffix(seqId):
            # drop everything from the last '-#' onwards (no-op when absent),
            # giving the id used to look sequences up in the bins and contigs
            return seqId.rsplit('-#', 1)[0]

        # make sure output directory exists
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        # get bin id of binned contigs
        self.logger.info('  Determining bin assignment of sequences.')
        seqIdToBinId = {}
        for f in binFiles:
            binId = binIdFromFilename(f)
            for seqId in readFastaSeqIds(f):
                seqIdToBinId[seqId] = binId

        # identify 16S reads from contigs/scaffolds
        self.logger.info('  Identifying SSU rRNAs on sequences.')
        self.__hmmSearch(contigFile, evalueThreshold, os.path.join(outputDir, 'ssu'))

        # read HMM hits
        hitsPerDomain = {}
        for domain in ['archaea', 'bacteria', 'euk']:
            hits = {}

            hitFile = os.path.join(outputDir, 'ssu' + '.' + domain + '.txt')
            domainHits = self.__readHits(hitFile, domain, evalueThreshold)
            # .items() works on both Python 2 and 3; an empty table is a no-op
            for seqId, seqHits in domainHits.items():
                for hit in seqHits:
                    self.__addHit(hits, seqId, hit, concatenateThreshold)

            hitsPerDomain[domain] = hits

        # find best domain hit for each sequence
        bestHits = {}
        for hits in hitsPerDomain.values():
            for seqId, info in hits.items():
                self.__addDomainHit(bestHits, stripHitSuffix(seqId), info)

        # group hits by bin; sequences absent from every bin file are unbinned
        hitsToBins = {}
        for origSeqId, hitInfo in bestHits.items():
            binId = seqIdToBinId.get(stripHitSuffix(origSeqId), DefaultValues.UNBINNED)
            hitsToBins.setdefault(binId, []).append([origSeqId] + hitInfo)

        seqs = readFasta(contigFile)

        # write summary file and putative SSU rRNAs to file; the context
        # manager closes both handles even if a lookup or write fails
        summaryFile = os.path.join(outputDir, 'ssu_summary.tsv')
        seqFile = os.path.join(outputDir, 'ssu.fna')
        with open(summaryFile, 'w') as summaryOut, open(seqFile, 'w') as seqOut:
            summaryOut.write('Bin Id\tSeq. Id\tHMM\ti-Evalue\tStart hit\tEnd hit\t16S/18S gene length\tRev. Complement\tSequence length\n')

            for binId in sorted(hitsToBins):
                for seqInfo in hitsToBins[binId]:
                    seq = seqs[stripHitSuffix(seqInfo[0])]
                    summaryOut.write(binId + '\t' + '\t'.join(seqInfo) + '\t' + str(len(seq)) + '\n')
                    seqOut.write('>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqInfo[0] + '\n')
                    # seqInfo[3] / seqInfo[4] line up with the 'Start hit' /
                    # 'End hit' summary columns.
                    # NOTE(review): bounds are used as-is; confirm the coordinate
                    # convention (0- vs 1-based) against __readHits.
                    seqOut.write(seq[int(seqInfo[3]):int(seqInfo[4])] + '\n')

        self.logger.info('')
        self.logger.info('  Identified ' + str(len(bestHits)) + ' putative SSU genes:')
        self.logger.info('    Summary of identified hits written to: ' + summaryFile)
        self.logger.info('    SSU sequences written to: ' + seqFile)
예제 #5
0
    def run(self, contigFile, binFiles, outputDir, evalueThreshold,
            concatenateThreshold):
        """Identify putative SSU rRNA genes on sequences and report them per bin.

        Creates outputDir if needed, runs the HMM search on contigFile, reads
        the per-domain hit tables, and writes two files into outputDir:
        'ssu_summary.tsv' (one row per reported hit) and 'ssu.fna' (the
        corresponding subsequences). Sequences that do not occur in any of
        binFiles are reported under DefaultValues.UNBINNED.

        contigFile           -- sequence file passed to the HMM search and to readFasta
        binFiles             -- bin files used to map sequence ids to bin ids
        outputDir            -- directory receiving all intermediate and result files
        evalueThreshold      -- forwarded to __hmmSearch and __readHits
        concatenateThreshold -- forwarded to __addHit
        """

        def stripHitSuffix(seqId):
            # drop everything from the last '-#' onwards (no-op when absent),
            # giving the id used to look sequences up in the bins and contigs
            return seqId.rsplit('-#', 1)[0]

        # make sure output directory exists
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        # get bin id of binned contigs
        self.logger.info('Determining bin assignment of sequences.')
        seqIdToBinId = {}
        for f in binFiles:
            binId = binIdFromFilename(f)
            for seqId in readFastaSeqIds(f):
                seqIdToBinId[seqId] = binId

        # identify 16S reads from contigs/scaffolds
        self.logger.info('Identifying SSU rRNAs on sequences.')
        self.__hmmSearch(contigFile, evalueThreshold,
                         os.path.join(outputDir, 'ssu'))

        # read HMM hits
        hitsPerDomain = {}
        for domain in ['archaea', 'bacteria', 'euk']:
            hits = {}

            hitFile = os.path.join(outputDir, 'ssu' + '.' + domain + '.txt')
            domainHits = self.__readHits(hitFile, domain, evalueThreshold)
            # iterating an empty table is a no-op, so no length guard is needed
            for seqId, seqHits in domainHits.items():
                for hit in seqHits:
                    self.__addHit(hits, seqId, hit, concatenateThreshold)

            hitsPerDomain[domain] = hits

        # find best domain hit for each sequence
        bestHits = {}
        for hits in hitsPerDomain.values():
            for seqId, info in hits.items():
                self.__addDomainHit(bestHits, stripHitSuffix(seqId), info)

        # group hits by bin; sequences absent from every bin file are unbinned
        hitsToBins = {}
        for origSeqId, hitInfo in bestHits.items():
            binId = seqIdToBinId.get(stripHitSuffix(origSeqId),
                                     DefaultValues.UNBINNED)
            hitsToBins.setdefault(binId, []).append([origSeqId] + hitInfo)

        seqs = readFasta(contigFile)

        # write summary file and putative SSU rRNAs to file; the context
        # manager closes both handles even if a lookup or write fails
        summaryFile = os.path.join(outputDir, 'ssu_summary.tsv')
        seqFile = os.path.join(outputDir, 'ssu.fna')
        with open(summaryFile, 'w') as summaryOut, \
                open(seqFile, 'w') as seqOut:
            summaryOut.write(
                'Bin Id\tSeq. Id\tHMM\ti-Evalue\tStart hit\tEnd hit\t16S/18S gene length\tRev. Complement\tSequence length\n'
            )

            for binId in sorted(hitsToBins):
                for seqInfo in hitsToBins[binId]:
                    seq = seqs[stripHitSuffix(seqInfo[0])]
                    summaryOut.write(binId + '\t' + '\t'.join(seqInfo) + '\t' +
                                     str(len(seq)) + '\n')
                    seqOut.write('>' + binId + DefaultValues.SEQ_CONCAT_CHAR +
                                 seqInfo[0] + '\n')
                    # seqInfo[3] / seqInfo[4] line up with the 'Start hit' /
                    # 'End hit' summary columns.
                    # NOTE(review): both bounds are shifted by one relative to
                    # the reported values; confirm this matches the coordinate
                    # convention produced by __readHits.
                    seqOut.write(seq[int(seqInfo[3]) + 1:int(seqInfo[4]) + 1] +
                                 '\n')

        self.logger.info('Identified ' + str(len(bestHits)) +
                         ' putative SSU genes.')
        self.logger.info('Summary of identified hits written to: ' +
                         summaryFile)
        self.logger.info('SSU sequences written to: ' + seqFile)