Example #1
0
    def removeOutliers(self, binFile, outlierFile, outputFile):
        """Remove sequences specified as outliers in the provided file.

        The outlier file is read as tab-separated text whose first line is a
        header. On each following line the first column is taken as a bin
        identifier and the second column as a sequence identifier. The
        identifiers listed for the bin derived from `binFile` are handed to
        self.__removeSeqs() together with the sequences read from `binFile`,
        and the sequences are then passed to writeFasta().

        Parameters
        ----------
        binFile : str
            FASTA file of the bin to modify.
        outlierFile : str
            Tab-separated file listing outlying sequences.
        outputFile : str
            File the modified bin is written to.
        """

        binSeqs = readFasta(binFile)
        binIdToModify = binIdFromFilename(binFile)

        # get sequences to remove
        checkFileExists(outlierFile)
        seqsToRemove = []
        # the context manager closes the handle even if a line is malformed
        with open(outlierFile) as fin:
            next(fin, None)  # skip the header line (if any)

            for line in fin:
                # drop the line terminator so an identifier in the final
                # column does not carry a trailing newline
                lineSplit = line.rstrip('\r\n').split('\t')

                if lineSplit[0] == binIdToModify:
                    seqsToRemove.append(lineSplit[1])

        # remove sequences from bin
        if seqsToRemove:
            self.__removeSeqs(binSeqs, seqsToRemove)

        # save modified bin
        writeFasta(binSeqs, outputFile)
Example #2
0
 def __init__(self, binningIndex, completeness, contamination, binFile):
     """Store the quality figures of a bin and load its sequences.

     The bin identifier is obtained with binIdFromFilename() and the
     sequences with readFasta(), both applied to `binFile`.
     """
     # values supplied directly by the caller
     (self.binningIndex,
      self.completeness,
      self.contamination) = binningIndex, completeness, contamination

     # values derived from the bin file
     binId = binIdFromFilename(binFile)
     seqs = readFasta(binFile)
     self.binId = binId
     self.seqs = seqs

     self.binFile = binFile
Example #3
0
 def __init__(self, binningIndex, completeness, contamination, binFile):
     """Store the quality figures of a bin and load its sequences.

     The bin identifier is obtained with binIdFromFilename() and the
     sequences with readFasta(), both applied to `binFile`.
     """
     # values supplied directly by the caller
     (self.binningIndex,
      self.completeness,
      self.contamination) = binningIndex, completeness, contamination

     # values derived from the bin file
     binId = binIdFromFilename(binFile)
     seqs = readFasta(binFile)
     self.binId = binId
     self.seqs = seqs

     self.binFile = binFile
Example #4
0
    def __readBins(self, binFiles):
        """Map each bin identifier to the set of sequence ids in its file.

        Identifiers come from binIdFromFilename() and sequence ids from
        readFastaSeqIds(). If two files yield the same bin identifier, the
        later file replaces the earlier entry.
        """
        # a tuple is evaluated left to right, so the identifier is still
        # resolved before the file is read
        return dict(
            (binIdFromFilename(path), set(readFastaSeqIds(path)))
            for path in binFiles
        )
Example #5
0
    def __readBins(self, binFiles):
        """Map each bin identifier to the set of sequence ids in its file.

        Identifiers come from binIdFromFilename() and sequence ids from
        readFastaSeqIds(). If two files yield the same bin identifier, the
        later file replaces the earlier entry.
        """
        # a tuple is evaluated left to right, so the identifier is still
        # resolved before the file is read
        return dict(
            (binIdFromFilename(path), set(readFastaSeqIds(path)))
            for path in binFiles
        )
Example #6
0
    def unique(self, binFiles):
        """Check if sequences are assigned to multiple bins.

        Every pair of bins is compared and the sequence identifiers present
        in both are printed to standard output. A single summary line is
        printed when no pair of bins shares a sequence.

        Parameters
        ----------
        binFiles : iterable of str
            FASTA files, one per bin.
        """

        # read seq ids from all bins; each set is built once here instead of
        # once per pairwise comparison
        binSeqs = {}
        for f in binFiles:
            binId = binIdFromFilename(f)
            binSeqs[binId] = set(readFastaSeqIds(f))

        # check for sequences assigned to multiple bins; bins and shared
        # sequences are reported in sorted order so the output is reproducible
        bDuplicates = False
        binIds = sorted(binSeqs)
        for i, binIdA in enumerate(binIds):
            for binIdB in binIds[i + 1:]:
                seqInter = binSeqs[binIdA] & binSeqs[binIdB]

                if seqInter:
                    bDuplicates = True
                    print('  Sequences shared between %s and %s: ' % (binIdA, binIdB))
                    for seqId in sorted(seqInter):
                        print('    ' + seqId)
                    print('')

        if not bDuplicates:
            print('  No sequences assigned to multiple bins.')
    def unique(self, binFiles):
        """Check if sequences are assigned to multiple bins.

        Header lines (those starting with '>') are scanned in every bin
        file; the sequence identifier is the text after '>' up to the first
        whitespace. A warning is printed when an identifier occurs more than
        once within a bin, and the identifiers shared by each pair of bins
        are printed. A single summary line is printed when no pair of bins
        shares a sequence.

        Parameters
        ----------
        binFiles : iterable of str
            FASTA files, one per bin; names ending in '.gz' are opened
            with gzip.open.
        """

        # read sequence IDs from all bins,
        # while checking for duplicate sequences within a bin
        binSeqs = {}
        for f in binFiles:
            binId = binIdFromFilename(f)

            openFile = gzip.open if f.endswith('.gz') else open

            seqIds = set()
            # the context manager closes the handle once the file is read,
            # or if parsing fails part-way through
            with openFile(f) as fin:
                for line in fin:
                    if line[0] == '>':
                        seqId = line[1:].split(None, 1)[0]

                        if seqId in seqIds:
                            print('  [Warning] Sequence %s found multiple times in bin %s.' % (
                                seqId, binId))
                        seqIds.add(seqId)

            binSeqs[binId] = seqIds

        # check for sequences assigned to multiple bins; the values are
        # already sets, so they are intersected directly. Bins and shared
        # sequences are reported in sorted order so the output is reproducible
        bDuplicates = False
        binIds = sorted(binSeqs)
        for i, binIdA in enumerate(binIds):
            for binIdB in binIds[i + 1:]:
                seqInter = binSeqs[binIdA] & binSeqs[binIdB]

                if seqInter:
                    bDuplicates = True
                    print('  Sequences shared between %s and %s: ' % (
                        binIdA, binIdB))
                    for seqId in sorted(seqInter):
                        print('    ' + seqId)
                    print('')

        if not bDuplicates:
            print('  No sequences assigned to multiple bins.')
Example #8
0
    def unique(self, binFiles):
        """Check if sequences are assigned to multiple bins.

        Header lines (those starting with '>') are scanned in every bin
        file; the sequence identifier is the text after '>' up to the first
        whitespace. A warning is printed when an identifier occurs more than
        once within a bin, and the identifiers shared by each pair of bins
        are printed. A single summary line is printed when no pair of bins
        shares a sequence.

        Parameters
        ----------
        binFiles : iterable of str
            FASTA files, one per bin; names ending in '.gz' are opened
            with gzip.open.
        """

        # read sequence IDs from all bins,
        # while checking for duplicate sequences within a bin
        binSeqs = {}
        for f in binFiles:
            binId = binIdFromFilename(f)

            openFile = gzip.open if f.endswith('.gz') else open

            seqIds = set()
            # the context manager closes the handle once the file is read,
            # or if parsing fails part-way through
            with openFile(f) as fin:
                for line in fin:
                    if line[0] == '>':
                        seqId = line[1:].split(None, 1)[0]

                        if seqId in seqIds:
                            print('  [Warning] Sequence %s found multiple times in bin %s.' % (seqId, binId))
                        seqIds.add(seqId)

            binSeqs[binId] = seqIds

        # check for sequences assigned to multiple bins; the values are
        # already sets, so they are intersected directly. Bins and shared
        # sequences are reported in sorted order so the output is reproducible
        bDuplicates = False
        binIds = sorted(binSeqs)
        for i, binIdA in enumerate(binIds):
            for binIdB in binIds[i + 1:]:
                seqInter = binSeqs[binIdA] & binSeqs[binIdB]

                if seqInter:
                    bDuplicates = True
                    print('  Sequences shared between %s and %s: ' % (binIdA, binIdB))
                    for seqId in sorted(seqInter):
                        print('    ' + seqId)
                    print('')

        if not bDuplicates:
            print('  No sequences assigned to multiple bins.')
Example #9
0
    def getBestCandidates(self, binFileSets, qas, minCompleteness, maxContamination):
        """Pick the best bin among overlapping bins from several binning sets.

        Bins of the first set that meet both thresholds form the initial
        candidates. Each qualifying bin of a later set is compared with the
        current candidates: when the overlap exceeds half of the bases of
        both bins, the two are treated as the same bin and the candidate is
        replaced if the new bin has the higher compContSquaredScored().
        Qualifying bins that match no candidate are appended once their
        whole set has been processed.

        Parameters
        ----------
        binFileSets : sequence of iterables of str
            Bin files, one iterable per binning set.
        qas : sequence
            Indexed like `binFileSets`; each item must provide
            completeness(binId) and contamination(binId).
        minCompleteness : number
            A bin qualifies only if its completeness is >= this value.
        maxContamination : number
            A bin qualifies only if its contamination is <= this value.

        Returns
        -------
        list
            The selected UnionBin objects; empty when `binFileSets` is empty.
        """

        # nothing to choose from
        if not binFileSets:
            return []

        def qualifyingBin(binningIndex, binFile):
            """Return a UnionBin for binFile if it meets both thresholds, else None."""
            binId = binIdFromFilename(binFile)
            qa = qas[binningIndex]

            # each statistic is looked up once and reused for the UnionBin
            completeness = qa.completeness(binId)
            if completeness >= minCompleteness:
                contamination = qa.contamination(binId)
                if contamination <= maxContamination:
                    return UnionBin(binningIndex, completeness, contamination, binFile)
            return None

        # Take the first set of bins as the best set yet
        bestCandidates = []
        for f in binFileSets[0]:
            candidate = qualifyingBin(0, f)
            if candidate is not None:
                bestCandidates.append(candidate)

        # For each bin in the second or after set,
        for binningIndex, binFileSet in enumerate(binFileSets):
            if binningIndex == 0:
                continue

            currentRoundCandidatesToAdd = []
            for binFile in binFileSet:
                current = qualifyingBin(binningIndex, binFile)
                if current is None:
                    continue

                # Is it >50% (by sequence) aligned with any of the bins in the best set?
                fiftyPercent = 0.5 * current.numBases()
                accountedFor = False
                for i, bestBin in enumerate(bestCandidates):
                    overlap = current.numBasesOverlapping(bestBin)
                    fiftyPercentBest = 0.5 * bestBin.numBases()

                    overHalfOfCurrent = overlap > fiftyPercent
                    overHalfOfBest = overlap > fiftyPercentBest
                    if not (overHalfOfCurrent or overHalfOfBest):
                        # Bins don't overlap, continue to go through the loop again
                        continue

                    self.logger.debug(
                        "Comparing best bin %s and current bin %s, overlap is %i"
                        % (bestBin.binId, current.binId, overlap)
                    )

                    if overHalfOfCurrent and overHalfOfBest:
                        accountedFor = True
                        # Choose the best one
                        if current.compContSquaredScored() > bestBin.compContSquaredScored():
                            self.logger.debug("The newly found bin is better, going with that")
                            # Found a better one, replace the best bin with that
                            bestCandidates[i] = current
                            # Known limitation (rare and hard to fix): if multiple bins
                            # share the same contig, a bin can overlap > 50% with more
                            # than one candidate, so breaking out here may not replace
                            # the 'optimal' one. Whether a 1:1 swap is even the right
                            # model in that case is an open question.
                            break
                    else:
                        # only one of the two bins is mostly covered by the overlap
                        self.logger.warning(
                            "Bins %s and %s with sizes %i and %i overlap by %i bases and so have unusual overlap ratios, proceeding as if they are distinct bins"
                            % (bestBin.binId, current.binId, bestBin.numBases(), current.numBases(), overlap)
                        )

                if not accountedFor:
                    currentRoundCandidatesToAdd.append(current)

            # Add all the bins that hit no other bins to the bestCandidates list
            # Do this after so that bins are not compared to themselves (saves some time?)
            for b in currentRoundCandidatesToAdd:
                self.logger.debug("Adding unmatched bin %s from %s" % (b.binId, b.binningIndex))
                bestCandidates.append(b)

        return bestCandidates
Example #10
0
    def getBestCandidates(self, binFileSets, qas, minCompleteness,
                          maxContamination):
        """Pick the best bin among overlapping bins from several binning sets.

        Bins of the first set that meet both thresholds form the initial
        candidates. Each qualifying bin of a later set is compared with the
        current candidates: when the overlap exceeds half of the bases of
        both bins, the two are treated as the same bin and the candidate is
        replaced if the new bin has the higher compContSquaredScored().
        Qualifying bins that match no candidate are appended once their
        whole set has been processed.

        Parameters
        ----------
        binFileSets : sequence of iterables of str
            Bin files, one iterable per binning set.
        qas : sequence
            Indexed like `binFileSets`; each item must provide
            completeness(binId) and contamination(binId).
        minCompleteness : number
            A bin qualifies only if its completeness is >= this value.
        maxContamination : number
            A bin qualifies only if its contamination is <= this value.

        Returns
        -------
        list
            The selected UnionBin objects; empty when `binFileSets` is empty.
        """

        # nothing to choose from
        if not binFileSets:
            return []

        def qualifyingBin(binningIndex, binFile):
            """Return a UnionBin for binFile if it meets both thresholds, else None."""
            binId = binIdFromFilename(binFile)
            qa = qas[binningIndex]

            # each statistic is looked up once and reused for the UnionBin
            completeness = qa.completeness(binId)
            if completeness >= minCompleteness:
                contamination = qa.contamination(binId)
                if contamination <= maxContamination:
                    return UnionBin(binningIndex, completeness,
                                    contamination, binFile)
            return None

        # Take the first set of bins as the best set yet
        bestCandidates = []
        for f in binFileSets[0]:
            candidate = qualifyingBin(0, f)
            if candidate is not None:
                bestCandidates.append(candidate)

        # For each bin in the second or after set,
        for binningIndex, binFileSet in enumerate(binFileSets):
            if binningIndex == 0:
                continue

            currentRoundCandidatesToAdd = []
            for binFile in binFileSet:
                current = qualifyingBin(binningIndex, binFile)
                if current is None:
                    continue

                # Is it >50% (by sequence) aligned with any of the bins in the best set?
                fiftyPercent = 0.5 * current.numBases()
                accountedFor = False
                for i, bestBin in enumerate(bestCandidates):
                    overlap = current.numBasesOverlapping(bestBin)
                    fiftyPercentBest = 0.5 * bestBin.numBases()

                    overHalfOfCurrent = overlap > fiftyPercent
                    overHalfOfBest = overlap > fiftyPercentBest
                    if not (overHalfOfCurrent or overHalfOfBest):
                        # Bins don't overlap, continue to go through the loop again
                        continue

                    self.logger.debug(
                        "Comparing best bin %s and current bin %s, overlap is %i"
                        % (bestBin.binId, current.binId, overlap))

                    if overHalfOfCurrent and overHalfOfBest:
                        accountedFor = True
                        # Choose the best one
                        if (current.compContSquaredScored() >
                                bestBin.compContSquaredScored()):
                            self.logger.debug(
                                "The newly found bin is better, going with that"
                            )
                            # Found a better one, replace the best bin with that
                            bestCandidates[i] = current
                            # Known limitation (rare and hard to fix): if multiple bins
                            # share the same contig, a bin can overlap > 50% with more
                            # than one candidate, so breaking out here may not replace
                            # the 'optimal' one. Whether a 1:1 swap is even the right
                            # model in that case is an open question.
                            break
                    else:
                        # only one of the two bins is mostly covered by the overlap
                        self.logger.warning(
                            "Bins %s and %s with sizes %i and %i overlap by %i bases and so have unusual overlap ratios, proceeding as if they are distinct bins"
                            % (bestBin.binId, current.binId,
                               bestBin.numBases(), current.numBases(),
                               overlap))

                if not accountedFor:
                    currentRoundCandidatesToAdd.append(current)

            # Add all the bins that hit no other bins to the bestCandidates list
            # Do this after so that bins are not compared to themselves (saves some time?)
            for b in currentRoundCandidatesToAdd:
                self.logger.debug("Adding unmatched bin %s from %s" %
                                  (b.binId, b.binningIndex))
                bestCandidates.append(b)

        return bestCandidates
Example #11
0
    def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, reportType, outputFile):
        """Identify sequences that are outliers.

        For every bin the per-sequence deviations in GC, coding density (CD)
        and tetranucleotide signature (TD) are compared with bounds looked up
        in the 'gc_dist', 'cd_dist' and 'td_dist' reference distributions.
        A sequence is flagged for GC when its deviation lies outside the
        lower/upper bound, for CD when it lies below the lower bound, and for
        TD when it lies above the bound. Flagged sequences are written as
        tab-separated rows to `outputFile`.

        The process exits with status 1 if the gene feature file of a bin is
        missing.

        Parameters
        ----------
        outDir : str
            Directory holding 'bins/<binId>/<DefaultValues.PRODIGAL_GFF>'.
        binFiles : sequence of str
            FASTA files, one per bin.
        tetraProfileFile : str
            File passed to GenomicSignatures.read().
        distribution : number
            Percentage used to select the bounds of the reference
            distributions.
        reportType : str
            'any' reports sequences outlying in at least one distribution;
            'all' reports only sequences outlying in all three. Any other
            value reports nothing.
        outputFile : str
            File the report is written to.
        """

        self.logger.info('  Reading reference distributions.')
        gcBounds = readDistribution('gc_dist')
        cdBounds = readDistribution('cd_dist')
        tdBounds = readDistribution('td_dist')

        # the context manager closes the report on every exit path,
        # including the sys.exit() raised for a missing gene feature file
        with open(outputFile, 'w') as fout:
            fout.write('Bin Id\tSequence Id\tSequence length\tOutlying distributions')
            fout.write('\tSequence GC\tMean bin GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (distribution, distribution))
            fout.write('\tSequence CD\tMean bin CD\tLower CD bound (%s%%)' % distribution)
            fout.write('\tSequence TD\tMean bin TD\tUpper TD bound (%s%%)\n' % distribution)

            # percentiles looked up for the lower and upper bounds
            lowerPercentile = (100 - distribution) / 2.0
            upperPercentile = (100 + distribution) / 2.0

            self.logger.info('')
            for processedBins, binFile in enumerate(binFiles, 1):
                binId = binIdFromFilename(binFile)

                self.logger.info('  Finding outliers in %s (%d of %d).' % (binId, processedBins, len(binFiles)))

                seqs = readFasta(binFile)

                meanGC, deltaGCs, seqGC = self.gcDist(seqs)

                # NOTE(review): the tetranucleotide profile is re-read for
                # every bin; it looks loop-invariant, but confirm that
                # binTetraSig()/tetraDiffDist() do not modify it before hoisting
                genomicSig = GenomicSignatures(K=4, threads=1)
                tetraSigs = genomicSig.read(tetraProfileFile)
                binSig = self.binTetraSig(seqs, tetraSigs)
                meanTD, deltaTDs = self.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

                gffFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_GFF)
                if not os.path.exists(gffFile):
                    self.logger.error('  [Error] Missing gene feature file (%s). This plot is not compatible with the --genes option.\n' % DefaultValues.PRODIGAL_GFF)
                    sys.exit(1)

                prodigalParser = ProdigalGeneFeatureParser(gffFile)
                meanCD, deltaCDs, CDs = self.codingDensityDist(seqs, prodigalParser)

                # find keys into GC and CD distributions; the key lists are
                # built once per bin rather than once per sequence
                gcBySeqLen = gcBounds[findNearest(np.array(list(gcBounds)), meanGC)]
                gcSeqLens = list(gcBySeqLen)
                gcPercentiles = list(gcBySeqLen[gcSeqLens[0]])
                gcLowerBoundKey = findNearest(gcPercentiles, lowerPercentile)
                gcUpperBoundKey = findNearest(gcPercentiles, upperPercentile)

                cdBySeqLen = cdBounds[findNearest(np.array(list(cdBounds)), meanCD)]
                cdSeqLens = list(cdBySeqLen)
                cdPercentiles = list(cdBySeqLen[cdSeqLens[0]])
                cdLowerBoundKey = findNearest(cdPercentiles, lowerPercentile)

                tdSeqLens = list(tdBounds)
                tdBoundKey = findNearest(list(tdBounds[tdSeqLens[0]]), distribution)

                # the delta/value lists are indexed in the iteration order
                # of seqs
                for index, (seqId, seq) in enumerate(seqs.items()):
                    seqLen = len(seq)

                    # find GC, CD, and TD bounds
                    gcForSeqLen = gcBySeqLen[findNearest(gcSeqLens, seqLen)]
                    gcLowerBound = gcForSeqLen[gcLowerBoundKey]
                    gcUpperBound = gcForSeqLen[gcUpperBoundKey]

                    cdLowerBound = cdBySeqLen[findNearest(cdSeqLens, seqLen)][cdLowerBoundKey]

                    tdBound = tdBounds[findNearest(tdSeqLens, seqLen)][tdBoundKey]

                    outlyingDists = []
                    if deltaGCs[index] < gcLowerBound or deltaGCs[index] > gcUpperBound:
                        outlyingDists.append('GC')

                    if deltaCDs[index] < cdLowerBound:
                        outlyingDists.append('CD')

                    if deltaTDs[index] > tdBound:
                        outlyingDists.append('TD')

                    if (reportType == 'any' and len(outlyingDists) >= 1) or (reportType == 'all' and len(outlyingDists) == 3):
                        fout.write(binId + '\t' + seqId + '\t%d' % len(seq) + '\t' + ','.join(outlyingDists))
                        fout.write('\t%.1f\t%.1f\t%.1f\t%.1f' % (seqGC[index] * 100, meanGC * 100, (meanGC + gcLowerBound) * 100, (meanGC + gcUpperBound) * 100))
                        fout.write('\t%.1f\t%.1f\t%.1f' % (CDs[index] * 100, meanCD * 100, (meanCD + cdLowerBound) * 100))
                        fout.write('\t%.3f\t%.3f\t%.3f' % (deltaTDs[index], meanTD, tdBound) + '\n')