Ejemplo n.º 1
0
    def distributionPlots(self, options):
        """Create and save reference distribution plots for every bin.

        Reads the following attributes of `options`: `bin_dir` and
        `extension` (to locate the bin files), `output_dir` (where images are
        written), `tetra_profile` (file read through GenomicSignatures),
        `distributions` (passed through to the plotter), `image_type` (file
        extension of the saved image) and `dpi` (resolution passed to
        savePlot).

        One image named '<bin id>.ref_dist_plots.<image_type>' is written to
        `options.output_dir` for each bin file.
        """
        self.logger.info(
            '[CheckM - dist_plot] Creating GC, CD, and TD distribution plots.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        inputBins = self.binFiles(options.bin_dir, options.extension)

        # tetranucleotide (K=4) signatures are loaded once and shared by all plots
        sigReader = GenomicSignatures(K=4, threads=1)
        tetraSigs = sigReader.read(options.tetra_profile)

        plotter = DistributionPlots(options)
        for fileNum, binFile in enumerate(inputBins, 1):
            progressMsg = (
                'Plotting reference distribution plots for %s (%d of %d)' %
                (binFile, fileNum, len(inputBins)))
            self.logger.info(progressMsg)

            binId = binIdFromFilename(binFile)
            plotter.plot(binFile, tetraSigs, options.distributions)

            # image path: <output_dir>/<bin id>.ref_dist_plots.<image_type>
            plotPrefix = os.path.join(options.output_dir, binId)
            outputFile = plotPrefix + '.ref_dist_plots.' + options.image_type
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Ejemplo n.º 2
0
    def identifyOutliers(self, outDir, binFiles, tetraProfileFile,
                         distribution, reportType, outputFile):
        """Identify sequences that are outliers and report them in a table.

        For every bin, the per-sequence GC, coding density (CD) and
        tetranucleotide distance (TD) statistics are compared against bounds
        looked up in the 'gc_dist', 'cd_dist' and 'td_dist' reference
        distributions. Sequences outside those bounds are written as
        tab-separated rows to `outputFile`.

        Parameters
        ----------
        outDir : str
            Directory expected to contain
            'bins/<bin id>/<DefaultValues.PRODIGAL_GFF>' for every bin.
        binFiles : list of str
            FASTA files, one per bin.
        tetraProfileFile : str
            File read with GenomicSignatures.read().
        distribution : number
            Value (0-100) selecting which bounds of the reference
            distributions are used: the keys nearest (100 - distribution) / 2
            and (100 + distribution) / 2 for GC, (100 - distribution) / 2 for
            CD, and `distribution` itself for TD.
        reportType : str
            'any' reports sequences outlying in at least one distribution,
            'all' only sequences outlying in all three. Any other value
            reports nothing.
        outputFile : str
            Path of the table to write; an existing file is overwritten.

        Raises
        ------
        SystemExit
            With status 1 when the gene feature file of a bin is missing.
        """

        self.logger.info('Reading reference distributions.')
        gcBounds = readDistribution('gc_dist')
        cdBounds = readDistribution('cd_dist')
        tdBounds = readDistribution('td_dist')

        # The context manager closes the report even when a bin raises or the
        # method bails out through sys.exit(), so no file handle is leaked.
        with open(outputFile, 'w') as fout:
            fout.write(
                'Bin Id\tSequence Id\tSequence length\tOutlying distributions')
            fout.write(
                '\tSequence GC\tMean bin GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)'
                % (distribution, distribution))
            fout.write('\tSequence CD\tMean bin CD\tLower CD bound (%s%%)' %
                       distribution)
            fout.write('\tSequence TD\tMean bin TD\tUpper TD bound (%s%%)\n' %
                       distribution)

            for processedBins, binFile in enumerate(binFiles, 1):
                binId = binIdFromFilename(binFile)

                self.logger.info('Finding outliers in %s (%d of %d).' %
                                 (binId, processedBins, len(binFiles)))

                seqs = readFasta(binFile)

                meanGC, deltaGCs, seqGC = self.gcDist(seqs)

                # NOTE(review): the tetranucleotide profile is re-read for
                # every bin; it looks loop-invariant, but hoisting it needs
                # confirmation that the helpers below do not modify it.
                genomicSig = GenomicSignatures(K=4, threads=1)
                tetraSigs = genomicSig.read(tetraProfileFile)
                binSig = self.binTetraSig(seqs, tetraSigs)
                meanTD, deltaTDs = self.tetraDiffDist(seqs, genomicSig,
                                                      tetraSigs, binSig)

                gffFile = os.path.join(outDir, 'bins', binId,
                                       DefaultValues.PRODIGAL_GFF)
                if not os.path.exists(gffFile):
                    self.logger.error(
                        'Missing gene feature file (%s). This plot if not compatible with the --genes option.\n'
                        % DefaultValues.PRODIGAL_GFF)
                    sys.exit(1)

                prodigalParser = ProdigalGeneFeatureParser(gffFile)
                meanCD, deltaCDs, CDs = self.codingDensityDist(
                    seqs, prodigalParser)

                # find keys into GC and CD distributions; the candidate
                # sequence lengths are fixed for the whole bin, so they are
                # listed once here rather than once per sequence
                closestGC = findNearest(np.array(list(gcBounds.keys())),
                                        meanGC)
                gcSeqLens = list(gcBounds[closestGC].keys())
                gcKeys = list(gcBounds[closestGC][gcSeqLens[0]].keys())
                gcLowerBoundKey = findNearest(gcKeys,
                                              (100 - distribution) / 2.0)
                gcUpperBoundKey = findNearest(gcKeys,
                                              (100 + distribution) / 2.0)

                closestCD = findNearest(np.array(list(cdBounds.keys())),
                                        meanCD)
                cdSeqLens = list(cdBounds[closestCD].keys())
                cdKeys = list(cdBounds[closestCD][cdSeqLens[0]].keys())
                cdLowerBoundKey = findNearest(cdKeys,
                                              (100 - distribution) / 2.0)

                tdSeqLens = list(tdBounds.keys())
                tdBoundKey = findNearest(
                    list(tdBounds[tdSeqLens[0]].keys()), distribution)

                # the per-sequence statistics are positional, in the
                # iteration order of seqs
                for index, (seqId, seq) in enumerate(seqs.items()):
                    seqLen = len(seq)

                    # find GC, CD, and TD bounds for the nearest tabulated
                    # sequence length
                    gcForLen = gcBounds[closestGC][findNearest(gcSeqLens,
                                                               seqLen)]
                    gcLowerBound = gcForLen[gcLowerBoundKey]
                    gcUpperBound = gcForLen[gcUpperBoundKey]

                    cdForLen = cdBounds[closestCD][findNearest(cdSeqLens,
                                                               seqLen)]
                    cdLowerBound = cdForLen[cdLowerBoundKey]

                    tdBound = tdBounds[findNearest(tdSeqLens,
                                                   seqLen)][tdBoundKey]

                    outlyingDists = []
                    if (deltaGCs[index] < gcLowerBound
                            or deltaGCs[index] > gcUpperBound):
                        outlyingDists.append('GC')

                    if deltaCDs[index] < cdLowerBound:
                        outlyingDists.append('CD')

                    if deltaTDs[index] > tdBound:
                        outlyingDists.append('TD')

                    numOutlying = len(outlyingDists)
                    if (reportType == 'any' and numOutlying >= 1) or (
                            reportType == 'all' and numOutlying == 3):
                        # GC and CD bounds are offsets from the bin mean, so
                        # the mean is added back before reporting them
                        fout.write(binId + '\t' + seqId + '\t%d' % seqLen +
                                   '\t' + ','.join(outlyingDists))
                        fout.write('\t%.1f\t%.1f\t%.1f\t%.1f' %
                                   (seqGC[index] * 100, meanGC * 100,
                                    (meanGC + gcLowerBound) * 100,
                                    (meanGC + gcUpperBound) * 100))
                        fout.write('\t%.1f\t%.1f\t%.1f' %
                                   (CDs[index] * 100, meanCD * 100,
                                    (meanCD + cdLowerBound) * 100))
                        fout.write('\t%.3f\t%.3f\t%.3f' %
                                   (deltaTDs[index], meanTD, tdBound) + '\n')
Ejemplo n.º 3
0
    def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, reportType, outputFile):
        """Identify sequences that are outliers and report them in a table.

        For every bin, the per-sequence GC, coding density (CD) and
        tetranucleotide distance (TD) statistics are compared against bounds
        looked up in the 'gc_dist', 'cd_dist' and 'td_dist' reference
        distributions. Sequences outside those bounds are written as
        tab-separated rows to `outputFile`.

        Parameters
        ----------
        outDir : str
            Directory expected to contain
            'bins/<bin id>/<DefaultValues.PRODIGAL_GFF>' for every bin.
        binFiles : list of str
            FASTA files, one per bin.
        tetraProfileFile : str
            File read with GenomicSignatures.read().
        distribution : number
            Value (0-100) selecting which bounds of the reference
            distributions are used: the keys nearest (100 - distribution) / 2
            and (100 + distribution) / 2 for GC, (100 - distribution) / 2 for
            CD, and `distribution` itself for TD.
        reportType : str
            'any' reports sequences outlying in at least one distribution,
            'all' only sequences outlying in all three. Any other value
            reports nothing.
        outputFile : str
            Path of the table to write; an existing file is overwritten.

        Raises
        ------
        SystemExit
            With status 1 when the gene feature file of a bin is missing.
        """

        self.logger.info('  Reading reference distributions.')
        gcBounds = readDistribution('gc_dist')
        cdBounds = readDistribution('cd_dist')
        tdBounds = readDistribution('td_dist')

        # The context manager closes the report even when a bin raises or the
        # method bails out through sys.exit(), so no file handle is leaked.
        with open(outputFile, 'w') as fout:
            fout.write('Bin Id\tSequence Id\tSequence length\tOutlying distributions')
            fout.write('\tSequence GC\tMean bin GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (distribution, distribution))
            fout.write('\tSequence CD\tMean bin CD\tLower CD bound (%s%%)' % distribution)
            fout.write('\tSequence TD\tMean bin TD\tUpper TD bound (%s%%)\n' % distribution)

            self.logger.info('')
            for processedBins, binFile in enumerate(binFiles, 1):
                binId = binIdFromFilename(binFile)

                self.logger.info('  Finding outliers in %s (%d of %d).' % (binId, processedBins, len(binFiles)))

                seqs = readFasta(binFile)

                meanGC, deltaGCs, seqGC = self.gcDist(seqs)

                # NOTE(review): the tetranucleotide profile is re-read for
                # every bin; it looks loop-invariant, but hoisting it needs
                # confirmation that the helpers below do not modify it.
                genomicSig = GenomicSignatures(K=4, threads=1)
                tetraSigs = genomicSig.read(tetraProfileFile)
                binSig = self.binTetraSig(seqs, tetraSigs)
                meanTD, deltaTDs = self.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

                gffFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_GFF)
                if not os.path.exists(gffFile):
                    self.logger.error('  [Error] Missing gene feature file (%s). This plot if not compatible with the --genes option.\n' % DefaultValues.PRODIGAL_GFF)
                    sys.exit(1)

                prodigalParser = ProdigalGeneFeatureParser(gffFile)
                meanCD, deltaCDs, CDs = self.codingDensityDist(seqs, prodigalParser)

                # find keys into GC and CD distributions; keys are copied into
                # lists because Python 3 dict views cannot be indexed and are
                # not turned into 1-d arrays by np.array(). The candidate
                # sequence lengths are fixed for the whole bin, so they are
                # listed once here rather than once per sequence.
                closestGC = findNearest(np.array(list(gcBounds.keys())), meanGC)
                gcSeqLens = list(gcBounds[closestGC].keys())
                gcKeys = list(gcBounds[closestGC][gcSeqLens[0]].keys())
                gcLowerBoundKey = findNearest(gcKeys, (100 - distribution) / 2.0)
                gcUpperBoundKey = findNearest(gcKeys, (100 + distribution) / 2.0)

                closestCD = findNearest(np.array(list(cdBounds.keys())), meanCD)
                cdSeqLens = list(cdBounds[closestCD].keys())
                cdKeys = list(cdBounds[closestCD][cdSeqLens[0]].keys())
                cdLowerBoundKey = findNearest(cdKeys, (100 - distribution) / 2.0)

                tdSeqLens = list(tdBounds.keys())
                tdBoundKey = findNearest(list(tdBounds[tdSeqLens[0]].keys()), distribution)

                # the per-sequence statistics are positional, in the iteration
                # order of seqs; items() replaces the Python-2-only iteritems()
                for index, (seqId, seq) in enumerate(seqs.items()):
                    seqLen = len(seq)

                    # find GC, CD, and TD bounds for the nearest tabulated
                    # sequence length
                    gcForLen = gcBounds[closestGC][findNearest(gcSeqLens, seqLen)]
                    gcLowerBound = gcForLen[gcLowerBoundKey]
                    gcUpperBound = gcForLen[gcUpperBoundKey]

                    cdForLen = cdBounds[closestCD][findNearest(cdSeqLens, seqLen)]
                    cdLowerBound = cdForLen[cdLowerBoundKey]

                    tdBound = tdBounds[findNearest(tdSeqLens, seqLen)][tdBoundKey]

                    outlyingDists = []
                    if deltaGCs[index] < gcLowerBound or deltaGCs[index] > gcUpperBound:
                        outlyingDists.append('GC')

                    if deltaCDs[index] < cdLowerBound:
                        outlyingDists.append('CD')

                    if deltaTDs[index] > tdBound:
                        outlyingDists.append('TD')

                    numOutlying = len(outlyingDists)
                    if (reportType == 'any' and numOutlying >= 1) or (reportType == 'all' and numOutlying == 3):
                        # GC and CD bounds are offsets from the bin mean, so
                        # the mean is added back before reporting them
                        fout.write(binId + '\t' + seqId + '\t%d' % seqLen + '\t' + ','.join(outlyingDists))
                        fout.write('\t%.1f\t%.1f\t%.1f\t%.1f' % (seqGC[index] * 100, meanGC * 100, (meanGC + gcLowerBound) * 100, (meanGC + gcUpperBound) * 100))
                        fout.write('\t%.1f\t%.1f\t%.1f' % (CDs[index] * 100, meanCD * 100, (meanCD + cdLowerBound) * 100))
                        fout.write('\t%.3f\t%.3f\t%.3f' % (deltaTDs[index], meanTD, tdBound) + '\n')