Ejemplo n.º 1
0
    def lengthHistogram(self, options):
        """Create a sequence length histogram for every bin.

        Each bin file found in options.bin_dir (matching options.extension)
        is plotted and saved to options.output_dir as
        <bin id>.len_hist.<image type>.
        """
        self.logger.info(
            '[CheckM - len_hist] Creating sequence length histogram.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        histogram = LengthHistogram(options)
        for position, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)
            progressMsg = 'Plotting sequence length histogram for %s (%d of %d)' % (
                binId, position, len(binFiles))
            self.logger.info(progressMsg)
            histogram.plot(binFile)

            # image name is the bin id plus a plot-specific suffix
            imageStem = os.path.join(options.output_dir, binId)
            imagePath = imageStem + '.len_hist.' + options.image_type
            histogram.savePlot(imagePath, dpi=options.dpi)
            self.logger.info('Plot written to: ' + imagePath)

        self.timeKeeper.printTimeStamp()
Ejemplo n.º 2
0
    def sequenceStats(self, outDir, binFile):
        """Calculate statistics for all sequences within a bin.

        outDir: results directory; called genes are looked up under
            <outDir>/bins/<bin id>/.
        binFile: FASTA file with the sequences of the bin.

        Returns a dict mapping each sequence id to a dict of statistics.
        Besides whatever calculateGC and calculateSeqStats record, every
        sequence with called genes gets '# ORFs' and 'Coding bases'
        entries. If the amino acid gene file is missing, both entries are
        set to 0 for every sequence.
        """

        # read scaffolds
        seqs = readFasta(binFile)

        seqStats = {}
        for seqId in seqs:
            seqStats[seqId] = {}

        self.calculateGC(seqs, seqStats)
        self.calculateSeqStats(seqs, seqStats)

        binId = binIdFromFilename(binFile)
        aaFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
        if os.path.exists(aaFile):
            aaGenes = readFasta(aaFile)
            for geneId, gene in aaGenes.items():
                # gene ids are '<sequence id>_<suffix>'; strip the last suffix
                seqId = geneId[0:geneId.rfind('_')]
                stats = seqStats[seqId]
                stats['# ORFs'] = stats.get('# ORFs', 0) + 1
                # genes are amino acid sequences, so 3 bases per residue
                stats['Coding bases'] = stats.get('Coding bases', 0) + len(gene) * 3
        else:
            # missing amino acid file likely indicates users used a pre-called gene file, so
            # just set some defaults. (The previous code referenced a gene that was never
            # read here and only touched whichever sequence id the loop above ended on.)
            for stats in seqStats.values():
                stats.setdefault('# ORFs', 0)
                stats.setdefault('Coding bases', 0)

        return seqStats
Ejemplo n.º 3
0
    def distributionPlots(self, options):
        """Create GC, CD and TD reference distribution plots for every bin.

        Tetranucleotide signatures are read once from options.tetra_profile
        and shared by all bins. Each plot is saved to options.output_dir as
        <bin id>.ref_dist_plots.<image type>.
        """
        self.logger.info(
            '[CheckM - dist_plot] Creating GC, CD, and TD distribution plots.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        signatureReader = GenomicSignatures(K=4, threads=1)
        tetraSigs = signatureReader.read(options.tetra_profile)

        distPlots = DistributionPlots(options)
        for position, binFile in enumerate(binFiles, 1):
            progressMsg = 'Plotting reference distribution plots for %s (%d of %d)' % (
                binFile, position, len(binFiles))
            self.logger.info(progressMsg)

            binId = binIdFromFilename(binFile)
            distPlots.plot(binFile, tetraSigs, options.distributions)

            imageStem = os.path.join(options.output_dir, binId)
            imagePath = imageStem + '.ref_dist_plots.' + options.image_type
            distPlots.savePlot(imagePath, dpi=options.dpi)
            self.logger.info('Plot written to: ' + imagePath)

        self.timeKeeper.printTimeStamp()
Ejemplo n.º 4
0
    def nxPlot(self, options):
        """Create an Nx-plot for every bin.

        Each plot is saved to options.output_dir as
        <bin id>.nx_plot.<image type>.
        """
        self.logger.info('[CheckM - nx_plot] Creating Nx-plots.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        plotter = NxPlot(options)
        for position, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)
            progressMsg = 'Plotting Nx-plot for %s (%d of %d)' % (
                binId, position, len(binFiles))
            self.logger.info(progressMsg)
            plotter.plot(binFile)

            imageStem = os.path.join(options.output_dir, binId)
            imagePath = imageStem + '.nx_plot.' + options.image_type
            plotter.savePlot(imagePath, dpi=options.dpi)
            self.logger.info('Plot written to: ' + imagePath)

        self.timeKeeper.printTimeStamp()
Ejemplo n.º 5
0
    def codingDensityPlot(self, options):
        """Create a coding density histogram and delta-CD plot for every bin.

        Each plot is saved to options.output_dir as
        <bin id>.coding_density_plots.<image type>.
        """
        self.logger.info(
            '[CheckM - coding_plot] Creating coding density histogram and delta-CD plot.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        cdPlots = CodingDensityPlots(options)
        for position, binFile in enumerate(binFiles, 1):
            progressMsg = 'Plotting coding density plots for %s (%d of %d)' % (
                binFile, position, len(binFiles))
            self.logger.info(progressMsg)

            cdPlots.plot(binFile, options.distributions)

            imageStem = os.path.join(options.output_dir,
                                     binIdFromFilename(binFile))
            imagePath = imageStem + '.coding_density_plots.' + options.image_type
            cdPlots.savePlot(imagePath, dpi=options.dpi)
            self.logger.info('Plot written to: ' + imagePath)

        self.timeKeeper.printTimeStamp()
Ejemplo n.º 6
0
    def coveragePcaPlot(self, options):
        """PCA plot of coverage profiles.

        Coverage values are read from options.coverage_file, one profile
        (one value per BAM entry) per sequence. A PCA is computed over all
        profiles and one plot per bin is saved to options.output_dir as
        <bin id>.cov_pca_plots.<image type>.

        Exits with status 1 if the profiles do not form a 2-D matrix
        (no sequences, or profiles of differing length) or have fewer
        than 2 dimensions.
        """
        self.logger.info(
            '[CheckM - cov_pca] Creating PCA plot of coverage profiles.')

        checkDirExists(options.bin_dir)
        checkFileExists(options.coverage_file)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        coverage = Coverage(threads=1)
        coverageStats = coverage.parseCoverage(options.coverage_file)

        # flatten bin -> sequence -> BAM -> coverage into parallel lists of
        # sequence ids and coverage profiles
        seqIds = []
        coverageProfiles = []
        for _binId, seqDict in coverageStats.items():
            for seqId, bamDict in seqDict.items():
                seqIds.append(seqId)
                # local name avoids rebinding the Coverage object above
                coverageProfiles.append([bamCoverage for _, bamCoverage in bamDict.items()])

        coverageProfiles = np.array(coverageProfiles)
        if coverageProfiles.ndim != 2:
            # an empty or ragged profile list has no second axis; report it
            # instead of failing with an IndexError on shape[1]
            self.logger.error(
                'No coverage profiles found or profiles differ in length. PCA requires a matrix of coverage profiles.'
            )
            sys.exit(1)

        if coverageProfiles.shape[1] < 2:
            self.logger.error(
                'Coverage profile is 1 dimensional. PCA requires at least 2 dimensions.'
            )
            sys.exit(1)

        self.logger.info('Computing PCA of coverage profiles.\n')
        pca = PCA()
        pc, variance = pca.pcaMatrix(coverageProfiles,
                                     fraction=1.0,
                                     bCenter=True,
                                     bScale=False)

        plots = PcaPlot(options)
        filesProcessed = 1
        for f in binFiles:
            self.logger.info(
                'Plotting PCA of coverage profiles for %s (%d of %d)' %
                (f, filesProcessed, len(binFiles)))
            filesProcessed += 1

            plots.plot(f, seqIds, pc, variance)

            binId = binIdFromFilename(f)
            outputFile = os.path.join(
                options.output_dir,
                binId) + '.cov_pca_plots.' + options.image_type
            plots.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Ejemplo n.º 7
0
    def parallelCoordPlot(self, options):
        """Create a parallel coordinate plot of GC and coverage for every bin.

        Sequence statistics are computed for all bins first, then one plot
        per bin is saved to options.output_dir as
        <bin id>.paralel_coord_plot.<image type>.
        """
        self.logger.info(
            '[CheckM - par_plot] Creating parallel coordinate plot of GC and coverage.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)
        checkFileExists(options.coverage_file)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # coverage statistics come from the user-supplied coverage file
        coverageParser = Coverage(threads=1)
        coverageStats = coverageParser.parseCoverage(options.coverage_file)

        # per-bin sequence statistics, keyed by bin id
        self.logger.info('Calculating sequence statistics for each bin.')
        statsCalculator = BinStatistics()
        seqStats = {}
        for binFile in binFiles:
            seqStats[binIdFromFilename(binFile)] = statsCalculator.sequenceStats(
                options.results_dir, binFile)

        # one plot per bin
        coordPlot = ParallelCoordPlot(options)
        for position, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)
            progressMsg = 'Plotting parallel coordinates for %s (%d of %d)' % (
                binId, position, len(binFiles))
            self.logger.info(progressMsg)

            coordPlot.plot(binId, seqStats, coverageStats)

            imageStem = os.path.join(options.output_dir, binId)
            imagePath = imageStem + '.paralel_coord_plot.' + options.image_type
            coordPlot.savePlot(imagePath, dpi=options.dpi)
            self.logger.info('Plot written to: ' + imagePath)

        self.timeKeeper.printTimeStamp()
Ejemplo n.º 8
0
    def __sortBinsByCompleteness(self, binFiles, binStatsExt):
        """Return the bin ids ordered by ascending completeness.

        Completeness is read from binStatsExt[binId]['Completeness'];
        ties are broken by bin id.
        """
        ranked = []
        for binFile in binFiles:
            binId = binIdFromFilename(binFile)
            # (completeness, id) tuples sort by completeness first, then id
            ranked.append((binStatsExt[binId]['Completeness'], binId))

        return [binId for _completeness, binId in sorted(ranked)]
Ejemplo n.º 9
0
    def __processBin(self, outDir, queueIn, queueOut):
        """Thread safe bin processing.

        Repeatedly takes a bin file from queueIn, computes its statistics
        and puts a (binId, binStats, scaffoldStats) tuple on queueOut.
        A None item on queueIn is the signal to stop.

        outDir: results directory; per-bin data lives in
            <outDir>/bins/<bin id>/, which is created if needed.
        """
        while True:
            binFile = queueIn.get(block=True, timeout=None)
            if binFile is None:
                # sentinel: no more bins to process
                break

            binStats = {}
            scaffoldStats = {}

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # read scaffolds
            scaffolds = readFasta(binFile)
            for seqId in scaffolds:
                scaffoldStats[seqId] = {}

            # calculate GC statistics
            GC, stdGC = self.calculateGC(scaffolds, scaffoldStats)
            binStats['GC'] = GC
            binStats['GC std'] = stdGC

            # calculate statistics related to scaffold lengths
            maxScaffoldLen, maxContigLen, genomeSize, scaffold_N50, contig_N50, numContigs, numAmbiguousBases = self.calculateSeqStats(scaffolds, scaffoldStats)
            binStats['Genome size'] = genomeSize
            binStats['# ambiguous bases'] = numAmbiguousBases
            binStats['# scaffolds'] = len(scaffolds)
            binStats['# contigs'] = numContigs
            binStats['Longest scaffold'] = maxScaffoldLen
            binStats['Longest contig'] = maxContigLen
            binStats['N50 (scaffolds)'] = scaffold_N50
            binStats['N50 (contigs)'] = contig_N50

            # calculate coding density statistics
            codingDensity, translationTable, numORFs = self.calculateCodingDensity(binDir, genomeSize, scaffoldStats)
            binStats['Coding density'] = codingDensity
            binStats['Translation table'] = translationTable
            binStats['# predicted genes'] = numORFs

            queueOut.put((binId, binStats, scaffoldStats))
Ejemplo n.º 10
0
    def __processBin(self, outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes, queueIn, queueOut):
        """Thread safe bin processing.

        Repeatedly takes a bin file from queueIn, obtains its amino acid
        gene file, searches it with HMMER against the marker HMMs and puts
        a (binId, hmmModelFile) tuple on queueOut. A None item on queueIn
        is the signal to stop.

        outDir: results directory; per-bin output goes to
            <outDir>/bins/<bin id>/.
        tableOut, hmmerOut: file names (relative to the bin directory) for
            the HMMER table and full output.
        markerFile: marker file handed to MarkerSetParser.createHmmModelFile.
        bKeepAlignment: when False, '--noali' is passed to HMMER.
        bNucORFs: forwarded to the Prodigal runner.
        bCalledGenes: when True, the bin file itself is used as the gene
            file (and copied into the bin directory) instead of running
            Prodigal.
        """

        markerSetParser = MarkerSetParser(self.threadsPerSearch)

        while True:
            binFile = queueIn.get(block=True, timeout=None)
            if binFile is None:
                # sentinel: no more bins to process
                break

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # run Prodigal
            if not bCalledGenes:
                prodigal = ProdigalRunner(binDir)
                # skip gene calling if ORFs were already called for this bin
                if not prodigal.areORFsCalled(bNucORFs):
                    prodigal.run(binFile, bNucORFs)
                aaGeneFile = prodigal.aaGeneFile
            else:
                # genes were supplied by the user; keep a copy under the
                # standard Prodigal file name in the bin directory
                aaGeneFile = binFile
                shutil.copyfile(aaGeneFile, os.path.join(binDir, DefaultValues.PRODIGAL_AA))

            # extract HMMs into temporary file
            hmmModelFile = markerSetParser.createHmmModelFile(binId, markerFile)

            # run HMMER
            hmmer = HMMERRunner()
            tableOutPath = os.path.join(binDir, tableOut)
            hmmerOutPath = os.path.join(binDir, hmmerOut)

            keepAlignStr = ''
            if not bKeepAlignment:
                keepAlignStr = '--noali'
            hmmer.search(hmmModelFile, aaGeneFile, tableOutPath, hmmerOutPath,
                         '--cpu ' + str(self.threadsPerSearch) + ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr,
                         bKeepAlignment)

            queueOut.put((binId, hmmModelFile))
Ejemplo n.º 11
0
    def gcBiasPlot(self, options):
        """Plot bin coverage as a function of GC for every bin.

        A coverage profile is computed once over all bins from
        options.bam_file, then one plot per bin is saved to
        options.output_dir as <bin id>.gc_bias_plot.<image type>.
        """
        self.logger.info(
            '[CheckM - gc_bias_plot] Plotting bin coverage as a function of GC.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        windowCoverage = CoverageWindows(options.threads)
        coverageProfile = windowCoverage.run(
            binFiles, options.bam_file, options.all_reads,
            options.min_align, options.max_edit_dist, options.window_size)

        gcPlots = GcBiasPlot(options)
        for position, binFile in enumerate(binFiles, 1):
            progressMsg = 'Plotting GC plots for %s (%d of %d)' % (
                binFile, position, len(binFiles))
            self.logger.info(progressMsg)

            gcPlots.plot(binFile, coverageProfile)

            imageStem = os.path.join(options.output_dir,
                                     binIdFromFilename(binFile))
            imagePath = imageStem + '.gc_bias_plot.' + options.image_type
            gcPlots.savePlot(imagePath, dpi=options.dpi)
            self.logger.info('Plot written to: ' + imagePath)

        self.timeKeeper.printTimeStamp()
Ejemplo n.º 12
0
    def tetraPcaPlot(self, options):
        """Create a PCA plot of tetranucleotide signatures for every bin.

        The PCA is computed once from options.tetra_profile, then one plot
        per bin is saved to options.output_dir as
        <bin id>.tetra_pca_plots.<image type>.
        """
        self.logger.info(
            '[CheckM - tetra_pca] Creating PCA plot of tetranucleotide signatures.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        self.logger.info('Computing PCA of tetranuclotide signatures.\n')
        seqIds, pc, variance = PCA().pcaFile(
            options.tetra_profile, fraction=1.0, bCenter=True, bScale=False)

        pcaPlots = PcaPlot(options)
        for position, binFile in enumerate(binFiles, 1):
            progressMsg = 'Plotting PCA of tetranuclotide signatures for %s (%d of %d)' % (
                binFile, position, len(binFiles))
            self.logger.info(progressMsg)

            pcaPlots.plot(binFile, seqIds, pc, variance)

            imageStem = os.path.join(options.output_dir,
                                     binIdFromFilename(binFile))
            imagePath = imageStem + '.tetra_pca_plots.' + options.image_type
            pcaPlots.savePlot(imagePath, dpi=options.dpi)
            self.logger.info('Plot written to: ' + imagePath)

        self.timeKeeper.printTimeStamp()
Ejemplo n.º 13
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
        """Draw the coding density histogram and delta-CD plot for one bin.

        fastaFile: FASTA file of the bin; its bin id locates the Prodigal
            gene feature file under <out_folder>/bins/<bin id>/.
        distributionsToPlot: values in percent; for each value the
            reference bounds keyed nearest to (100 - value) / 2 and
            (100 + value) / 2 are drawn on the delta-CD plot.
        axesHist: axes receiving the histogram of per-window coding density.
        axesDeltaCD: axes receiving the delta-CD vs. sequence length scatter.

        Exits the process with status 1 if the gene feature file is
        missing. If no sequence is longer than the window size, only an
        error label is set on axesHist and nothing else is drawn.
        """

        def styleAxes(axes):
            # shared cosmetics: primary tick marks only, coloured tick
            # lines, hidden right/top spines
            for axis in (axes.yaxis, axes.xaxis):
                for tick in axis.majorTicks:
                    tick.tick1On = True
                    tick.tick2On = False

            for axis in (axes.yaxis, axes.xaxis):
                for line in axis.get_ticklines():
                    line.set_color(self.axesColour)

            for loc, spine in axes.spines.items():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(self.axesColour)

        # parse Prodigal output
        gffFile = os.path.join(self.options.out_folder, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
        if not os.path.exists(gffFile):
            print('Missing gene feature file (%s). This plot is not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
            # non-zero status so callers can detect the failure
            sys.exit(1)

        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        # Read reference distributions from file
        dist = readDistribution('cd_dist')

        # get coding density for windows
        seqs = readFasta(fastaFile)
        windowSize = self.options.cd_window_size

        data = []
        seqLens = []
        for seqId, seq in seqs.items():
            seqLen = len(seq)
            seqLens.append(seqLen)

            start = 0
            end = windowSize
            while end < seqLen:
                a, c, g, t = baseCount(seq[start:end])
                numBases = a + c + g + t
                # a window without any counted bases has no defined coding
                # density; skip it rather than divide by zero
                if numBases > 0:
                    codingBases = prodigalParser.codingBases(seqId, start, end)
                    data.append(float(codingBases) / numBases)

                start = end
                end += windowSize

        if len(data) == 0:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % windowSize)
            return

        # Histogram plot: bin edges from 0.0 up to 1.0 in steps of cd_bin_width
        bins = [0.0]
        binWidth = self.options.cd_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% coding density')
        axesHist.set_ylabel('% windows (' + str(windowSize) + ' bp)')

        # Prettify plot
        styleAxes(axesHist)

        # get CD bin statistics
        binTools = BinTools()
        meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

        # Delta-CD vs sequence length plot
        axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap=pylab.cm.Greys)
        axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
        axesDeltaCD.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the sequences; the reference curves
        # drawn below must not change them
        _, yMaxSeqs = axesDeltaCD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            closestCD = findNearest(np.array(list(dist.keys())), meanCD)
            refDist = dist[closestCD]

            # find closest distribution values
            sampleSeqLen = next(iter(refDist))
            d = refDist[sampleSeqLen]
            cdLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
            cdUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for refLen in refDist:
                xL.append(refDist[refLen][cdLowerBoundKey])
                xU.append(refDist[refLen][cdUpperBoundKey])
                y.append(refLen)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]
            axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaCD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp
        yticks = axesDeltaCD.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.1f' % (float(seqLen) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaCD.set_yticklabels(kbpLabels)

        # Prettify plot
        styleAxes(axesDeltaCD)
Ejemplo n.º 14
0
    def run(self, contigFile, binFiles, outputDir, evalueThreshold, concatenateThreshold):
        """Identify putative SSU rRNA genes and report them per bin.

        contigFile: FASTA file of all sequences to search.
        binFiles: bin FASTA files, used to map each sequence id to a bin id;
            sequences in no bin are reported under DefaultValues.UNBINNED.
        outputDir: directory for all output; created if missing.
        evalueThreshold: passed to the HMM search and the hit reader.
        concatenateThreshold: passed to the hit merging step.

        Writes <outputDir>/ssu_summary.tsv (one row per hit) and
        <outputDir>/ssu.fna (the extracted gene sequences).
        """

        def parentSeqId(hitSeqId):
            # drop everything from the last '-#' onward, if present
            if '-#' in hitSeqId:
                return hitSeqId[0:hitSeqId.rfind('-#')]
            return hitSeqId

        # make sure output directory exists
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        # get bin id of binned contigs
        self.logger.info('  Determining bin assignment of sequences.')
        seqIdToBinId = {}
        for f in binFiles:
            binId = binIdFromFilename(f)
            for seqId in readFastaSeqIds(f):
                seqIdToBinId[seqId] = binId

        # identify 16S reads from contigs/scaffolds
        self.logger.info('  Identifying SSU rRNAs on sequences.')
        self.__hmmSearch(contigFile, evalueThreshold, os.path.join(outputDir, 'ssu'))

        # read HMM hits
        hitsPerDomain = {}
        for domain in ['archaea', 'bacteria', 'euk']:
            hits = {}

            seqInfo = self.__readHits(os.path.join(outputDir, 'ssu' + '.' + domain + '.txt'), domain, evalueThreshold)
            for seqId, seqHits in seqInfo.items():
                for hit in seqHits:
                    self.__addHit(hits, seqId, hit, concatenateThreshold)

            hitsPerDomain[domain] = hits

        # find best domain hit for each sequence
        bestHits = {}
        for hits in hitsPerDomain.values():
            for seqId, info in hits.items():
                self.__addDomainHit(bestHits, parentSeqId(seqId), info)

        seqs = readFasta(contigFile)

        # group hits by the bin of their source sequence
        hitsToBins = {}
        for origSeqId in bestHits:
            binId = seqIdToBinId.get(parentSeqId(origSeqId), DefaultValues.UNBINNED)
            seqInfo = [origSeqId] + bestHits[origSeqId]
            hitsToBins.setdefault(binId, []).append(seqInfo)

        # write summary file and putative SSU rRNAs to file; the context
        # manager closes both files even if a lookup below fails
        summaryFile = os.path.join(outputDir, 'ssu_summary.tsv')
        seqFile = os.path.join(outputDir, 'ssu.fna')
        with open(summaryFile, 'w') as summaryOut, open(seqFile, 'w') as seqOut:
            summaryOut.write('Bin Id\tSeq. Id\tHMM\ti-Evalue\tStart hit\tEnd hit\t16S/18S gene length\tRev. Complement\tSequence length\n')

            for binId in sorted(hitsToBins.keys()):
                for seqInfo in hitsToBins[binId]:
                    seq = seqs[parentSeqId(seqInfo[0])]
                    summaryOut.write(binId + '\t' + '\t'.join(seqInfo) + '\t' + str(len(seq)) + '\n')
                    seqOut.write('>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqInfo[0] + '\n')
                    # seqInfo[3] and seqInfo[4] are the 'Start hit' and
                    # 'End hit' summary columns
                    seqOut.write(seq[int(seqInfo[3]):int(seqInfo[4])] + '\n')

        self.logger.info('')
        self.logger.info('  Identified ' + str(len(bestHits)) + ' putative SSU genes:')
        self.logger.info('    Summary of identified hits written to: ' + summaryFile)
        self.logger.info('    SSU sequences written to: ' + seqFile)
Ejemplo n.º 15
0
    def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer, maxEditDistPer, minQC):
        """Calculate coverage of sequences for each BAM file.

        binFiles: bin FASTA files, used to map sequence ids to bin ids and
            lengths; sequences in no bin are reported under
            DefaultValues.UNBINNED.
        bamFiles: BAM files to process; each must have a '.bai' index next
            to it, otherwise the process exits with status 1.
        outFile: destination of the tab-separated coverage table (stdout is
            redirected to it while the table is written).
        bAllReads, minAlignPer, maxEditDistPer, minQC: forwarded unchanged
            to the per-BAM processing step.

        The table has one row per sequence: sequence id, bin id, sequence
        length, then a (BAM id, coverage, mapped reads) triple per BAM file.
        """

        # determine bin assignment of each sequence
        self.logger.info('  Determining bin assignment of each sequence.')

        seqIdToBinId = {}
        seqIdToSeqLen = {}
        for binFile in binFiles:
            binId = binIdFromFilename(binFile)

            seqs = readFasta(binFile)
            for seqId, seq in seqs.items():
                seqIdToBinId[seqId] = binId
                seqIdToSeqLen[seqId] = len(seq)

        # process each fasta file
        self.logger.info("  Processing %d file(s) with %d threads.\n" % (len(bamFiles), self.totalThreads))

        # make sure all BAM files are sorted
        self.numFiles = len(bamFiles)
        for bamFile in bamFiles:
            if not os.path.exists(bamFile + '.bai'):
                self.logger.error('  [Error] BAM file is either unsorted or not indexed: ' + bamFile + '\n')
                sys.exit(1)

        # calculate coverage of each BAM file
        coverageInfo = {}
        numFilesStarted = 0
        for bamFile in bamFiles:
            numFilesStarted += 1
            self.logger.info('  Processing %s (%d of %d):' % (ntpath.basename(bamFile), numFilesStarted, len(bamFiles)))

            coverageInfo[bamFile] = mp.Manager().dict()
            coverageInfo[bamFile] = self.__processBam(bamFile, bAllReads, minAlignPer, maxEditDistPer, minQC, coverageInfo[bamFile])

        # get length of all seqs; lengths reported for the BAM files take
        # precedence over those read from the bins
        for bamFile, seqIds in coverageInfo.items():
            for seqId in seqIds.keys():
                seqIdToSeqLen[seqId] = seqIds[seqId].seqLen

        # redirect output
        self.logger.info('  Writing coverage information to file.')
        oldStdOut = reassignStdOut(outFile)
        try:
            header = 'Sequence Id\tBin Id\tSequence length (bp)'
            header += '\tBam Id\tCoverage\tMapped reads' * len(bamFiles)
            print(header)

            # write coverage stats for all scaffolds to file
            for seqId, seqLen in seqIdToSeqLen.items():
                rowStr = seqId + '\t' + seqIdToBinId.get(seqId, DefaultValues.UNBINNED) + '\t' + str(seqLen)
                for bamFile in bamFiles:
                    bamId = binIdFromFilename(bamFile)

                    if seqId in coverageInfo[bamFile]:
                        rowStr += '\t%s\t%f\t%d' % (bamId, coverageInfo[bamFile][seqId].coverage, coverageInfo[bamFile][seqId].mappedReads)
                    else:
                        # sequence absent from this BAM file's results
                        rowStr += '\t%s\t%f\t%d' % (bamId, 0, 0)

                print(rowStr)
        finally:
            # restore stdout even if writing the table failed, so later
            # output is not silently sent to the coverage file
            restoreStdOut(outFile, oldStdOut)
Ejemplo n.º 16
0
    def plot(self, binFile, markerGeneStats, binStats):
        """Plot the position of marker genes along the sequences of a bin.

        Each sequence with at least one marker gene gets a row (longest
        sequence first) split into position bins; every bin is coloured by
        the number of marker genes starting in it (0, 1, 2, 3, 4+).

        Returns False without drawing if no sequence has a marker gene,
        True once the figure has been drawn.
        """
        binId = binIdFromFilename(binFile)

        markerGenesPerSeq, _markerGeneNum = self.getMarkerGenesPerSeq(markerGeneStats)
        if not markerGenesPerSeq:
            return False

        # total bin size, plus the length of every sequence carrying markers
        seqs = readFasta(binFile)
        binSize = 0
        seqLens = {}
        for seqId, seq in seqs.iteritems():
            binSize += len(seq)
            if seqId in markerGenesPerSeq:
                seqLens[seqId] = len(seq)
        longestSeq = max(seqLens.values()) if seqLens else 0

        sortedSeqLens = sorted(seqLens.items(), key=lambda entry: entry[1], reverse=True)
        numRows = len(sortedSeqLens)

        MAX_BINS = 100
        plotBinSize = self.roundUpToNearest100(float(longestSeq) / MAX_BINS)
        yLabels = [entry[0] for entry in sortedSeqLens]

        # get position of genes in bin
        geneFile = os.path.join(self.options.out_folder, 'bins', binId, DefaultValues.PRODIGAL_AA)
        genePos = ProdigalFastaParser().genePositions(geneFile)

        # Set size of figure (the y-label extents are measured at the
        # requested size before the final height is known)
        figWidth = self.options.width
        padding = self.options.fig_padding
        self.fig.clear()
        self.fig.set_size_inches(figWidth, self.options.height)
        yLabelBounds = self.yLabelExtents(yLabels, self.options.font_size)

        heightBottomLabels = 0.4 + padding  # inches
        widthSideLabel = yLabelBounds.width * figWidth + padding  # inches
        widthPerBin = (figWidth - widthSideLabel - padding) / MAX_BINS

        titleHeight = 0.2
        HEIGHT_PER_ROW = 0.2
        height = HEIGHT_PER_ROW * numRows + heightBottomLabels + padding + titleHeight
        rowBinHeight = widthPerBin / HEIGHT_PER_ROW

        self.fig.set_size_inches(figWidth, height)
        axesLeft = widthSideLabel / figWidth
        axesBottom = heightBottomLabels / height
        axesWidth = 1.0 - (widthSideLabel + padding) / figWidth
        axesHeight = 1.0 - (heightBottomLabels + padding + titleHeight) / height
        axes = self.fig.add_axes([axesLeft, axesBottom, axesWidth, axesHeight])

        # set plot axis
        axes.set_xlim([0, MAX_BINS + 0.1])
        axes.set_xlabel('Position (' + str(plotBinSize) + ' bp/bin)')

        axes.set_ylim([0, numRows])
        axes.set_yticks(np.arange(0.5, numRows + 0.5, 1.0))
        axes.set_yticklabels(yLabels)

        # legend: one colour per marker count, last colour for 4 or more
        colours = [(1.0, 1.0, 1.0),
                   (127 / 255.0, 201 / 255.0, 127 / 255.0),
                   (255 / 255.0, 192 / 255.0, 134 / 255.0),
                   (190 / 255.0, 174 / 255.0, 212 / 255.0),
                   (0.0, 0.0, 0.0)]
        discreteColourMap = mpl.colors.ListedColormap(colours)
        legendRect = [padding / figWidth, padding / height, 0.15, 0.03 * (figWidth / height)]
        axisColourMap = self.fig.add_axes(legendRect)
        colourBar = mpl.colorbar.ColorbarBase(axisColourMap,
                                              cmap=discreteColourMap,
                                              norm=mpl.colors.Normalize(vmin=0, vmax=1),
                                              orientation='horizontal',
                                              drawedges=True)
        colourBar.set_ticks([0.1, 0.3, 0.5, 0.7, 0.9])
        colourBar.set_ticklabels(['0', '1', '2', '3', '4+'])
        colourBar.outline.set_linewidth(0.5)
        colourBar.dividers.set_linewidth(0.5)

        for tick in axisColourMap.xaxis.majorTicks:
            tick.tick1On = False
            tick.tick2On = False

        # plot each bin: one row per sequence, one rectangle per position bin
        for rowIndex, (seqId, seqLen) in enumerate(sortedSeqLens):
            rowCentre = rowIndex + 0.5

            markerCount = [0] * int(math.ceil(float(seqLen) / plotBinSize))
            for geneId, _markerGeneId, geneStartPos, _geneEndPos in markerGenesPerSeq[seqId]:
                binPos = int(float(genePos[geneId][0] + geneStartPos) / plotBinSize)
                markerCount[binPos] += 1

            for col, count in enumerate(markerCount):
                # counts beyond the palette share the final colour
                faceColour = colours[count] if count < len(colours) else colours[-1]
                axes.add_patch(Rectangle((col + 0.1, rowCentre - 0.4 * rowBinHeight),
                                         0.8,
                                         0.8 * rowBinHeight,
                                         facecolor=faceColour,
                                         lw=0.2))

        # set plot title
        summary = '(%.2f Mbp, %d seqs, %.2f%% complete, %.2f%% contamination)' % (
            float(binSize) / 1e6, len(seqs), binStats['Completeness'], binStats['Contamination'])
        axes.set_title(binId + '\n' + summary)

        # Prettify plot
        for tick in axes.yaxis.majorTicks:
            tick.tick1On = False
            tick.tick2On = False

        for tick in axes.xaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for tickLine in axes.yaxis.get_ticklines():
            tickLine.set_color(self.axesColour)

        for tickLine in axes.xaxis.get_ticklines():
            tickLine.set_color(self.axesColour)
            tickLine.set_ms(2)

        for loc, spine in axes.spines.iteritems():
            if loc in ['left', 'right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

        self.draw()

        return True