Esempio n. 1
0
    def calculateGC(self, seqs, seqStats):
        """Calculate fraction of nucleotides that are G or C.

        Only the A, C, G and T tallies returned by baseCount() are used, so
        any other character in a sequence does not affect the result. A
        sequence (or whole set of sequences) without any counted base is
        given a GC of 0.0.

        Parameters:
          seqs: mapping of sequence id -> sequence.
          seqStats: mapping of sequence id -> dict of statistics; the 'GC'
            entry of every sequence in seqs is set, so each id must already
            have an entry.

        Returns:
          (GC, stdGC): GC is the fraction over all sequences pooled
          together. stdGC is the square root of the mean squared deviation
          from GC of the per-sequence GC values, using only sequences longer
          than DefaultValues.MIN_SEQ_LEN_GC_STD; it is 0 when fewer than two
          sequences qualify.
        """
        totalGC = 0
        totalAT = 0
        gcPerSeq = []
        # items() (not the Python 2 only iteritems()) works on Python 2 and 3
        for seqId, seq in seqs.items():
            a, c, g, t = baseCount(seq)

            gc = g + c
            at = a + t

            totalGC += gc
            totalAT += at

            # guard against sequences with no A/C/G/T at all
            gcContent = float(gc) / (gc + at) if (gc + at) > 0 else 0.0
            seqStats[seqId]['GC'] = gcContent

            if len(seq) > DefaultValues.MIN_SEQ_LEN_GC_STD:
                gcPerSeq.append(gcContent)

        if (totalGC + totalAT) > 0:
            GC = float(totalGC) / (totalGC + totalAT)
        else:
            GC = 0.0

        varGC = 0
        if len(gcPerSeq) > 1:
            # a real list, because map() returns a lazy iterator on Python 3
            varGC = mean([(x - GC) ** 2 for x in gcPerSeq])

        return GC, math.sqrt(varGC)
Esempio n. 2
0
    def calculateGC(self, seqs, seqStats=None):
        """Calculate fraction of nucleotides that are G or C.

        Works from the A, C, G and T tallies returned by baseCount(); a
        sequence, or the pooled total, with no counted base yields 0.0.

        Parameters:
          seqs: mapping of sequence id -> sequence.
          seqStats: optional mapping of sequence id -> dict of statistics.
            When it is truthy, the 'GC' entry of each sequence is set; a
            None or empty mapping is left untouched.

        Returns:
          (GC, stdGC): pooled GC fraction, and the square root of the mean
          squared deviation from it of the per-sequence GC values for
          sequences longer than DefaultValues.MIN_SEQ_LEN_GC_STD (0 when
          fewer than two sequences qualify).
        """
        gcSum = 0
        atSum = 0
        longSeqGCs = []
        for seqId, seq in seqs.items():
            numA, numC, numG, numT = baseCount(seq)

            gcBases = numG + numC
            atBases = numA + numT
            gcSum += gcBases
            atSum += atBases

            counted = gcBases + atBases
            seqGC = float(gcBases) / counted if counted > 0 else 0.0

            if seqStats:
                seqStats[seqId]['GC'] = seqGC

            # only sufficiently long sequences contribute to the spread
            if len(seq) > DefaultValues.MIN_SEQ_LEN_GC_STD:
                longSeqGCs.append(seqGC)

        pooled = gcSum + atSum
        GC = float(gcSum) / pooled if pooled > 0 else 0.0

        varGC = 0
        if len(longSeqGCs) > 1:
            varGC = mean([(seqGC - GC)**2 for seqGC in longSeqGCs])

        return GC, math.sqrt(varGC)
 def testBaseCount(self):
     """Verify computation of base count on mixed-case sequence.

     The input holds two each of A, C, G, T, N and U across upper and lower
     case. The expected tallies are 2 for A, C and G and 4 for T, i.e. the
     test expects U/u to be added to the T count and N/n to be left out.
     """
     tallies = baseCount('ACGTacgtNnUu')
     a, c, g, t = tallies

     # checked in a, c, g, t order
     for observed, expected in ((a, 2), (c, 2), (g, 2), (t, 4)):
         self.assertEqual(observed, expected)
Esempio n. 4
0
    def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen):
        """Write the sequences of seqFile that are in none of the bins.

        Parameters:
          binFiles: iterable of FASTA files, one per bin.
          seqFile: FASTA file with all sequences; must exist.
          outSeqFile: path that receives the unbinned sequences in FASTA
            format.
          outStatsFile: path that receives a tab-separated table with the
            id, length and GC percentage of each unbinned sequence.
          minSeqLen: unbinned sequences shorter than this are not reported.

        Sequences are matched between bins and seqFile by sequence id.
        Progress and summary figures are written to self.logger.
        """
        checkFileExists(seqFile)

        # get list of sequences in bins
        self.logger.info('  Reading binned sequences.')

        binnedSeqs = {}
        totalBinnedBases = 0
        for binFile in binFiles:
            seqs = readFasta(binFile)
            binnedSeqs.update(seqs)
            totalBinnedBases += sum(len(seq) for seq in seqs.values())

        self.logger.info('    Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6))

        # get list of all sequences
        self.logger.info('  Reading all sequences.')
        allSeqs = readFasta(seqFile)
        totalBases = sum(len(seq) for seq in allSeqs.values())
        self.logger.info('    Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6))

        # write all unbinned sequences
        self.logger.info('  Identifying unbinned sequences >= %d bp.' % minSeqLen)

        unbinnedCount = 0
        unbinnedBases = 0
        # context managers close both files even if writing fails part-way
        with open(outSeqFile, 'w') as seqOut:
            with open(outStatsFile, 'w') as statsOut:
                statsOut.write('Sequence Id\tLength\tGC\n')

                for seqId, seq in allSeqs.items():
                    if seqId in binnedSeqs or len(seq) < minSeqLen:
                        continue

                    unbinnedCount += 1
                    seqOut.write('>' + seqId + '\n')
                    seqOut.write(seq + '\n')

                    unbinnedBases += len(seq)

                    a, c, g, t = baseCount(seq)
                    acgt = a + c + g + t

                    # a sequence without any A/C/G/T has no defined GC;
                    # report 0 instead of raising ZeroDivisionError
                    gcPercent = float(g + c) * 100 / acgt if acgt > 0 else 0.0
                    statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), gcPercent))

        self.logger.info('    Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6))

        # an empty input file would otherwise divide by zero here
        percentSeqs = unbinnedCount * 100.0 / len(allSeqs) if allSeqs else 0.0
        percentBases = unbinnedBases * 100.0 / totalBases if totalBases > 0 else 0.0

        self.logger.info('')
        self.logger.info('  Percentage of unbinned sequences: %.2f%%' % percentSeqs)
        self.logger.info('  Percentage of unbinned bases: %.2f%%' % percentBases)
Esempio n. 5
0
    def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen):
        """Write the sequences of seqFile that are in none of the bins.

        Parameters:
          binFiles: iterable of FASTA files, one per bin.
          seqFile: FASTA file with all sequences; must exist.
          outSeqFile: path that receives the unbinned sequences in FASTA
            format.
          outStatsFile: path that receives a tab-separated table with the
            id, length and GC percentage of each unbinned sequence.
          minSeqLen: unbinned sequences shorter than this are not reported.

        Sequences are matched between bins and seqFile by sequence id.
        Progress and summary figures are written to self.logger.
        """
        checkFileExists(seqFile)

        # get list of sequences in bins
        self.logger.info('Reading binned sequences.')

        binnedSeqs = {}
        totalBinnedBases = 0
        for binFile in binFiles:
            seqs = readFasta(binFile)
            binnedSeqs.update(seqs)
            totalBinnedBases += sum(len(seq) for seq in seqs.values())

        self.logger.info('  Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6))

        # get list of all sequences
        self.logger.info('Reading all sequences.')
        allSeqs = readFasta(seqFile)
        totalBases = sum(len(seq) for seq in allSeqs.values())
        self.logger.info('  Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6))

        # write all unbinned sequences
        self.logger.info('Identifying unbinned sequences >= %d bp.' % minSeqLen)

        unbinnedCount = 0
        unbinnedBases = 0
        # context managers close both files even if writing fails part-way
        with open(outSeqFile, 'w') as seqOut:
            with open(outStatsFile, 'w') as statsOut:
                statsOut.write('Sequence Id\tLength\tGC\n')

                for seqId, seq in allSeqs.items():
                    if seqId in binnedSeqs or len(seq) < minSeqLen:
                        continue

                    unbinnedCount += 1
                    seqOut.write('>' + seqId + '\n')
                    seqOut.write(seq + '\n')

                    unbinnedBases += len(seq)

                    a, c, g, t = baseCount(seq)
                    acgt = a + c + g + t

                    # a sequence without any A/C/G/T has no defined GC;
                    # report 0 instead of raising ZeroDivisionError
                    gcPercent = float(g + c) * 100 / acgt if acgt > 0 else 0.0
                    statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), gcPercent))

        self.logger.info('  Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6))

        # an empty input file would otherwise divide by zero here
        percentSeqs = unbinnedCount * 100.0 / len(allSeqs) if allSeqs else 0.0
        percentBases = unbinnedBases * 100.0 / totalBases if totalBases > 0 else 0.0

        self.logger.info('Percentage of unbinned sequences: %.2f%%' % percentSeqs)
        self.logger.info('Percentage of unbinned bases: %.2f%%' % percentBases)
Esempio n. 6
0
    def gcDist(self, seqs):
        """GC statistics for bin.

        Parameters:
          seqs: mapping of sequence id -> sequence.

        Returns:
          (meanGC, deltaGCs, GCs): meanGC is the GC fraction over the pooled
          A/C/G/T counts of all sequences, GCs is a list with the GC
          fraction of each sequence in the iteration order of seqs, and
          deltaGCs is a numpy array of GCs minus meanGC.

        A sequence without any A/C/G/T (as tallied by baseCount()) gets a GC
        of 0.0, and meanGC is 0.0 when no such base was seen at all; both
        cases used to raise ZeroDivisionError.
        """
        GCs = []
        gcTotal = 0
        basesTotal = 0
        for seq in seqs.values():
            a, c, g, t = baseCount(seq)
            gc = g + c
            bases = a + c + g + t

            # always append, so GCs keeps exactly one entry per sequence
            GCs.append(float(gc) / bases if bases > 0 else 0.0)

            gcTotal += gc
            basesTotal += bases

        meanGC = float(gcTotal) / basesTotal if basesTotal > 0 else 0.0
        deltaGCs = np.array(GCs) - meanGC

        return meanGC, deltaGCs, GCs
Esempio n. 7
0
    def gcDist(self, seqs):
        """GC statistics for bin.

        Parameters:
          seqs: mapping of sequence id -> sequence.

        Returns:
          (meanGC, deltaGCs, GCs): meanGC is the GC fraction over the pooled
          A/C/G/T counts of all sequences, GCs is a list with the GC
          fraction of each sequence in the iteration order of seqs, and
          deltaGCs is a numpy array of GCs minus meanGC.

        A sequence without any A/C/G/T (as tallied by baseCount()) gets a GC
        of 0.0, and meanGC is 0.0 when no such base was seen at all; both
        cases used to raise ZeroDivisionError.
        """
        GCs = []
        gcTotal = 0
        basesTotal = 0
        # values() instead of the Python 2 only iteritems(); the ids are unused
        for seq in seqs.values():
            a, c, g, t = baseCount(seq)
            gc = g + c
            bases = a + c + g + t

            # always append, so GCs keeps exactly one entry per sequence
            GCs.append(float(gc) / bases if bases > 0 else 0.0)

            gcTotal += gc
            basesTotal += bases

        meanGC = float(gcTotal) / basesTotal if basesTotal > 0 else 0.0
        deltaGCs = np.array(GCs) - meanGC

        return meanGC, deltaGCs, GCs
Esempio n. 8
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist,
                   axesDeltaGC):
        """Plot a windowed GC histogram and a delta-GC vs. length scatter.

        Parameters:
          fastaFile: FASTA file read with readFasta().
          distributionsToPlot: iterable of numbers d; for each one the
            entries nearest to (100 - d) / 2 and (100 + d) / 2 in the
            'gc_dist' reference distribution are drawn as dashed red lines.
          axesHist: axes receiving the histogram of per-window GC.
          axesDeltaGC: axes receiving the scatter of per-sequence
            (GC - mean GC) against sequence length.

        If no GC window could be computed, axesHist gets an error message as
        its x-label and nothing else is drawn.
        """
        # Read reference distributions from file
        dist = readDistribution('gc_dist')

        # get GC for windows
        seqs = readFasta(fastaFile)
        gcWindow = self.options.gc_window_size

        data = []
        seqLens = []
        for seq in seqs.values():
            seqLen = len(seq)
            seqLens.append(seqLen)

            start = 0
            end = gcWindow
            while end < seqLen:
                a, c, g, t = baseCount(seq[start:end])
                bases = a + c + g + t

                # a window that is a long stretch of N's has no defined GC;
                # skip it explicitly rather than hiding every possible
                # error behind a bare except
                if bases > 0:
                    data.append(float(g + c) / bases)

                start = end
                end += gcWindow

        if not data:
            axesHist.set_xlabel(
                '[Error] No seqs >= %d, the specified window size' %
                gcWindow)
            return

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.gc_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        # NOTE(review): newer matplotlib releases replaced 'normed' with
        # 'density' - confirm against the supported matplotlib version
        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% GC')
        axesHist.set_ylabel('% windows (' + str(gcWindow) + ' bp)')

        # Prettify plot
        self._styleTicksAndSpines(axesHist)

        # get GC bin statistics
        binTools = BinTools()
        meanGC, deltaGCs, _ = binTools.gcDist(seqs)

        # Delta-GC vs Sequence length plot
        axesDeltaGC.scatter(deltaGCs,
                            seqLens,
                            c=abs(deltaGCs),
                            s=10,
                            lw=0.5,
                            cmap='gray_r')
        axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' %
                               (meanGC * 100))
        axesDeltaGC.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the scatter; they are restored
        # after the reference curves have been drawn
        _, yMaxSeqs = axesDeltaGC.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            # list(...) keeps the key lookups working where keys() is a view
            closestGC = findNearest(np.array(list(dist.keys())), meanGC)

            # find closest distribution values
            sampleSeqLen = list(dist[closestGC].keys())[0]
            d = dist[closestGC][sampleSeqLen]
            boundKeys = list(d.keys())
            gcLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
            gcUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for refLen in dist[closestGC]:
                xL.append(dist[closestGC][refLen][gcLowerBoundKey])
                xU.append(dist[closestGC][refLen][gcUpperBoundKey])
                y.append(refLen)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]
            axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaGC.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaGC.vlines(0,
                           0,
                           yMaxSeqs,
                           linestyle='dashed',
                           color=self.axesColour,
                           zorder=0)

        # Change sequence lengths from bp to kbp
        kbpLabels = []
        for tickValue in axesDeltaGC.get_yticks():
            label = '%.1f' % (float(tickValue) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaGC.set_yticklabels(kbpLabels)

        # Prettify plot
        self._styleTicksAndSpines(axesDeltaGC)

    def _styleTicksAndSpines(self, axes):
        """Apply the shared tick and spine styling to one axes object."""
        for tick in axes.yaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for tick in axes.xaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        # hide the right and top spines, colour the remaining ones
        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
Esempio n. 9
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
        """Plot a windowed coding-density histogram and a delta-CD scatter.

        Parameters:
          fastaFile: FASTA file of the bin; its Prodigal gene feature file
            is looked up under self.options.out_folder.
          distributionsToPlot: iterable of numbers d; for each one the
            entries nearest to (100 - d) / 2 and (100 + d) / 2 in the
            'cd_dist' reference distribution are drawn as dashed red lines.
          axesHist: axes receiving the histogram of per-window coding
            density.
          axesDeltaCD: axes receiving the scatter of per-sequence
            (coding density - mean coding density) against sequence length.

        Exits the program when the gene feature file is missing. If no
        window could be computed, axesHist gets an error message as its
        x-label and nothing else is drawn.
        """
        # parse Prodigal output
        gffFile = os.path.join(self.options.out_folder, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
        if not os.path.exists(gffFile):
            # parenthesised so the line is valid on Python 2 and Python 3
            print('Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
            # NOTE(review): exits with a success status although this is an
            # error path - confirm whether callers rely on that
            sys.exit()

        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        # Read reference distributions from file
        dist = readDistribution('cd_dist')

        # get coding density for windows
        seqs = readFasta(fastaFile)
        cdWindow = self.options.cd_window_size

        data = []
        seqLens = []
        for seqId, seq in seqs.items():
            seqLen = len(seq)
            seqLens.append(seqLen)

            start = 0
            end = cdWindow
            while end < seqLen:
                codingBases = prodigalParser.codingBases(seqId, start, end)

                a, c, g, t = baseCount(seq[start:end])
                bases = a + c + g + t

                # a window without any A/C/G/T (e.g. a long stretch of N's)
                # has no defined coding density; skip it instead of raising
                # ZeroDivisionError
                if bases > 0:
                    data.append(float(codingBases) / bases)

                start = end
                end += cdWindow

        if not data:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % cdWindow)
            return

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.cd_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        # NOTE(review): newer matplotlib releases replaced 'normed' with
        # 'density' - confirm against the supported matplotlib version
        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% coding density')
        axesHist.set_ylabel('% windows (' + str(cdWindow) + ' bp)')

        # Prettify plot
        self._styleTicksAndSpines(axesHist)

        # get CD bin statistics
        binTools = BinTools()
        meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

        # Delta-CD vs sequence length plot
        axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap=pylab.cm.Greys)
        axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
        axesDeltaCD.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the scatter; they are restored
        # after the reference curves have been drawn
        _, yMaxSeqs = axesDeltaCD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            # list(...) keeps the key lookups working where keys() is a view
            closestCD = findNearest(np.array(list(dist.keys())), meanCD)

            # find closest distribution values
            sampleSeqLen = list(dist[closestCD].keys())[0]
            d = dist[closestCD][sampleSeqLen]
            boundKeys = list(d.keys())
            cdLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
            cdUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for refLen in dist[closestCD]:
                xL.append(dist[closestCD][refLen][cdLowerBoundKey])
                xU.append(dist[closestCD][refLen][cdUpperBoundKey])
                y.append(refLen)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]
            axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaCD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp
        kbpLabels = []
        for tickValue in axesDeltaCD.get_yticks():
            label = '%.1f' % (float(tickValue) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaCD.set_yticklabels(kbpLabels)

        # Prettify plot
        self._styleTicksAndSpines(axesDeltaCD)

    def _styleTicksAndSpines(self, axes):
        """Apply the shared tick and spine styling to one axes object."""
        for tick in axes.yaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for tick in axes.xaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        # hide the right and top spines, colour the remaining ones
        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
Esempio n. 10
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
        """Plot a windowed coding-density histogram and a delta-CD scatter.

        Parameters:
          fastaFile: FASTA file of the bin; its Prodigal gene feature file
            is looked up under self.options.results_dir.
          distributionsToPlot: iterable of numbers d; for each one the
            entries nearest to (100 - d) / 2 and (100 + d) / 2 in the
            'cd_dist' reference distribution are drawn as dashed red lines.
          axesHist: axes receiving the histogram of per-window coding
            density.
          axesDeltaCD: axes receiving the scatter of per-sequence
            (coding density - mean coding density) against sequence length.

        Logs an error and exits the program when the gene feature file is
        missing. If no window could be computed, axesHist gets an error
        message as its x-label and nothing else is drawn.
        """
        # parse Prodigal output
        gffFile = os.path.join(self.options.results_dir, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
        if not os.path.exists(gffFile):
            self.logger.error('Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
            # NOTE(review): exits with a success status although this is an
            # error path - confirm whether callers rely on that
            sys.exit()

        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        # Read reference distributions from file
        dist = readDistribution('cd_dist')

        # get coding density for windows
        seqs = readFasta(fastaFile)
        cdWindow = self.options.cd_window_size

        data = []
        seqLens = []
        for seqId, seq in seqs.items():
            seqLen = len(seq)
            seqLens.append(seqLen)

            start = 0
            end = cdWindow
            while end < seqLen:
                codingBases = prodigalParser.codingBases(seqId, start, end)

                a, c, g, t = baseCount(seq[start:end])
                bases = a + c + g + t

                # a window without any A/C/G/T (e.g. a long stretch of N's)
                # has no defined coding density; skip it instead of raising
                # ZeroDivisionError
                if bases > 0:
                    data.append(float(codingBases) / bases)

                start = end
                end += cdWindow

        if not data:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % cdWindow)
            return

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.cd_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        # NOTE(review): newer matplotlib releases replaced 'normed' with
        # 'density' - confirm against the supported matplotlib version
        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% coding density')
        axesHist.set_ylabel('% windows (' + str(cdWindow) + ' bp)')

        # Prettify plot
        self._styleTicksAndSpines(axesHist)

        # get CD bin statistics
        binTools = BinTools()
        meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

        # Delta-CD vs sequence length plot
        axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap='gray_r')
        axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
        axesDeltaCD.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the scatter; they are restored
        # after the reference curves have been drawn
        _, yMaxSeqs = axesDeltaCD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            # list(...) keeps the key lookups working where keys() is a view
            closestCD = findNearest(np.array(list(dist.keys())), meanCD)

            # find closest distribution values
            sampleSeqLen = list(dist[closestCD].keys())[0]
            d = dist[closestCD][sampleSeqLen]
            boundKeys = list(d.keys())
            cdLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
            cdUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for refLen in dist[closestCD]:
                xL.append(dist[closestCD][refLen][cdLowerBoundKey])
                xU.append(dist[closestCD][refLen][cdUpperBoundKey])
                y.append(refLen)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]
            axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaCD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp
        kbpLabels = []
        for tickValue in axesDeltaCD.get_yticks():
            label = '%.1f' % (float(tickValue) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaCD.set_yticklabels(kbpLabels)

        # Prettify plot
        self._styleTicksAndSpines(axesDeltaCD)

    def _styleTicksAndSpines(self, axes):
        """Apply the shared tick and spine styling to one axes object."""
        for tick in axes.yaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for tick in axes.xaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        # hide the right and top spines, colour the remaining ones
        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
Esempio n. 11
0
    def plotOnAxes(self, binFile, coverageProfile, windowAxes, seqAxes):
        """Plot GC against coverage, per window and per sequence.

        Parameters:
          binFile: FASTA file read with readFasta().
          coverageProfile: mapping of sequence id -> indexable whose element
            0 is used as the coverage of the whole sequence and whose
            element 1 is a list of per-window coverage values.
          windowAxes: axes receiving the per-window scatter, with a linear
            best fit line when more than one window is available.
          seqAxes: axes receiving the per-sequence scatter, with marker
            size growing with the log of the sequence length.
        """
        # get GC for windows
        seqs = readFasta(binFile)

        gcProfile = {}
        for seqId, seq in seqs.items():
            start = 0
            end = self.options.window_size

            # NOTE(review): a window (or sequence) without any A/C/G/T
            # raises ZeroDivisionError below. Such windows are not skipped
            # because the window GC values are concatenated alongside
            # coverageProfile[seqId][1], presumably window for window -
            # confirm how the coverage windows are produced before guarding.
            windowGCs = []
            while end < len(seq):
                a, c, g, t = baseCount(seq[start:end])
                windowGCs.append(float(g + c) / (a + c + g + t))

                start = end
                end += self.options.window_size

            a, c, g, t = baseCount(seq)
            seqGC = float(g + c) / (a + c + g + t)
            gcProfile[seqId] = [seqGC, windowGCs]

        # plot GC vs coverage for windows
        gc = []
        coverage = []
        for seqId, gcInfo in gcProfile.items():
            gc += gcInfo[1]
            coverage += coverageProfile[seqId][1]

        windowAxes.scatter(gc, coverage, c=abs(array(coverage)), s=10, lw=0.5, cmap=pylab.cm.Greys)
        windowAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc)*100))
        windowAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))

        # plot linear regression line
        if len(gc) > 1:
            slope, inter = polyfit(gc, coverage, 1)
            fit_fn = poly1d([slope, inter])  # maps a GC value to the fitted coverage
            windowAxes.plot([min(gc), max(gc)], fit_fn([min(gc), max(gc)]), '--r', lw=0.5)
            windowAxes.set_title('GC vs. Coverage\n(window size = %d bp, slope = %.2f)' % (self.options.window_size, slope))
        else:
            # not possible to calculate best fit line
            windowAxes.set_title('GC vs. Coverage\n(window size = %d bp, no best fit line)' % self.options.window_size)

        # Prettify plot
        self._styleTicksAndSpines(windowAxes)

        # plot GC vs coverage for entire sequences
        gc = []
        coverage = []
        seqLen = []
        for seqId, gcInfo in gcProfile.items():
            gc.append(gcInfo[0])
            coverage.append(coverageProfile[seqId][0])
            seqLen.append(len(seqs[seqId]))

        # set marker size from the log of the sequence length
        markerSize = log(array(seqLen))
        # shift so the shortest sequence maps to 0, then scale by the
        # largest log-length (not by the range)
        markerSize = (markerSize - min(markerSize)) / max(markerSize)
        # marker sizes therefore start at 10
        markerSize = markerSize*200 + 10

        seqAxes.scatter(gc, coverage, c=abs(array(coverage)), s=markerSize, lw=0.5, cmap=pylab.cm.Greys)
        seqAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc)*100))
        seqAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))
        seqAxes.set_title('GC vs. Coverage\nIndividual Sequences')

        # Prettify plot
        self._styleTicksAndSpines(seqAxes)

    def _styleTicksAndSpines(self, axes):
        """Apply the shared tick and spine styling to one axes object."""
        for tick in axes.yaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for tick in axes.xaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        # hide the right and top spines, colour the remaining ones;
        # items() instead of the Python 2 only iteritems()
        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
Esempio n. 12
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
        """Plot a windowed GC histogram and a delta-GC vs. length scatter.

        Parameters:
          fastaFile: FASTA file read with readFasta().
          distributionsToPlot: iterable of numbers d; for each one the
            entries nearest to (100 - d) / 2 and (100 + d) / 2 in the
            'gc_dist' reference distribution are drawn as dashed red lines.
          axesHist: axes receiving the histogram of per-window GC.
          axesDeltaGC: axes receiving the scatter of per-sequence
            (GC - mean GC) against sequence length.

        If no GC window could be computed, axesHist gets an error message as
        its x-label and nothing else is drawn.
        """
        # Read reference distributions from file
        dist = readDistribution('gc_dist')

        # get GC for windows
        seqs = readFasta(fastaFile)
        gcWindow = self.options.gc_window_size

        data = []
        seqLens = []
        for seq in seqs.values():
            seqLen = len(seq)
            seqLens.append(seqLen)

            start = 0
            end = gcWindow
            while end < seqLen:
                a, c, g, t = baseCount(seq[start:end])
                bases = a + c + g + t

                # a window that is a long stretch of N's has no defined GC;
                # skip it explicitly rather than hiding every possible
                # error behind a bare except
                if bases > 0:
                    data.append(float(g + c) / bases)

                start = end
                end += gcWindow

        if not data:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % gcWindow)
            return

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.gc_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        # NOTE(review): newer matplotlib releases replaced 'normed' with
        # 'density' - confirm against the supported matplotlib version
        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% GC')
        axesHist.set_ylabel('% windows (' + str(gcWindow) + ' bp)')

        # Prettify plot
        self._styleTicksAndSpines(axesHist)

        # get GC bin statistics
        binTools = BinTools()
        meanGC, deltaGCs, _ = binTools.gcDist(seqs)

        # Delta-GC vs Sequence length plot
        axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(deltaGCs), s=10, lw=0.5, cmap=pylab.cm.Greys)
        axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100))
        axesDeltaGC.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the scatter; they are restored
        # after the reference curves have been drawn
        _, yMaxSeqs = axesDeltaGC.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            # list(...) keeps the key lookups working where keys() is a view
            closestGC = findNearest(np.array(list(dist.keys())), meanGC)

            # find closest distribution values
            sampleSeqLen = list(dist[closestGC].keys())[0]
            d = dist[closestGC][sampleSeqLen]
            boundKeys = list(d.keys())
            gcLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
            gcUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for refLen in dist[closestGC]:
                xL.append(dist[closestGC][refLen][gcLowerBoundKey])
                xU.append(dist[closestGC][refLen][gcUpperBoundKey])
                y.append(refLen)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]
            axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaGC.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaGC.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp
        kbpLabels = []
        for tickValue in axesDeltaGC.get_yticks():
            label = '%.1f' % (float(tickValue) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaGC.set_yticklabels(kbpLabels)

        # Prettify plot
        self._styleTicksAndSpines(axesDeltaGC)

    def _styleTicksAndSpines(self, axes):
        """Apply the shared tick and spine styling to one axes object."""
        for tick in axes.yaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for tick in axes.xaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        # hide the right and top spines, colour the remaining ones
        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)
Esempio n. 13
0
    def plotOnAxes(self, binFile, coverageProfile, windowAxes, seqAxes):
        """Draw GC vs. coverage scatter plots for windows and whole sequences.

        Parameters
        ----------
        binFile
            Passed to readFasta, which is expected to return a mapping of
            sequence id -> sequence.
        coverageProfile
            Mapping of sequence id -> indexable whose element 0 is the
            coverage of the whole sequence and whose element 1 is an
            iterable of per-window coverages.
            NOTE(review): the per-window coverages presumably line up
            one-to-one with the windows generated here -- confirm against
            the caller that builds coverageProfile.
        windowAxes
            Axes receiving the per-window scatter plot and, when more than
            one window exists, a linear best-fit line.
        seqAxes
            Axes receiving the per-sequence scatter plot, with marker size
            growing with the log of the sequence length.

        Windows or sequences without any A/C/G/T bases are given a GC
        fraction of 0.0 rather than raising ZeroDivisionError.
        """
        seqs = readFasta(binFile)
        windowSize = self.options.window_size

        # GC of each full window plus GC of the whole sequence. A window is
        # only emitted while its end lies strictly before the sequence end,
        # so a trailing window ending at (or past) the last base is dropped.
        gcProfile = {}
        for seqId, seq in seqs.items():
            windowGCs = [
                self._gcFraction(seq[start:start + windowSize])
                for start in range(0, len(seq) - windowSize, windowSize)
            ]
            gcProfile[seqId] = [self._gcFraction(seq), windowGCs]

        # plot GC vs coverage for windows
        gc = []
        coverage = []
        for seqId, gcInfo in gcProfile.items():
            gc += gcInfo[1]
            coverage += coverageProfile[seqId][1]

        windowAxes.scatter(gc,
                           coverage,
                           c=abs(array(coverage)),
                           s=10,
                           lw=0.5,
                           cmap='gray_r')
        windowAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc) * 100))
        windowAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))

        # plot linear regression line
        if len(gc) > 1:
            slope, inter = polyfit(gc, coverage, 1)
            # fit_fn takes an x value (GC) and returns the fitted coverage
            fit_fn = poly1d([slope, inter])
            gcRange = [min(gc), max(gc)]
            windowAxes.plot(gcRange, fit_fn(gcRange), '--r', lw=0.5)
            windowAxes.set_title(
                'GC vs. Coverage\n(window size = %d bp, slope = %.2f)' %
                (windowSize, slope))
        else:
            # not possible to calculate best fit line
            windowAxes.set_title(
                'GC vs. Coverage\n(window size = %d bp, no best fit line)' %
                windowSize)

        self._prettifyGcCoverageAxes(windowAxes)

        # plot GC vs coverage for entire sequences
        gc = []
        coverage = []
        seqLen = []
        for seqId, gcInfo in gcProfile.items():
            gc.append(gcInfo[0])
            coverage.append(coverageProfile[seqId][0])
            seqLen.append(len(seqs[seqId]))

        # Marker size grows with log(sequence length): the shortest sequence
        # gets size 10 and longer ones are scaled up from there.
        # NOTE(review): the offset is divided by the maximum log-length, not
        # by (max - min), so the largest marker stays below 210; kept as-is
        # so existing figures do not change.
        if seqLen:
            logLen = log(array(seqLen))
            largest = max(logLen)
            # avoid 0/0 when every log-length is zero
            scale = largest if largest > 0 else 1.0
            markerSize = (logLen - min(logLen)) / scale * 200 + 10
        else:
            # no sequences: nothing to scale
            markerSize = array([])

        seqAxes.scatter(gc,
                        coverage,
                        c=abs(array(coverage)),
                        s=markerSize,
                        lw=0.5,
                        cmap='gray_r')
        seqAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc) * 100))
        seqAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))
        seqAxes.set_title('GC vs. Coverage\nIndividual Sequences')

        self._prettifyGcCoverageAxes(seqAxes)

    def _gcFraction(self, seq):
        """Return (G + C) / (A + C + G + T) for seq, or 0.0 if no such bases."""
        a, c, g, t = baseCount(seq)
        total = a + c + g + t
        if total > 0:
            return float(g + c) / total
        return 0.0

    def _prettifyGcCoverageAxes(self, axes):
        """Apply the shared tick, tick-line and spine styling to axes."""
        # NOTE(review): tick1On/tick2On were deprecated in Matplotlib 3.1;
        # confirm they still have an effect on the targeted version.
        for axis in (axes.yaxis, axes.xaxis):
            for tick in axis.majorTicks:
                tick.tick1On = True
                tick.tick2On = False

        for axis in (axes.yaxis, axes.xaxis):
            for line in axis.get_ticklines():
                line.set_color(self.axesColour)

        # hide the right/top spines and colour the remaining ones
        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)