Beispiel #1
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
        """Plot coding density (CD) diagnostics for the sequences of one bin.

        A histogram of per-window coding density is drawn on axesHist, and a
        scatter of per-sequence delta-CD against sequence length is drawn on
        axesDeltaCD together with bounds taken from the 'cd_dist' reference
        distribution.

        Args:
            fastaFile: FASTA file of the bin; passed to readFasta() and used
                to locate the bin's Prodigal gene feature file.
            distributionsToPlot: values p (presumably percentages) for which
                the reference values nearest (100 - p) / 2 and (100 + p) / 2
                are drawn as dashed lines.
            axesHist: matplotlib axes receiving the histogram.
            axesDeltaCD: matplotlib axes receiving the scatter plot.

        The process exits with status 1 if the gene feature file is missing.
        If no sequence is longer than the window size, an error label is set
        on axesHist and nothing else is drawn.
        """

        def styleAxes(axes):
            # show ticks and spines on the bottom/left only, in the axes colour
            for axis in (axes.yaxis, axes.xaxis):
                for tick in axis.majorTicks:
                    tick.tick1On = True
                    tick.tick2On = False

            for axis in (axes.yaxis, axes.xaxis):
                for tickLine in axis.get_ticklines():
                    tickLine.set_color(self.axesColour)

            for loc, spine in axes.spines.items():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(self.axesColour)

        # parse Prodigal output
        gffFile = os.path.join(self.options.out_folder, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
        if not os.path.exists(gffFile):
            print('Missing gene feature file (%s). This plot is not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
            sys.exit(1)  # non-zero status: this is a failure

        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        # Read reference distributions from file
        dist = readDistribution('cd_dist')

        # get coding density for windows
        seqs = readFasta(fastaFile)
        windowSize = self.options.cd_window_size

        data = []
        seqLens = []
        for seqId, seq in seqs.items():
            seqLen = len(seq)
            seqLens.append(seqLen)

            start = 0
            end = windowSize
            while end < seqLen:
                a, c, g, t = baseCount(seq[start:end])
                numBases = a + c + g + t

                # a window without any counted base (e.g. a long stretch of
                # N's) would cause a division by zero, so it is skipped
                if numBases > 0:
                    codingBases = prodigalParser.codingBases(seqId, start, end)
                    data.append(float(codingBases) / numBases)

                start = end
                end += windowSize

        if not data:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
            return

        # Histogram plot. Bin edges are obtained by multiplication instead of
        # repeated addition: accumulated rounding error can push the final
        # edge just above 1.0, which would drop the last bin.
        binWidth = self.options.cd_bin_width
        numBins = int(1.0 / binWidth + 1e-9)
        bins = [i * binWidth for i in range(numBins + 1)]
        if bins and abs(bins[-1] - 1.0) < 1e-9:
            bins[-1] = 1.0

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% coding density')
        axesHist.set_ylabel('% windows (' + str(windowSize) + ' bp)')

        # Prettify plot
        styleAxes(axesHist)

        # get CD bin statistics
        binTools = BinTools()
        meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

        # Delta-CD vs sequence length plot
        axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap=pylab.cm.Greys)
        axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
        axesDeltaCD.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the sequences so the reference
        # curves plotted below do not rescale the axes
        _, yMaxSeqs = axesDeltaCD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

        # plot reference distributions; the reference entry closest to the
        # bin's mean CD is the same for every requested distribution
        closestCD = findNearest(np.array(list(dist)), meanCD)
        refDist = dist[closestCD]
        refLens = sorted(refDist)
        boundKeys = list(refDist[next(iter(refDist))])

        for distToPlot in distributionsToPlot:
            # find closest distribution values
            cdLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
            cdUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

            # bounds ordered by y-value
            xL = np.array([refDist[refLen][cdLowerBoundKey] for refLen in refLens])
            xU = np.array([refDist[refLen][cdUpperBoundKey] for refLen in refLens])
            y = np.array(refLens)
            axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaCD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp (trailing '.0' removed)
        kbpLabels = [('%.1f' % (float(tickLen) / 1000)).replace('.0', '')
                     for tickLen in axesDeltaCD.get_yticks()]
        axesDeltaCD.set_yticklabels(kbpLabels)

        # Prettify plot
        styleAxes(axesDeltaCD)
Beispiel #2
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist,
                   axesDeltaGC):
        """Plot GC diagnostics for the sequences of one bin.

        A histogram of per-window GC content is drawn on axesHist, and a
        scatter of per-sequence delta-GC against sequence length is drawn on
        axesDeltaGC together with bounds taken from the 'gc_dist' reference
        distribution.

        Args:
            fastaFile: FASTA file of the bin; passed to readFasta().
            distributionsToPlot: values p (presumably percentages) for which
                the reference values nearest (100 - p) / 2 and (100 + p) / 2
                are drawn as dashed lines.
            axesHist: matplotlib axes receiving the histogram.
            axesDeltaGC: matplotlib axes receiving the scatter plot.

        If no sequence is longer than the window size, an error label is set
        on axesHist and nothing else is drawn.
        """

        def styleAxes(axes):
            # show ticks and spines on the bottom/left only, in the axes colour
            for axis in (axes.yaxis, axes.xaxis):
                for tick in axis.majorTicks:
                    tick.tick1On = True
                    tick.tick2On = False

            for axis in (axes.yaxis, axes.xaxis):
                for tickLine in axis.get_ticklines():
                    tickLine.set_color(self.axesColour)

            for loc, spine in axes.spines.items():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(self.axesColour)

        # Read reference distributions from file
        dist = readDistribution('gc_dist')

        # get GC for windows
        seqs = readFasta(fastaFile)
        windowSize = self.options.gc_window_size

        data = []
        seqLens = []
        for seq in seqs.values():
            seqLen = len(seq)
            seqLens.append(seqLen)

            start = 0
            end = windowSize
            while end < seqLen:
                a, c, g, t = baseCount(seq[start:end])
                numBases = a + c + g + t

                # it is possible to reach a long stretch of N's, which
                # would cause a division by zero; such windows are skipped
                if numBases > 0:
                    data.append(float(g + c) / numBases)

                start = end
                end += windowSize

        if not data:
            axesHist.set_xlabel(
                '[Error] No seqs >= %d, the specified window size' %
                self.options.gc_window_size)
            return

        # Histogram plot. Bin edges are obtained by multiplication instead of
        # repeated addition: accumulated rounding error can push the final
        # edge just above 1.0, which would drop the last bin.
        binWidth = self.options.gc_bin_width
        numBins = int(1.0 / binWidth + 1e-9)
        bins = [i * binWidth for i in range(numBins + 1)]
        if bins and abs(bins[-1] - 1.0) < 1e-9:
            bins[-1] = 1.0

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% GC')
        axesHist.set_ylabel('% windows (' + str(windowSize) + ' bp)')

        # Prettify plot
        styleAxes(axesHist)

        # get GC bin statistics
        binTools = BinTools()
        meanGC, deltaGCs, _ = binTools.gcDist(seqs)

        # Delta-GC vs Sequence length plot
        axesDeltaGC.scatter(deltaGCs,
                            seqLens,
                            c=abs(deltaGCs),
                            s=10,
                            lw=0.5,
                            cmap='gray_r')
        axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' %
                               (meanGC * 100))
        axesDeltaGC.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the sequences so the reference
        # curves plotted below do not rescale the axes
        _, yMaxSeqs = axesDeltaGC.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

        # plot reference distributions; the reference entry closest to the
        # bin's mean GC is the same for every requested distribution
        closestGC = findNearest(np.array(list(dist)), meanGC)
        refDist = dist[closestGC]
        refLens = sorted(refDist)
        boundKeys = list(refDist[next(iter(refDist))])

        for distToPlot in distributionsToPlot:
            # find closest distribution values
            gcLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
            gcUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

            # bounds ordered by y-value
            xL = np.array(
                [refDist[refLen][gcLowerBoundKey] for refLen in refLens])
            xU = np.array(
                [refDist[refLen][gcUpperBoundKey] for refLen in refLens])
            y = np.array(refLens)
            axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaGC.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaGC.vlines(0,
                           0,
                           yMaxSeqs,
                           linestyle='dashed',
                           color=self.axesColour,
                           zorder=0)

        # Change sequence lengths from bp to kbp (trailing '.0' removed)
        kbpLabels = [('%.1f' % (float(tickLen) / 1000)).replace('.0', '')
                     for tickLen in axesDeltaGC.get_yticks()]
        axesDeltaGC.set_yticklabels(kbpLabels)

        # Prettify plot
        styleAxes(axesDeltaGC)
Beispiel #3
0
    def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist,
                   axesDeltaTD):
        """Plot tetranucleotide distance (TD) diagnostics for one bin.

        A histogram of the distance between each window's tetranucleotide
        signature and the bin signature is drawn on axesHist, and a scatter
        of per-sequence delta-TD against sequence length is drawn on
        axesDeltaTD together with bounds taken from the 'td_dist' reference
        distribution.

        Args:
            fastaFile: FASTA file of the bin; passed to readFasta().
            tetraSigs: tetranucleotide signatures handed to
                BinTools.binTetraSig() and BinTools.tetraDiffDist().
            distributionsToPlot: values for which the nearest key of the
                reference distribution is drawn as a dashed line.
            axesHist: matplotlib axes receiving the histogram.
            axesDeltaTD: matplotlib axes receiving the scatter plot.

        If no sequence is longer than the window size, an error label is set
        on axesHist and nothing else is drawn.
        """

        def styleAxes(axes):
            # show ticks and spines on the bottom/left only, in the axes colour
            for axis in (axes.yaxis, axes.xaxis):
                for tick in axis.majorTicks:
                    tick.tick1On = True
                    tick.tick2On = False

            for axis in (axes.yaxis, axes.xaxis):
                for tickLine in axis.get_ticklines():
                    tickLine.set_color(self.axesColour)

            for loc, spine in axes.spines.items():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(self.axesColour)

        # Read reference distributions from file
        dist = readDistribution('td_dist')

        # get tetranucleotide signature for bin
        seqs = readFasta(fastaFile)

        binTools = BinTools()
        binSig = binTools.binTetraSig(seqs, tetraSigs)

        # get tetranucleotide distances for windows
        genomicSig = GenomicSignatures(K=4, threads=1)
        windowSize = self.options.td_window_size

        # NOTE: per-sequence distances are not computed here; they are
        # obtained from tetraDiffDist() below, which is what gets plotted.
        data = []
        seqLens = []
        for seq in seqs.values():
            seqLen = len(seq)
            seqLens.append(seqLen)

            start = 0
            end = windowSize
            while end < seqLen:
                windowSig = genomicSig.seqSignature(seq[start:end])
                data.append(genomicSig.distance(windowSig, binSig))

                start = end
                end += windowSize

        if not data:
            axesHist.set_xlabel(
                '[Error] No seqs >= %d, the specified window size' %
                self.options.td_window_size)
            return

        # Histogram plot. Bin edges are obtained by multiplication instead of
        # repeated addition: accumulated rounding error can push the final
        # edge just above 1.0, which would drop the last bin.
        binWidth = self.options.td_bin_width
        numBins = int(1.0 / binWidth + 1e-9)
        bins = [i * binWidth for i in range(numBins + 1)]
        if bins and abs(bins[-1] - 1.0) < 1e-9:
            bins[-1] = 1.0

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel(r'$\Delta$ TD')
        axesHist.set_ylabel('% windows (' + str(windowSize) + ' bp)')

        # Prettify plot
        styleAxes(axesHist)

        # get TD bin statistics
        meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs,
                                                  binSig)

        # Delta-TD vs Sequence length plot
        axesDeltaTD.scatter(deltaTDs,
                            seqLens,
                            c=abs(deltaTDs),
                            s=10,
                            lw=0.5,
                            cmap='gray_r')
        axesDeltaTD.set_xlabel(r'$\Delta$ TD (mean TD = %.2f)' % meanTD)
        axesDeltaTD.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the sequences so the reference
        # curves plotted below do not rescale the axes
        _, yMaxSeqs = axesDeltaTD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

        # plot reference distributions; candidate keys and the sorted
        # y-values are the same for every requested distribution
        boundKeys = list(dist[next(iter(dist))])
        refLens = sorted(dist)

        for distToPlot in distributionsToPlot:
            boundKey = findNearest(boundKeys, distToPlot)

            # reference values ordered by y-value
            x = np.array([dist[refLen][boundKey] for refLen in refLens])
            y = np.array(refLens)

            # make sure x-values never increase as y increases
            # as this is conservative and visually satisfying
            for i in range(0, len(x) - 1):
                for j in range(i + 1, len(x)):
                    if x[j] > x[i]:
                        if j == len(x) - 1:
                            x[j] = x[i]
                        else:
                            x[j] = (x[j - 1] + x[j + 1]
                                    ) / 2  # interpolate values from neighbours

                        # cap the value if interpolation did not fix it
                        if x[j] > x[i]:
                            x[j] = x[i]

            axesDeltaTD.plot(x, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaTD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaTD.vlines(0,
                           0,
                           yMaxSeqs,
                           linestyle='dashed',
                           color=self.axesColour,
                           zorder=0)

        # Change sequence lengths from bp to kbp (trailing '.0' removed)
        kbpLabels = [('%.1f' % (float(tickLen) / 1000)).replace('.0', '')
                     for tickLen in axesDeltaTD.get_yticks()]
        axesDeltaTD.set_yticklabels(kbpLabels)

        # Prettify plot
        styleAxes(axesDeltaTD)
Beispiel #4
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
        """Plot coding density (CD) diagnostics for the sequences of one bin.

        A histogram of per-window coding density is drawn on axesHist, and a
        scatter of per-sequence delta-CD against sequence length is drawn on
        axesDeltaCD together with bounds taken from the 'cd_dist' reference
        distribution.

        Args:
            fastaFile: FASTA file of the bin; passed to readFasta() and used
                to locate the bin's Prodigal gene feature file.
            distributionsToPlot: values p (presumably percentages) for which
                the reference values nearest (100 - p) / 2 and (100 + p) / 2
                are drawn as dashed lines.
            axesHist: matplotlib axes receiving the histogram.
            axesDeltaCD: matplotlib axes receiving the scatter plot.

        The error is logged and the process exits with status 1 if the gene
        feature file is missing. If no sequence is longer than the window
        size, an error label is set on axesHist and nothing else is drawn.
        """

        def styleAxes(axes):
            # show ticks and spines on the bottom/left only, in the axes colour
            for axis in (axes.yaxis, axes.xaxis):
                for tick in axis.majorTicks:
                    tick.tick1On = True
                    tick.tick2On = False

            for axis in (axes.yaxis, axes.xaxis):
                for tickLine in axis.get_ticklines():
                    tickLine.set_color(self.axesColour)

            for loc, spine in axes.spines.items():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(self.axesColour)

        # parse Prodigal output
        gffFile = os.path.join(self.options.results_dir, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
        if not os.path.exists(gffFile):
            self.logger.error('Missing gene feature file (%s). This plot is not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
            sys.exit(1)  # non-zero status: this is a failure

        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        # Read reference distributions from file
        dist = readDistribution('cd_dist')

        # get coding density for windows
        seqs = readFasta(fastaFile)
        windowSize = self.options.cd_window_size

        data = []
        seqLens = []
        for seqId, seq in seqs.items():
            seqLen = len(seq)
            seqLens.append(seqLen)

            start = 0
            end = windowSize
            while end < seqLen:
                a, c, g, t = baseCount(seq[start:end])
                numBases = a + c + g + t

                # a window without any counted base (e.g. a long stretch of
                # N's) would cause a division by zero, so it is skipped
                if numBases > 0:
                    codingBases = prodigalParser.codingBases(seqId, start, end)
                    data.append(float(codingBases) / numBases)

                start = end
                end += windowSize

        if not data:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
            return

        # Histogram plot. Bin edges are obtained by multiplication instead of
        # repeated addition: accumulated rounding error can push the final
        # edge just above 1.0, which would drop the last bin.
        binWidth = self.options.cd_bin_width
        numBins = int(1.0 / binWidth + 1e-9)
        bins = [i * binWidth for i in range(numBins + 1)]
        if bins and abs(bins[-1] - 1.0) < 1e-9:
            bins[-1] = 1.0

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% coding density')
        axesHist.set_ylabel('% windows (' + str(windowSize) + ' bp)')

        # Prettify plot
        styleAxes(axesHist)

        # get CD bin statistics
        binTools = BinTools()
        meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

        # Delta-CD vs sequence length plot
        axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap='gray_r')
        axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
        axesDeltaCD.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the sequences so the reference
        # curves plotted below do not rescale the axes
        _, yMaxSeqs = axesDeltaCD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

        # plot reference distributions; the reference entry closest to the
        # bin's mean CD is the same for every requested distribution
        closestCD = findNearest(np.array(list(dist)), meanCD)
        refDist = dist[closestCD]
        refLens = sorted(refDist)
        boundKeys = list(refDist[next(iter(refDist))])

        for distToPlot in distributionsToPlot:
            # find closest distribution values
            cdLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
            cdUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

            # bounds ordered by y-value
            xL = np.array([refDist[refLen][cdLowerBoundKey] for refLen in refLens])
            xU = np.array([refDist[refLen][cdUpperBoundKey] for refLen in refLens])
            y = np.array(refLens)
            axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaCD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp (trailing '.0' removed)
        kbpLabels = [('%.1f' % (float(tickLen) / 1000)).replace('.0', '')
                     for tickLen in axesDeltaCD.get_yticks()]
        axesDeltaCD.set_yticklabels(kbpLabels)

        # Prettify plot
        styleAxes(axesDeltaCD)
Beispiel #5
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
        """Plot GC diagnostics for the sequences of one bin.

        A histogram of per-window GC content is drawn on axesHist, and a
        scatter of per-sequence delta-GC against sequence length is drawn on
        axesDeltaGC together with bounds taken from the 'gc_dist' reference
        distribution.

        Args:
            fastaFile: FASTA file of the bin; passed to readFasta().
            distributionsToPlot: values p (presumably percentages) for which
                the reference values nearest (100 - p) / 2 and (100 + p) / 2
                are drawn as dashed lines.
            axesHist: matplotlib axes receiving the histogram.
            axesDeltaGC: matplotlib axes receiving the scatter plot.

        If no sequence is longer than the window size, an error label is set
        on axesHist and nothing else is drawn.
        """

        def styleAxes(axes):
            # show ticks and spines on the bottom/left only, in the axes colour
            for axis in (axes.yaxis, axes.xaxis):
                for tick in axis.majorTicks:
                    tick.tick1On = True
                    tick.tick2On = False

            for axis in (axes.yaxis, axes.xaxis):
                for tickLine in axis.get_ticklines():
                    tickLine.set_color(self.axesColour)

            for loc, spine in axes.spines.items():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(self.axesColour)

        # Read reference distributions from file
        dist = readDistribution('gc_dist')

        # get GC for windows
        seqs = readFasta(fastaFile)
        windowSize = self.options.gc_window_size

        data = []
        seqLens = []
        for seq in seqs.values():
            seqLen = len(seq)
            seqLens.append(seqLen)

            start = 0
            end = windowSize
            while end < seqLen:
                a, c, g, t = baseCount(seq[start:end])
                numBases = a + c + g + t

                # it is possible to reach a long stretch of N's, which
                # would cause a division by zero; such windows are skipped
                if numBases > 0:
                    data.append(float(g + c) / numBases)

                start = end
                end += windowSize

        if not data:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.gc_window_size)
            return

        # Histogram plot. Bin edges are obtained by multiplication instead of
        # repeated addition: accumulated rounding error can push the final
        # edge just above 1.0, which would drop the last bin.
        binWidth = self.options.gc_bin_width
        numBins = int(1.0 / binWidth + 1e-9)
        bins = [i * binWidth for i in range(numBins + 1)]
        if bins and abs(bins[-1] - 1.0) < 1e-9:
            bins[-1] = 1.0

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% GC')
        axesHist.set_ylabel('% windows (' + str(windowSize) + ' bp)')

        # Prettify plot
        styleAxes(axesHist)

        # get GC bin statistics
        binTools = BinTools()
        meanGC, deltaGCs, _ = binTools.gcDist(seqs)

        # Delta-GC vs Sequence length plot
        axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(deltaGCs), s=10, lw=0.5, cmap=pylab.cm.Greys)
        axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100))
        axesDeltaGC.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the sequences so the reference
        # curves plotted below do not rescale the axes
        _, yMaxSeqs = axesDeltaGC.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

        # plot reference distributions; the reference entry closest to the
        # bin's mean GC is the same for every requested distribution
        closestGC = findNearest(np.array(list(dist)), meanGC)
        refDist = dist[closestGC]
        refLens = sorted(refDist)
        boundKeys = list(refDist[next(iter(refDist))])

        for distToPlot in distributionsToPlot:
            # find closest distribution values
            gcLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
            gcUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

            # bounds ordered by y-value
            xL = np.array([refDist[refLen][gcLowerBoundKey] for refLen in refLens])
            xU = np.array([refDist[refLen][gcUpperBoundKey] for refLen in refLens])
            y = np.array(refLens)
            axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaGC.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaGC.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp (trailing '.0' removed)
        kbpLabels = [('%.1f' % (float(tickLen) / 1000)).replace('.0', '')
                     for tickLen in axesDeltaGC.get_yticks()]
        axesDeltaGC.set_yticklabels(kbpLabels)

        # Prettify plot
        styleAxes(axesDeltaGC)
Beispiel #6
0
    def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDeltaTD):
        """Plot tetranucleotide distance (TD) diagnostics for one bin.

        A histogram of the distance between each window's tetranucleotide
        signature and the bin signature is drawn on axesHist, and a scatter
        of per-sequence delta-TD against sequence length is drawn on
        axesDeltaTD together with bounds taken from the "td_dist" reference
        distribution.

        Args:
            fastaFile: FASTA file of the bin; passed to readFasta().
            tetraSigs: tetranucleotide signatures handed to
                BinTools.binTetraSig() and BinTools.tetraDiffDist().
            distributionsToPlot: values for which the nearest key of the
                reference distribution is drawn as a dashed line.
            axesHist: matplotlib axes receiving the histogram.
            axesDeltaTD: matplotlib axes receiving the scatter plot.

        If no sequence is longer than the window size, an error label is set
        on axesHist and nothing else is drawn.
        """

        def styleAxes(axes):
            # show ticks and spines on the bottom/left only, in the axes colour
            for axis in (axes.yaxis, axes.xaxis):
                for tick in axis.majorTicks:
                    tick.tick1On = True
                    tick.tick2On = False

            for axis in (axes.yaxis, axes.xaxis):
                for tickLine in axis.get_ticklines():
                    tickLine.set_color(self.axesColour)

            for loc, spine in axes.spines.items():
                if loc in ["right", "top"]:
                    spine.set_color("none")
                else:
                    spine.set_color(self.axesColour)

        # Read reference distributions from file
        dist = readDistribution("td_dist")

        # get tetranucleotide signature for bin
        seqs = readFasta(fastaFile)

        binTools = BinTools()
        binSig = binTools.binTetraSig(seqs, tetraSigs)

        # get tetranucleotide distances for windows
        genomicSig = GenomicSignatures(K=4, threads=1)
        windowSize = self.options.td_window_size

        # NOTE: per-sequence distances are not computed here; they are
        # obtained from tetraDiffDist() below, which is what gets plotted.
        data = []
        seqLens = []
        for seq in seqs.values():
            seqLen = len(seq)
            seqLens.append(seqLen)

            start = 0
            end = windowSize
            while end < seqLen:
                windowSig = genomicSig.seqSignature(seq[start:end])
                data.append(genomicSig.distance(windowSig, binSig))

                start = end
                end += windowSize

        if not data:
            axesHist.set_xlabel("[Error] No seqs >= %d, the specified window size" % self.options.td_window_size)
            return

        # Histogram plot. Bin edges are obtained by multiplication instead of
        # repeated addition: accumulated rounding error can push the final
        # edge just above 1.0, which would drop the last bin.
        binWidth = self.options.td_bin_width
        numBins = int(1.0 / binWidth + 1e-9)
        bins = [i * binWidth for i in range(numBins + 1)]
        if bins and abs(bins[-1] - 1.0) < 1e-9:
            bins[-1] = 1.0

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel(r"$\Delta$ TD")
        axesHist.set_ylabel("% windows (" + str(windowSize) + " bp)")

        # Prettify plot
        styleAxes(axesHist)

        # get TD bin statistics
        meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

        # Delta-TD vs Sequence length plot
        axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(deltaTDs), s=10, lw=0.5, cmap="gray_r")
        axesDeltaTD.set_xlabel(r"$\Delta$ TD (mean TD = %.2f)" % meanTD)
        axesDeltaTD.set_ylabel("Sequence length (kbp)")

        # remember the limits chosen for the sequences so the reference
        # curves plotted below do not rescale the axes
        _, yMaxSeqs = axesDeltaTD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

        # plot reference distributions; candidate keys and the sorted
        # y-values are the same for every requested distribution
        boundKeys = list(dist[next(iter(dist))])
        refLens = sorted(dist)

        for distToPlot in distributionsToPlot:
            boundKey = findNearest(boundKeys, distToPlot)

            # reference values ordered by y-value
            x = np.array([dist[refLen][boundKey] for refLen in refLens])
            y = np.array(refLens)

            # make sure x-values never increase as y increases
            # as this is conservative and visually satisfying
            for i in range(0, len(x) - 1):
                for j in range(i + 1, len(x)):
                    if x[j] > x[i]:
                        if j == len(x) - 1:
                            x[j] = x[i]
                        else:
                            x[j] = (x[j - 1] + x[j + 1]) / 2  # interpolate values from neighbours

                        # cap the value if interpolation did not fix it
                        if x[j] > x[i]:
                            x[j] = x[i]

            axesDeltaTD.plot(x, y, "r--", lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaTD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaTD.vlines(0, 0, yMaxSeqs, linestyle="dashed", color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp (trailing ".0" removed)
        kbpLabels = [("%.1f" % (float(tickLen) / 1000)).replace(".0", "")
                     for tickLen in axesDeltaTD.get_yticks()]
        axesDeltaTD.set_yticklabels(kbpLabels)

        # Prettify plot
        styleAxes(axesDeltaTD)
Beispiel #7
0
    def identifyOutliers(self, outDir, binFiles, tetraProfileFile,
                         distribution, reportType, outputFile):
        """Identify sequences that are outliers and write them to a table.

        For every bin, each sequence is compared against reference bounds
        for GC (two-sided), coding density (lower bound only) and
        tetranucleotide distance (upper bound only). Sequences that fall
        outside the bounds are written to a tab-separated report.

        Args:
            outDir: Output directory; the Prodigal GFF for a bin is looked
                up at ``<outDir>/bins/<binId>/<DefaultValues.PRODIGAL_GFF>``.
            binFiles: Iterable (with ``len``) of FASTA files, one per bin.
            tetraProfileFile: File passed to ``GenomicSignatures.read``.
            distribution: Numeric percentage used to select the bounds,
                e.g. 95 selects the keys nearest 2.5 and 97.5 for the
                two-sided GC test.
            reportType: ``'any'`` reports a sequence that is an outlier in
                at least one distribution; ``'all'`` only if it is an
                outlier in all three. Any other value reports nothing.
            outputFile: Path of the tab-separated report to write.

        Raises:
            SystemExit: If the gene feature file of a bin is missing.
        """

        self.logger.info('Reading reference distributions.')
        gcBounds = readDistribution('gc_dist')
        cdBounds = readDistribution('cd_dist')
        tdBounds = readDistribution('td_dist')

        # percentiles delimiting the central `distribution` percent
        lowerPercentile = (100 - distribution) / 2.0
        upperPercentile = (100 + distribution) / 2.0

        # the TD reference is keyed by sequence length only, so its key
        # list does not depend on the bin being processed
        tdSeqLens = list(tdBounds.keys())

        # the context manager guarantees the report is flushed and closed
        # even when sys.exit() or an exception aborts processing
        with open(outputFile, 'w') as fout:
            fout.write(
                'Bin Id\tSequence Id\tSequence length\tOutlying distributions')
            fout.write(
                '\tSequence GC\tMean bin GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)'
                % (distribution, distribution))
            fout.write('\tSequence CD\tMean bin CD\tLower CD bound (%s%%)' %
                       distribution)
            fout.write('\tSequence TD\tMean bin TD\tUpper TD bound (%s%%)\n' %
                       distribution)

            for processedBins, binFile in enumerate(binFiles, 1):
                binId = binIdFromFilename(binFile)

                self.logger.info('Finding outliers in %s (%d of %d).' %
                                 (binId, processedBins, len(binFiles)))

                seqs = readFasta(binFile)

                meanGC, deltaGCs, seqGC = self.gcDist(seqs)

                # NOTE(review): the tetranucleotide profile is re-read for
                # every bin; it looks loop-invariant, but confirm that
                # binTetraSig/tetraDiffDist do not mutate it before hoisting
                genomicSig = GenomicSignatures(K=4, threads=1)
                tetraSigs = genomicSig.read(tetraProfileFile)
                binSig = self.binTetraSig(seqs, tetraSigs)
                meanTD, deltaTDs = self.tetraDiffDist(seqs, genomicSig,
                                                      tetraSigs, binSig)

                gffFile = os.path.join(outDir, 'bins', binId,
                                       DefaultValues.PRODIGAL_GFF)
                if not os.path.exists(gffFile):
                    self.logger.error(
                        'Missing gene feature file (%s). This plot if not compatible with the --genes option.\n'
                        % DefaultValues.PRODIGAL_GFF)
                    sys.exit(1)

                prodigalParser = ProdigalGeneFeatureParser(gffFile)
                meanCD, deltaCDs, CDs = self.codingDensityDist(
                    seqs, prodigalParser)

                # find keys into GC and CD distributions; an arbitrary
                # sequence length entry is sampled to discover which
                # percentile keys are available
                closestGC = findNearest(np.array(list(gcBounds.keys())),
                                        meanGC)
                gcSeqLens = list(gcBounds[closestGC].keys())
                gcPercentiles = list(gcBounds[closestGC][gcSeqLens[0]].keys())
                gcLowerBoundKey = findNearest(gcPercentiles, lowerPercentile)
                gcUpperBoundKey = findNearest(gcPercentiles, upperPercentile)

                closestCD = findNearest(np.array(list(cdBounds.keys())),
                                        meanCD)
                cdSeqLens = list(cdBounds[closestCD].keys())
                cdPercentiles = list(cdBounds[closestCD][cdSeqLens[0]].keys())
                cdLowerBoundKey = findNearest(cdPercentiles, lowerPercentile)

                tdBoundKey = findNearest(
                    list(tdBounds[tdSeqLens[0]].keys()), distribution)

                # the per-sequence statistics are indexed positionally, in
                # the iteration order of `seqs`
                for index, (seqId, seq) in enumerate(seqs.items()):
                    seqLen = len(seq)

                    # find GC, CD, and TD bounds for this sequence length
                    closestSeqLen = findNearest(gcSeqLens, seqLen)
                    gcLengthBounds = gcBounds[closestGC][closestSeqLen]
                    gcLowerBound = gcLengthBounds[gcLowerBoundKey]
                    gcUpperBound = gcLengthBounds[gcUpperBoundKey]

                    closestSeqLen = findNearest(cdSeqLens, seqLen)
                    cdLowerBound = cdBounds[closestCD][closestSeqLen][
                        cdLowerBoundKey]

                    closestSeqLen = findNearest(tdSeqLens, seqLen)
                    tdBound = tdBounds[closestSeqLen][tdBoundKey]

                    outlyingDists = []
                    if (deltaGCs[index] < gcLowerBound
                            or deltaGCs[index] > gcUpperBound):
                        outlyingDists.append('GC')

                    if deltaCDs[index] < cdLowerBound:
                        outlyingDists.append('CD')

                    if deltaTDs[index] > tdBound:
                        outlyingDists.append('TD')

                    if (reportType == 'any' and len(outlyingDists) >= 1) or (
                            reportType == 'all' and len(outlyingDists) == 3):
                        fout.write(binId + '\t' + seqId + '\t%d' % seqLen +
                                   '\t' + ','.join(outlyingDists))
                        # bounds are stored as deltas from the bin mean, so
                        # the mean is added back before reporting
                        fout.write('\t%.1f\t%.1f\t%.1f\t%.1f' %
                                   (seqGC[index] * 100, meanGC * 100,
                                    (meanGC + gcLowerBound) * 100,
                                    (meanGC + gcUpperBound) * 100))
                        fout.write('\t%.1f\t%.1f\t%.1f' %
                                   (CDs[index] * 100, meanCD * 100,
                                    (meanCD + cdLowerBound) * 100))
                        fout.write('\t%.3f\t%.3f\t%.3f' %
                                   (deltaTDs[index], meanTD, tdBound) + '\n')