예제 #1
0
    def calculateCodingDensity(self, outDir, scaffolds, genomeSize):
        """Determine the coding density of a putative genome bin.

        The Prodigal gene feature (GFF) file and amino acid FASTA file are
        looked up in outDir.

        Returns a tuple of (coding density, translation table, number of
        sequences in the amino acid file), where the coding density is the
        number of coding bases summed over the ids in scaffolds divided by
        genomeSize. Returns (-1, -1, -1) if the GFF file does not exist.
        """
        featurePath = os.path.join(outDir, DefaultValues.PRODIGAL_GFF)
        if not os.path.exists(featurePath):
            # without a gene feature file (perhaps pre-calculated genes were
            # specified by the user) the coding density cannot be calculated
            return -1, -1, -1

        featureParser = ProdigalGeneFeatureParser(featurePath)

        # the AA file is read since the nucleotide file is optional
        proteinPath = os.path.join(outDir, DefaultValues.PRODIGAL_AA)
        proteins = readFasta(proteinPath)

        totalCodingBases = sum(featureParser.codingBases(scaffoldId)
                               for scaffoldId in scaffolds.keys())

        density = float(totalCodingBases) / genomeSize
        return density, featureParser.translationTable, len(proteins)
예제 #2
0
    def calculateCodingDensity(self, outDir, scaffolds, genomeSize):
        """Compute the coding density of a putative genome bin.

        Uses the Prodigal GFF and amino acid files located in outDir. The
        result is a 3-tuple: the fraction of genomeSize covered by coding
        bases on the given scaffolds, the translation table reported by the
        GFF parser, and the number of sequences in the amino acid file.
        The tuple (-1, -1, -1) is returned when there is no GFF file.
        """
        gffPath = os.path.join(outDir, DefaultValues.PRODIGAL_GFF)
        if not os.path.exists(gffPath):
            # no gene feature file exists (the user may have provided
            # pre-calculated genes), so no coding density can be reported
            return -1, -1, -1

        gffParser = ProdigalGeneFeatureParser(gffPath)

        # rely on the AA file because the nucleotide file is optional
        genes = readFasta(os.path.join(outDir, DefaultValues.PRODIGAL_AA))

        codingTotal = 0
        for scaffoldId in scaffolds.keys():
            codingTotal = codingTotal + gffParser.codingBases(scaffoldId)

        codingDensity = float(codingTotal) / genomeSize
        translationTable = gffParser.translationTable
        return codingDensity, translationTable, len(genes)
예제 #3
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
        """Plot coding density (CD) statistics of a bin on the given axes.

        A histogram of the coding density of consecutive, non-overlapping
        windows is drawn on axesHist. A scatter plot of the delta-CD of each
        sequence against its length is drawn on axesDeltaCD along with the
        bounds of the reference distributions in distributionsToPlot.

        fastaFile: FASTA file of the bin; its bin id locates the Prodigal GFF file
        distributionsToPlot: percentages selecting the reference bounds to draw;
            for a value p the bounds nearest (100 - p) / 2 and (100 + p) / 2 are used
        axesHist: axes receiving the histogram
        axesDeltaCD: axes receiving the delta-CD scatter plot

        The program is terminated with a non-zero exit status if the gene
        feature file is missing. If no sequence yields a window, an error is
        shown as the x-axis label of axesHist and nothing else is drawn.
        """

        def prettify(axes):
            # keep only the left/bottom ticks and spines, drawn in the axes colour
            for axis in (axes.yaxis, axes.xaxis):
                for tick in axis.majorTicks:
                    tick.tick1On = True
                    tick.tick2On = False

            for axis in (axes.yaxis, axes.xaxis):
                for line in axis.get_ticklines():
                    line.set_color(self.axesColour)

            for loc, spine in axes.spines.items():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(self.axesColour)

        # parse Prodigal output
        gffFile = os.path.join(self.options.out_folder, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
        if not os.path.exists(gffFile):
            print('Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
            # signal the failure to the caller's shell with a non-zero status
            sys.exit(1)

        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        # Read reference distributions from file
        dist = readDistribution('cd_dist')

        # get coding density for windows
        seqs = readFasta(fastaFile)

        data = []
        seqLens = []
        for seqId, seq in seqs.items():
            start = 0
            end = self.options.cd_window_size

            seqLen = len(seq)
            seqLens.append(seqLen)

            # NOTE(review): the strict comparison means a sequence exactly one
            # window long contributes no window, although the error label
            # below says '>='; confirm which is intended
            while end < seqLen:
                codingBases = prodigalParser.codingBases(seqId, start, end)

                # presumably the counts of A, C, G and T in the window
                a, c, g, t = baseCount(seq[start:end])
                countedBases = a + c + g + t
                if countedBases > 0:
                    data.append(float(codingBases) / countedBases)
                # else: nothing was counted in this window (e.g., all
                # ambiguous bases), so its coding density is undefined

                start = end
                end += self.options.cd_window_size

        if len(data) == 0:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
            return

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.cd_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% coding density')
        axesHist.set_ylabel('% windows (' + str(self.options.cd_window_size) + ' bp)')

        # Prettify plot
        prettify(axesHist)

        # get CD bin statistics
        binTools = BinTools()
        meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

        # Delta-CD vs sequence length plot
        axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap=pylab.cm.Greys)
        axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
        axesDeltaCD.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the sequences before the reference
        # distributions are added, so they can be restored afterwards
        _, yMaxSeqs = axesDeltaCD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            closestCD = findNearest(np.array(list(dist.keys())), meanCD)

            # find closest distribution values
            sampleSeqLen = list(dist[closestCD].keys())[0]
            d = dist[closestCD][sampleSeqLen]
            cdLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
            cdUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for windowSize in dist[closestCD]:
                xL.append(dist[closestCD][windowSize][cdLowerBoundKey])
                xU.append(dist[closestCD][windowSize][cdUpperBoundKey])
                y.append(windowSize)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]
            axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaCD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp
        yticks = axesDeltaCD.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.1f' % (float(seqLen) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaCD.set_yticklabels(kbpLabels)

        # Prettify plot
        prettify(axesDeltaCD)
예제 #4
0
    def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
        """Sample delta coding density (CD) values for each queued genome.

        Genome ids are read from queueIn until a None sentinel is received.
        For each genome the sequences are joined into a single scaffold
        (separated by runs of N's), Prodigal is run on it, and for every
        window size numWindows randomly placed windows free of N's are
        sampled. The difference between the coding density of each window and
        the mean coding density of the genome is written to
        ./deltaCD/<genomeId>.tsv, after which the genome id is put on queueOut.

        Window sizes for which the scaffold has no N-free window are skipped.
        """
        while True:
            genomeId = queueIn.get(block=True, timeout=None)
            if genomeId is None:
                break

            seqs = readGenomicSeqsFromFasta(os.path.join(genomeDir, genomeId, genomeId + '.fna'))

            # for simplicity, create a single scaffold from all sequences
            genomeFile = os.path.join('./deltaCD/genomes', genomeId + '.single_scaffold.fna')
            genomeScaffold = 'NNNNNNNNNN'.join(list(seqs.values())).upper()
            with open(genomeFile, 'w') as fout:
                fout.write('>' + genomeId + '\n')
                fout.write(genomeScaffold)

            # run prodigal on genome
            ntFile = os.path.join('./deltaCD/prodigal', genomeId + '.genes.fna')
            gffFile = os.path.join('./deltaCD/prodigal', genomeId + '.gff')

            # NOTE(review): the command is run through the shell and its exit
            # status is ignored; a failed run only surfaces when parsing the GFF
            cmd = ('prodigal -q -c -m -f gff -d %s -i %s > %s' % (ntFile, genomeFile, gffFile))
            os.system(cmd)

            # calculate mean coding density of genome
            numericScaffold = self.__createNumericScaffold(genomeScaffold)

            prodigalParser = ProdigalGeneFeatureParser(gffFile)

            codingBases = prodigalParser.codingBases(genomeId)

            # entries equal to 0 are the ones counted as bases
            counts = np.bincount(numericScaffold)
            totalBases = counts[0]

            meanCD = float(codingBases) / totalBases

            # A window is only accepted below when all of its entries are 0,
            # so a window size can only ever be sampled if the scaffold holds
            # a run of at least that many consecutive zero entries.
            nonBasePos = np.flatnonzero(np.asarray(numericScaffold) != 0)
            runEdges = np.concatenate(([-1], nonBasePos, [len(numericScaffold)]))
            longestCleanRun = int(np.max(np.diff(runEdges))) - 1

            with open('./deltaCD/' + genomeId + '.tsv', 'w') as fout:
                fout.write('# Mean CD = ' + str(meanCD) + '\n')

                # calculate coding density distribution for different window sizes
                for windowSize in windowSizes:
                    endWindowPos = len(genomeScaffold) - windowSize
                    if endWindowPos <= 0:
                        # This might occur for the largest window sizes and smallest genomes
                        break

                    if longestCleanRun < windowSize:
                        # every window of this size contains N's, so the
                        # sampling loop below would never terminate
                        continue

                    deltaCDs = []
                    while len(deltaCDs) != numWindows:
                        # pick random window
                        startWindow = randint(0, endWindowPos)

                        # calculate coding density
                        codingBases = prodigalParser.codingBases(genomeId, startWindow, startWindow + windowSize)
                        counts = np.bincount(numericScaffold[startWindow:(startWindow + windowSize)])
                        totalBases = counts[0]

                        if totalBases != windowSize:
                            # there are N's in the window so skip it
                            continue

                        cdPer = float(codingBases) / totalBases
                        deltaCDs.append(cdPer - meanCD)

                    fout.write('Windows Size = ' + str(windowSize) + '\n')
                    fout.write(','.join(map(str, deltaCDs)) + '\n')

            queueOut.put(genomeId)
예제 #5
0
    def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
        """Plot coding density (CD) statistics of a bin on the given axes.

        A histogram of the coding density of consecutive, non-overlapping
        windows is drawn on axesHist. A scatter plot of the delta-CD of each
        sequence against its length is drawn on axesDeltaCD along with the
        bounds of the reference distributions in distributionsToPlot.

        fastaFile: FASTA file of the bin; its bin id locates the Prodigal GFF file
        distributionsToPlot: percentages selecting the reference bounds to draw;
            for a value p the bounds nearest (100 - p) / 2 and (100 + p) / 2 are used
        axesHist: axes receiving the histogram
        axesDeltaCD: axes receiving the delta-CD scatter plot

        The program is terminated with a non-zero exit status if the gene
        feature file is missing. If no sequence yields a window, an error is
        shown as the x-axis label of axesHist and nothing else is drawn.
        """

        def prettify(axes):
            # keep only the left/bottom ticks and spines, drawn in the axes colour
            for axis in (axes.yaxis, axes.xaxis):
                for tick in axis.majorTicks:
                    tick.tick1On = True
                    tick.tick2On = False

            for axis in (axes.yaxis, axes.xaxis):
                for line in axis.get_ticklines():
                    line.set_color(self.axesColour)

            for loc, spine in axes.spines.items():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(self.axesColour)

        # parse Prodigal output
        gffFile = os.path.join(self.options.results_dir, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
        if not os.path.exists(gffFile):
            self.logger.error('Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
            # signal the failure to the caller's shell with a non-zero status
            sys.exit(1)

        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        # Read reference distributions from file
        dist = readDistribution('cd_dist')

        # get coding density for windows
        seqs = readFasta(fastaFile)

        data = []
        seqLens = []
        for seqId, seq in seqs.items():
            start = 0
            end = self.options.cd_window_size

            seqLen = len(seq)
            seqLens.append(seqLen)

            # NOTE(review): the strict comparison means a sequence exactly one
            # window long contributes no window, although the error label
            # below says '>='; confirm which is intended
            while end < seqLen:
                codingBases = prodigalParser.codingBases(seqId, start, end)

                # presumably the counts of A, C, G and T in the window
                a, c, g, t = baseCount(seq[start:end])
                countedBases = a + c + g + t
                if countedBases > 0:
                    data.append(float(codingBases) / countedBases)
                # else: nothing was counted in this window (e.g., all
                # ambiguous bases), so its coding density is undefined

                start = end
                end += self.options.cd_window_size

        if len(data) == 0:
            axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
            return

        # Histogram plot
        bins = [0.0]
        binWidth = self.options.cd_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel('% coding density')
        axesHist.set_ylabel('% windows (' + str(self.options.cd_window_size) + ' bp)')

        # Prettify plot
        prettify(axesHist)

        # get CD bin statistics
        binTools = BinTools()
        meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

        # Delta-CD vs sequence length plot
        axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap='gray_r')
        axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
        axesDeltaCD.set_ylabel('Sequence length (kbp)')

        # remember the limits chosen for the sequences before the reference
        # distributions are added, so they can be restored afterwards
        _, yMaxSeqs = axesDeltaCD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            closestCD = findNearest(np.array(list(dist.keys())), meanCD)

            # find closest distribution values
            sampleSeqLen = list(dist[closestCD].keys())[0]
            d = dist[closestCD][sampleSeqLen]
            cdLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
            cdUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for windowSize in dist[closestCD]:
                xL.append(dist[closestCD][windowSize][cdLowerBoundKey])
                xU.append(dist[closestCD][windowSize][cdUpperBoundKey])
                y.append(windowSize)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]
            axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaCD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp
        yticks = axesDeltaCD.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = '%.1f' % (float(seqLen) / 1000)
            label = label.replace('.0', '')  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaCD.set_yticklabels(kbpLabels)

        # Prettify plot
        prettify(axesDeltaCD)
예제 #6
0
    def identifyOutliers(self, outDir, binFiles, tetraProfileFile,
                         distribution, reportType, outputFile):
        """Identify sequences that are outliers.

        Each sequence of each bin is compared against the reference
        distributions for GC, coding density (CD) and tetranucleotide
        signature difference (TD), and the outlying sequences are written to
        outputFile as tab-separated rows.

        outDir: directory holding bins/<binId>/ with the Prodigal GFF file
        binFiles: FASTA files of the bins to process
        tetraProfileFile: file read by GenomicSignatures for the tetranucleotide signatures
        distribution: percentage selecting the reference bounds; GC uses the
            bounds nearest (100 - distribution) / 2 and (100 + distribution) / 2,
            CD the lower of these, and TD the bound nearest distribution
        reportType: 'any' reports sequences outlying in at least one
            distribution, 'all' only those outlying in all three; any other
            value reports nothing
        outputFile: path of the report to write

        The program exits with status 1 if the gene feature file of a bin is
        missing.
        """

        self.logger.info('Reading reference distributions.')
        gcBounds = readDistribution('gc_dist')
        cdBounds = readDistribution('cd_dist')
        tdBounds = readDistribution('td_dist')

        # the context manager closes (and flushes) the report even when the
        # method is left through sys.exit() or an exception
        with open(outputFile, 'w') as fout:
            fout.write(
                'Bin Id\tSequence Id\tSequence length\tOutlying distributions')
            fout.write(
                '\tSequence GC\tMean bin GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)'
                % (distribution, distribution))
            fout.write('\tSequence CD\tMean bin CD\tLower CD bound (%s%%)' %
                       distribution)
            fout.write('\tSequence TD\tMean bin TD\tUpper TD bound (%s%%)\n' %
                       distribution)

            for processedBins, binFile in enumerate(binFiles, 1):
                binId = binIdFromFilename(binFile)

                self.logger.info('Finding outliers in %s (%d of %d).' %
                                 (binId, processedBins, len(binFiles)))

                seqs = readFasta(binFile)

                meanGC, deltaGCs, seqGC = self.gcDist(seqs)

                # NOTE(review): the tetranucleotide profile is re-read for
                # every bin; it looks loop-invariant, but confirm that
                # binTetraSig/tetraDiffDist do not modify it before hoisting
                genomicSig = GenomicSignatures(K=4, threads=1)
                tetraSigs = genomicSig.read(tetraProfileFile)
                binSig = self.binTetraSig(seqs, tetraSigs)
                meanTD, deltaTDs = self.tetraDiffDist(seqs, genomicSig,
                                                      tetraSigs, binSig)

                gffFile = os.path.join(outDir, 'bins', binId,
                                       DefaultValues.PRODIGAL_GFF)
                if not os.path.exists(gffFile):
                    self.logger.error(
                        'Missing gene feature file (%s). This plot if not compatible with the --genes option.\n'
                        % DefaultValues.PRODIGAL_GFF)
                    sys.exit(1)

                prodigalParser = ProdigalGeneFeatureParser(gffFile)
                meanCD, deltaCDs, CDs = self.codingDensityDist(
                    seqs, prodigalParser)

                # find keys into GC and CD distributions
                closestGC = findNearest(np.array(list(gcBounds.keys())),
                                        meanGC)
                sampleSeqLen = list(gcBounds[closestGC].keys())[0]
                d = gcBounds[closestGC][sampleSeqLen]
                gcLowerBoundKey = findNearest(list(d.keys()),
                                              (100 - distribution) / 2.0)
                gcUpperBoundKey = findNearest(list(d.keys()),
                                              (100 + distribution) / 2.0)

                closestCD = findNearest(np.array(list(cdBounds.keys())),
                                        meanCD)
                sampleSeqLen = list(cdBounds[closestCD].keys())[0]
                d = cdBounds[closestCD][sampleSeqLen]
                cdLowerBoundKey = findNearest(list(d.keys()),
                                              (100 - distribution) / 2.0)

                tdBoundKey = findNearest(
                    list(tdBounds[list(tdBounds.keys())[0]].keys()),
                    distribution)

                # the per-sequence statistics are indexed by the position of
                # the sequence in the iteration order of seqs
                for index, (seqId, seq) in enumerate(seqs.items()):
                    seqLen = len(seq)

                    # find GC, CD, and TD bounds
                    closestSeqLen = findNearest(
                        list(gcBounds[closestGC].keys()), seqLen)
                    gcLowerBound = gcBounds[closestGC][closestSeqLen][
                        gcLowerBoundKey]
                    gcUpperBound = gcBounds[closestGC][closestSeqLen][
                        gcUpperBoundKey]

                    closestSeqLen = findNearest(
                        list(cdBounds[closestCD].keys()), seqLen)
                    cdLowerBound = cdBounds[closestCD][closestSeqLen][
                        cdLowerBoundKey]

                    closestSeqLen = findNearest(list(tdBounds.keys()), seqLen)
                    tdBound = tdBounds[closestSeqLen][tdBoundKey]

                    outlyingDists = []
                    if deltaGCs[index] < gcLowerBound or deltaGCs[
                            index] > gcUpperBound:
                        outlyingDists.append('GC')

                    if deltaCDs[index] < cdLowerBound:
                        outlyingDists.append('CD')

                    if deltaTDs[index] > tdBound:
                        outlyingDists.append('TD')

                    if (reportType == 'any' and len(outlyingDists) >= 1) or (
                            reportType == 'all' and len(outlyingDists) == 3):
                        fout.write(binId + '\t' + seqId + '\t%d' % seqLen +
                                   '\t' + ','.join(outlyingDists))
                        fout.write('\t%.1f\t%.1f\t%.1f\t%.1f' %
                                   (seqGC[index] * 100, meanGC * 100,
                                    (meanGC + gcLowerBound) * 100,
                                    (meanGC + gcUpperBound) * 100))
                        fout.write('\t%.1f\t%.1f\t%.1f' %
                                   (CDs[index] * 100, meanCD * 100,
                                    (meanCD + cdLowerBound) * 100))
                        fout.write('\t%.3f\t%.3f\t%.3f' %
                                   (deltaTDs[index], meanTD, tdBound) + '\n')
    def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
        """Sample delta coding density (CD) values for each queued genome.

        Genome ids are read from queueIn until a None sentinel is received.
        For each genome the sequences are joined into a single scaffold
        (separated by runs of N's), Prodigal is run on it, and for every
        window size numWindows randomly placed windows free of N's are
        sampled. The difference between the coding density of each window and
        the mean coding density of the genome is written to
        ./deltaCD/<genomeId>.tsv, after which the genome id is put on queueOut.

        Window sizes for which the scaffold has no N-free window are skipped.
        """
        while True:
            genomeId = queueIn.get(block=True, timeout=None)
            if genomeId is None:
                break

            seqs = readGenomicSeqsFromFasta(os.path.join(genomeDir, genomeId, genomeId + '.fna'))

            # for simplicity, create a single scaffold from all sequences
            genomeFile = os.path.join('./deltaCD/genomes', genomeId + '.single_scaffold.fna')
            genomeScaffold = 'NNNNNNNNNN'.join(seqs.values()).upper()
            with open(genomeFile, 'w') as fout:
                fout.write('>' + genomeId + '\n')
                fout.write(genomeScaffold)

            # run prodigal on genome
            ntFile = os.path.join('./deltaCD/prodigal', genomeId + '.genes.fna')
            gffFile = os.path.join('./deltaCD/prodigal', genomeId + '.gff')

            # NOTE(review): the command is run through the shell and its exit
            # status is ignored; a failed run only surfaces when parsing the GFF
            cmd = ('prodigal -q -c -m -f gff -d %s -i %s > %s' % (ntFile, genomeFile, gffFile))
            os.system(cmd)

            # calculate mean coding density of genome
            numericScaffold = self.__createNumericScaffold(genomeScaffold)

            prodigalParser = ProdigalGeneFeatureParser(gffFile)

            codingBases = prodigalParser.codingBases(genomeId)

            # entries equal to 0 are the ones counted as bases
            counts = np.bincount(numericScaffold)
            totalBases = counts[0]

            meanCD = float(codingBases) / totalBases

            # A window is only accepted below when all of its entries are 0,
            # so a window size can only ever be sampled if the scaffold holds
            # a run of at least that many consecutive zero entries.
            nonBasePos = np.flatnonzero(np.asarray(numericScaffold) != 0)
            runEdges = np.concatenate(([-1], nonBasePos, [len(numericScaffold)]))
            longestCleanRun = int(np.max(np.diff(runEdges))) - 1

            with open('./deltaCD/' + genomeId + '.tsv', 'w') as fout:
                fout.write('# Mean CD = ' + str(meanCD) + '\n')

                # calculate coding density distribution for different window sizes
                for windowSize in windowSizes:
                    endWindowPos = len(genomeScaffold) - windowSize
                    if endWindowPos <= 0:
                        # This might occur for the largest window sizes and smallest genomes
                        break

                    if longestCleanRun < windowSize:
                        # every window of this size contains N's, so the
                        # sampling loop below would never terminate
                        continue

                    deltaCDs = []
                    while len(deltaCDs) != numWindows:
                        # pick random window
                        startWindow = randint(0, endWindowPos)

                        # calculate coding density
                        codingBases = prodigalParser.codingBases(genomeId, startWindow, startWindow + windowSize)
                        counts = np.bincount(numericScaffold[startWindow:(startWindow + windowSize)])
                        totalBases = counts[0]

                        if totalBases != windowSize:
                            # there are N's in the window so skip it
                            continue

                        cdPer = float(codingBases) / totalBases
                        deltaCDs.append(cdPer - meanCD)

                    fout.write('Windows Size = ' + str(windowSize) + '\n')
                    fout.write(','.join(map(str, deltaCDs)) + '\n')

            queueOut.put(genomeId)