def testDistanceMax(self):
        """Check the distance between signatures of two different homopolymers.

        The K=2 signatures of 'AAAA' and 'GGGG' are compared; the distance
        between them is expected to be (approximately) 2.0.
        """
        calculator = GenomicSignatures(K=2, threads=1)

        observed = calculator.distance(calculator.seqSignature('AAAA'),
                                       calculator.seqSignature('GGGG'))

        self.assertAlmostEqual(observed, 2.0)
    def testDistanceZero(self):
        """Check that signatures of identical sequences are at distance 0."""
        calculator = GenomicSignatures(K=2, threads=1)

        # The same sequence is converted twice so that two independently
        # computed signatures are compared.
        sequence = 'AACC'
        first = calculator.seqSignature(sequence)
        second = calculator.seqSignature(sequence)

        self.assertEqual(calculator.distance(first, second), 0)
Ejemplo n.º 3
0
 def testDistanceMax(self):
     """Signatures of 'AAAA' and 'GGGG' (K=2) should be 2.0 apart."""
     sigCalc = GenomicSignatures(K=2, threads=1)

     sigA = sigCalc.seqSignature('AAAA')
     sigG = sigCalc.seqSignature('GGGG')

     self.assertAlmostEqual(sigCalc.distance(sigA, sigG), 2.0)
Ejemplo n.º 4
0
 def testDistanceZero(self):
     """Signatures of two identical sequences (K=2) should be 0 apart."""
     sigCalc = GenomicSignatures(K=2, threads=1)

     sigLeft = sigCalc.seqSignature('AACC')
     sigRight = sigCalc.seqSignature('AACC')
     observed = sigCalc.distance(sigLeft, sigRight)

     self.assertEqual(observed, 0)
Ejemplo n.º 5
0
    def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn,
                           queueOut):
        """Worker loop writing delta-TD samples for every queued genome.

        Genome identifiers are taken from queueIn until a None sentinel is
        received. For each genome the sequences read from
        <genomeDir>/<genomeId>/<genomeId>.fna are joined into one scaffold
        and its tetranucleotide signature is computed. For every window
        size, numWindows randomly placed windows are then compared against
        the genome signature. The signature and the per-window distances
        are written to ./deltaTD/<genomeId>.tsv, after which the identifier
        is put on queueOut. Elapsed times are printed to stdout.

        Args:
            windowSizes: iterable of window lengths. Processing of a genome
                stops at the first size that does not fit inside the
                scaffold, so sizes are presumably given in ascending order.
            numWindows: number of random windows sampled per window size.
            genomeDir: directory holding one sub-directory per genome.
            queueIn: queue of genome identifiers terminated by None.
            queueOut: queue receiving each identifier once it is processed.
        """
        while True:
            genomeId = queueIn.get(block=True, timeout=None)
            if genomeId is None:
                break

            start = time.time()

            seqs = readGenomicSeqsFromFasta(
                os.path.join(genomeDir, genomeId, genomeId + '.fna'))
            # Sequences are joined with an 'NNNN' spacer (presumably so that
            # no valid 4-mer spans two sequences; confirm against
            # GenomicSignatures' handling of N).
            genomeScaffold = 'NNNN'.join(seqs.values())

            # calculate tetranucleotide signature of genome
            gsCalculator = GenomicSignatures(4)
            genomeSig = gsCalculator.seqSignature(genomeScaffold)

            # The output file stays open for the whole genome. A previous
            # debugging close()/sys.exit() pair terminated the worker right
            # after the header line, leaving the code below unreachable.
            with open('./deltaTD/' + genomeId + '.tsv', 'w') as fout:
                fout.write('# Tetra signature = ' + str(genomeSig) + '\n')

                # calculate tetranucleotide distance distribution for
                # different window sizes
                startW = time.time()
                for windowSize in windowSizes:
                    endWindowPos = len(genomeScaffold) - windowSize
                    if endWindowPos <= 0:
                        # This might occur for the largest window sizes and
                        # smallest genomes
                        break

                    deltaTDs = []
                    for _ in range(numWindows):
                        # pick random window
                        startWindow = randint(0, endWindowPos)
                        windowSeq = genomeScaffold[
                            startWindow:(startWindow + windowSize)]

                        windowSig = gsCalculator.seqSignature(windowSeq)
                        deltaTDs.append(
                            gsCalculator.distance(genomeSig, windowSig))

                    fout.write('Windows Size = ' + str(windowSize) + '\n')
                    fout.write(','.join(map(str, deltaTDs)) + '\n')

            endW = time.time()
            print(endW - startW)

            queueOut.put(genomeId)

            end = time.time()
            print(end - start)
    def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
        """Consume genome ids from queueIn and write their delta-TD samples.

        The loop ends when None is read from queueIn. For each genome id:

        1. sequences are read from <genomeDir>/<genomeId>/<genomeId>.fna and
           joined into a single scaffold;
        2. the tetranucleotide signature of the scaffold is computed;
        3. for each entry of windowSizes, numWindows random windows are
           drawn and their distance to the genome signature is recorded;
        4. the signature and distances are written to
           ./deltaTD/<genomeId>.tsv and the id is put on queueOut.

        Elapsed times for the window sampling and for the whole genome are
        printed to stdout.

        Args:
            windowSizes: iterable of window lengths; iteration stops at the
                first length that does not fit in the scaffold (sizes are
                presumably ascending).
            numWindows: number of random windows per window size.
            genomeDir: root directory with one sub-directory per genome.
            queueIn: input queue of genome ids, terminated by None.
            queueOut: output queue receiving each processed genome id.
        """
        while True:
            genomeId = queueIn.get(block=True, timeout=None)
            if genomeId is None:
                break

            start = time.time()

            seqs = readGenomicSeqsFromFasta(os.path.join(genomeDir, genomeId, genomeId + '.fna'))
            genomeScaffold = 'NNNN'.join(seqs.values())
            scaffoldLen = len(genomeScaffold)

            # calculate tetranucleotide signature of genome
            gsCalculator = GenomicSignatures(4)
            genomeSig = gsCalculator.seqSignature(genomeScaffold)

            # Keep the file open until all window sizes are written. The
            # original closed it and called sys.exit() right after the
            # header, so no distances were written and queueOut was never
            # signalled.
            with open('./deltaTD/' + genomeId + '.tsv', 'w') as fout:
                fout.write('# Tetra signature = ' + str(genomeSig) + '\n')

                # calculate tetranucleotide distance distribution for different window sizes
                startW = time.time()
                for windowSize in windowSizes:
                    endWindowPos = scaffoldLen - windowSize
                    if endWindowPos <= 0:
                        # This might occur for the largest window sizes and smallest genomes
                        break

                    deltaTDs = []
                    for _ in range(numWindows):
                        # pick random window
                        startWindow = randint(0, endWindowPos)

                        windowSig = gsCalculator.seqSignature(genomeScaffold[startWindow:(startWindow + windowSize)])
                        deltaTDs.append(gsCalculator.distance(genomeSig, windowSig))

                    fout.write('Windows Size = ' + str(windowSize) + '\n')
                    fout.write(','.join(map(str, deltaTDs)) + '\n')

            endW = time.time()
            print(endW - startW)

            queueOut.put(genomeId)

            end = time.time()
            print(end - start)
Ejemplo n.º 7
0
    def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist,
                   axesDeltaTD):
        """Draw the tetranucleotide-distance (TD) plots for one bin.

        Two plots are produced: a histogram on axesHist of the distances
        between fixed-size sequence windows and the bin signature, and a
        scatter plot on axesDeltaTD of per-sequence delta-TD against
        sequence length, overlaid with reference distribution curves.

        If no sequence is longer than the configured window size, an error
        message is set as the x-label of axesHist and nothing else is drawn.

        Args:
            fastaFile: file passed to readFasta to obtain the bin sequences.
            tetraSigs: tetranucleotide signatures indexed by sequence id.
            distributionsToPlot: values selecting which reference
                distributions to draw; each is matched to the nearest
                available key with findNearest.
            axesHist: axes receiving the window histogram.
            axesDeltaTD: axes receiving the delta-TD vs. length scatter.
        """

        def tidyAxes(axes):
            # Shared cosmetic pass: first tick set on, second off, tick
            # lines and left/bottom spines in the axes colour, right/top
            # spines hidden.
            for axis in (axes.yaxis, axes.xaxis):
                for tick in axis.majorTicks:
                    tick.tick1On = True
                    tick.tick2On = False

            for axis in (axes.yaxis, axes.xaxis):
                for tickLine in axis.get_ticklines():
                    tickLine.set_color(self.axesColour)

            for side, spine in axes.spines.iteritems():
                if side not in ['right', 'top']:
                    spine.set_color(self.axesColour)
                else:
                    spine.set_color('none')

        # Reference distributions stored on disk
        refDists = readDistribution('td_dist')

        # Tetranucleotide signature of the whole bin
        seqs = readFasta(fastaFile)

        binTools = BinTools()
        binSig = binTools.binTetraSig(seqs, tetraSigs)

        # Distances of non-overlapping windows to the bin signature
        genomicSig = GenomicSignatures(K=4, threads=1)

        windowDists = []
        seqLens = []
        deltaTDs = []
        for seqId, seq in seqs.iteritems():
            windowStart = 0
            windowEnd = self.options.td_window_size

            seqLen = len(seq)
            seqLens.append(seqLen)
            deltaTDs.append(genomicSig.distance(tetraSigs[seqId], binSig))

            while windowEnd < seqLen:
                windowSig = genomicSig.seqSignature(seq[windowStart:windowEnd])
                windowDists.append(genomicSig.distance(windowSig, binSig))

                windowStart = windowEnd
                windowEnd += self.options.td_window_size

        if not windowDists:
            axesHist.set_xlabel(
                '[Error] No seqs >= %d, the specified window size' %
                self.options.td_window_size)
            return

        deltaTDs = np.array(deltaTDs)

        # Histogram bin edges from 0.0 upwards in steps of the bin width
        binEdges = [0.0]
        edgeStep = self.options.td_bin_width
        nextEdge = edgeStep
        while nextEdge <= 1.0:
            binEdges.append(nextEdge)
            nextEdge += edgeStep

        axesHist.hist(windowDists,
                      bins=binEdges,
                      normed=True,
                      color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel(r'$\Delta$ TD')
        axesHist.set_ylabel('% windows (' + str(self.options.td_window_size) +
                            ' bp)')

        tidyAxes(axesHist)

        # Per-sequence statistics for the bin
        meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs,
                                                  binSig)

        # Delta-TD vs sequence length
        axesDeltaTD.scatter(deltaTDs,
                            seqLens,
                            c=abs(deltaTDs),
                            s=10,
                            lw=0.5,
                            cmap='gray_r')
        axesDeltaTD.set_xlabel(r'$\Delta$ TD (mean TD = %.2f)' % meanTD)
        axesDeltaTD.set_ylabel('Sequence length (kbp)')

        # Remember the limits chosen for the sequences before the reference
        # curves are added
        _, yMaxSeqs = axesDeltaTD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

        # Reference distribution curves
        for distToPlot in distributionsToPlot:
            boundKey = findNearest(refDists[refDists.keys()[0]].keys(),
                                   distToPlot)

            bounds = []
            sizes = []
            for windowSize in refDists:
                bounds.append(refDists[windowSize][boundKey])
                sizes.append(windowSize)

            # order both series by window size
            order = np.argsort(sizes)
            x = np.array(bounds)[order]
            y = np.array(sizes)[order]

            # Force x to be non-increasing as y grows; this is conservative
            # and visually satisfying.
            last = len(x) - 1
            for lo in xrange(0, last):
                for hi in xrange(lo + 1, len(x)):
                    if not x[hi] > x[lo]:
                        continue

                    if hi != last:
                        # interpolate values from neighbours
                        x[hi] = (x[hi - 1] + x[hi + 1]) / 2

                    if hi == last or x[hi] > x[lo]:
                        x[hi] = x[lo]

            axesDeltaTD.plot(x, y, 'r--', lw=0.5, zorder=0)

        # y-axis starts at zero and covers all sequences
        axesDeltaTD.set_ylim([0, yMaxSeqs])

        # x-axis restored to the range chosen for the sequences
        axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

        # vertical line at x=0
        axesDeltaTD.vlines(0,
                           0,
                           yMaxSeqs,
                           linestyle='dashed',
                           color=self.axesColour,
                           zorder=0)

        # Relabel y ticks from bp to kbp, dropping a trailing '.0'
        kbpLabels = [('%.1f' % (float(tickPos) / 1000)).replace('.0', '')
                     for tickPos in axesDeltaTD.get_yticks()]
        axesDeltaTD.set_yticklabels(kbpLabels)

        tidyAxes(axesDeltaTD)
Ejemplo n.º 8
0
    def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDeltaTD):
        """Draw the tetranucleotide-distance (TD) plots for one bin.

        A histogram of window-vs-bin signature distances is drawn on
        axesHist, and a scatter plot of per-sequence delta-TD against
        sequence length, overlaid with reference distribution curves, is
        drawn on axesDeltaTD.

        If no sequence is longer than self.options.td_window_size, an error
        message is set as the x-label of axesHist and the method returns
        without drawing anything else.

        Args:
            fastaFile: file passed to readFasta to obtain the bin sequences.
            tetraSigs: tetranucleotide signatures indexed by sequence id.
            distributionsToPlot: values selecting the reference
                distributions to draw; each is matched to the nearest
                available key via findNearest.
            axesHist: axes receiving the window histogram.
            axesDeltaTD: axes receiving the delta-TD vs. length scatter.

        Note: the code relies on Python 2 idioms (dict.iteritems, xrange,
        indexing the result of dict.keys()).
        """
        # Read reference distributions from file
        # (indexed below as dist[windowSize][boundKey])
        dist = readDistribution("td_dist")

        # get tetranucleotide signature for bin
        seqs = readFasta(fastaFile)

        binTools = BinTools()
        binSig = binTools.binTetraSig(seqs, tetraSigs)

        # get tetranucleotide distances for windows
        genomicSig = GenomicSignatures(K=4, threads=1)

        data = []  # distance of each window to the bin signature
        seqLens = []  # length of each sequence, in iteration order of seqs
        # NOTE(review): the values collected in deltaTDs here are replaced by
        # the result of binTools.tetraDiffDist() further down before they
        # are read.
        deltaTDs = []
        for seqId, seq in seqs.iteritems():
            start = 0
            end = self.options.td_window_size

            seqLen = len(seq)
            seqLens.append(seqLen)
            deltaTDs.append(genomicSig.distance(tetraSigs[seqId], binSig))

            # Step through consecutive, non-overlapping windows. The strict
            # '<' means a window ending exactly at the end of the sequence
            # is not evaluated.
            while end < seqLen:
                windowSig = genomicSig.seqSignature(seq[start:end])
                data.append(genomicSig.distance(windowSig, binSig))

                start = end
                end += self.options.td_window_size

        # No windows were produced: report it on the plot and stop
        if len(data) == 0:
            axesHist.set_xlabel("[Error] No seqs >= %d, the specified window size" % self.options.td_window_size)
            return

        deltaTDs = np.array(deltaTDs)

        # Histogram plot
        # Bin edges start at 0.0 and are accumulated in steps of
        # td_bin_width while they do not exceed 1.0.
        # NOTE(review): repeated float addition may overshoot 1.0 slightly
        # and drop the final edge for some bin widths - confirm.
        bins = [0.0]
        binWidth = self.options.td_bin_width
        binEnd = binWidth
        while binEnd <= 1.0:
            bins.append(binEnd)
            binEnd += binWidth

        # NOTE(review): 'normed' was replaced by 'density' in newer
        # matplotlib releases - confirm the targeted version.
        axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
        axesHist.set_xlabel(r"$\Delta$ TD")
        axesHist.set_ylabel("% windows (" + str(self.options.td_window_size) + " bp)")

        # Prettify plot
        for a in axesHist.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesHist.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesHist.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesHist.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        # Hide the right/top spines; colour the remaining ones
        for loc, spine in axesHist.spines.iteritems():
            if loc in ["right", "top"]:
                spine.set_color("none")
            else:
                spine.set_color(self.axesColour)

        # get CD bin statistics
        # (this rebinds deltaTDs, discarding the array built above)
        meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

        # Delta-TD vs Sequence length plot
        # NOTE(review): assumes deltaTDs is returned in the same sequence
        # order as seqLens - verify against tetraDiffDist.
        axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(deltaTDs), s=10, lw=0.5, cmap="gray_r")
        axesDeltaTD.set_xlabel(r"$\Delta$ TD (mean TD = %.2f)" % meanTD)
        axesDeltaTD.set_ylabel("Sequence length (kbp)")

        # Remember the limits chosen for the sequences; they are re-applied
        # after the reference curves have been plotted.
        _, yMaxSeqs = axesDeltaTD.get_ylim()
        xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

        # plot reference distributions
        for distToPlot in distributionsToPlot:
            # The keys of the first window-size entry are used to find the
            # key nearest to the requested distribution.
            boundKey = findNearest(dist[dist.keys()[0]].keys(), distToPlot)

            x = []
            y = []
            for windowSize in dist:
                x.append(dist[windowSize][boundKey])
                y.append(windowSize)

            # sort by y-values
            sortIndexY = np.argsort(y)
            x = np.array(x)[sortIndexY]
            y = np.array(y)[sortIndexY]

            # make sure x-values are strictly decreasing as y increases
            # as this is conservative and visually satisfying
            # (the clamping below actually enforces non-increasing values:
            # any x[j] above an earlier x[i] is pulled down to at most x[i])
            for i in xrange(0, len(x) - 1):
                for j in xrange(i + 1, len(x)):
                    if x[j] > x[i]:
                        if j == len(x) - 1:
                            x[j] = x[i]
                        else:
                            x[j] = (x[j - 1] + x[j + 1]) / 2  # interpolate values from neighbours

                        # still too large after interpolation: clamp
                        if x[j] > x[i]:
                            x[j] = x[i]

            axesDeltaTD.plot(x, y, "r--", lw=0.5, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axesDeltaTD.set_ylim([0, yMaxSeqs])

        # ensure x-axis is set appropriately for sequences
        axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

        # draw vertical line at x=0
        axesDeltaTD.vlines(0, 0, yMaxSeqs, linestyle="dashed", color=self.axesColour, zorder=0)

        # Change sequence lengths from bp to kbp
        yticks = axesDeltaTD.get_yticks()
        kbpLabels = []
        for seqLen in yticks:
            label = "%.1f" % (float(seqLen) / 1000)
            label = label.replace(".0", "")  # remove trailing zero
            kbpLabels.append(label)
        axesDeltaTD.set_yticklabels(kbpLabels)

        # Prettify plot
        for a in axesDeltaTD.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesDeltaTD.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesDeltaTD.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axesDeltaTD.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        # Hide the right/top spines; colour the remaining ones
        for loc, spine in axesDeltaTD.spines.iteritems():
            if loc in ["right", "top"]:
                spine.set_color("none")
            else:
                spine.set_color(self.axesColour)