def testDistanceMax(self):
    """Expect a distance of 2.0 between the K=2 signatures of 'AAAA' and 'GGGG'."""
    calculator = GenomicSignatures(K=2, threads=1)

    # Signatures are built in the same order as they are compared.
    firstSignature = calculator.seqSignature('AAAA')
    secondSignature = calculator.seqSignature('GGGG')

    # assertAlmostEqual tolerates rounding in the computed distance.
    self.assertAlmostEqual(
        calculator.distance(firstSignature, secondSignature),
        2.0,
    )
def testDistanceZero(self):
    """Expect a distance of 0 between two signatures built from the same sequence ('AACC')."""
    calculator = GenomicSignatures(K=2, threads=1)

    sequence = 'AACC'
    firstSignature = calculator.seqSignature(sequence)
    secondSignature = calculator.seqSignature(sequence)

    # Identical inputs are expected to compare as exactly equal to zero.
    self.assertEqual(calculator.distance(firstSignature, secondSignature), 0)
def testDistanceMax(self):
    """Check that the K=2 signatures of 'AAAA' and 'GGGG' are a distance of 2.0 apart."""
    signatures = GenomicSignatures(K=2, threads=1)

    sigA = signatures.seqSignature('AAAA')
    sigG = signatures.seqSignature('GGGG')
    observed = signatures.distance(sigA, sigG)

    # Approximate comparison: allows for rounding in the computed distance.
    self.assertAlmostEqual(observed, 2.0)
def testDistanceZero(self):
    """Check that two signatures of the same sequence ('AACC') are a distance of 0 apart."""
    signatures = GenomicSignatures(K=2, threads=1)

    # Build the signature twice so two separately computed results are compared.
    sigLeft = signatures.seqSignature('AACC')
    sigRight = signatures.seqSignature('AACC')
    observed = signatures.distance(sigLeft, sigRight)

    self.assertEqual(observed, 0)
def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
    """Worker loop: sample tetranucleotide distances for each queued genome.

    Genome ids are read from queueIn until a None sentinel arrives. For each
    genome, the sequences in <genomeDir>/<genomeId>/<genomeId>.fna are joined
    (separated by 'NNNN') into one scaffold and its tetranucleotide signature
    is computed. For every window size, numWindows randomly placed windows are
    compared against the genome signature and the distances are written to
    ./deltaTD/<genomeId>.tsv. The genome id is then put on queueOut.

    Parameters:
      windowSizes: iterable of window lengths, in bp.
      numWindows:  number of random windows to sample per window size.
      genomeDir:   directory holding one sub-directory per genome id.
      queueIn:     queue of genome ids, terminated by None.
      queueOut:    queue that receives each genome id once it is processed.
    """
    while True:
        genomeId = queueIn.get(block=True, timeout=None)
        if genomeId is None:
            # sentinel: no more genomes to process
            break

        start = time.time()

        seqs = readGenomicSeqsFromFasta(os.path.join(genomeDir, genomeId, genomeId + '.fna'))
        genomeScaffold = 'NNNN'.join(seqs.values())
        scaffoldLen = len(genomeScaffold)

        # calculate tetranucleotide signature of genome
        gsCalculator = GenomicSignatures(4)
        genomeSig = gsCalculator.seqSignature(genomeScaffold)

        # The output file must stay open until every window size has been
        # written, so the header and the distributions share one 'with' block.
        with open('./deltaTD/' + genomeId + '.tsv', 'w') as fout:
            fout.write('# Tetra signature = ' + str(genomeSig) + '\n')

            # calculate tetranucleotide distance distribution for different window sizes
            startW = time.time()
            for windowSize in windowSizes:
                endWindowPos = scaffoldLen - windowSize
                if endWindowPos <= 0:
                    # This might occur for the largest window sizes and smallest genomes.
                    # NOTE(review): 'break' presumes windowSizes is ascending, so that
                    # all remaining sizes are too large as well - confirm with caller.
                    break

                deltaTDs = []
                for _ in range(numWindows):
                    # pick random window (randint is inclusive at both ends)
                    startWindow = randint(0, endWindowPos)
                    windowSig = gsCalculator.seqSignature(
                        genomeScaffold[startWindow:(startWindow + windowSize)])
                    deltaTDs.append(gsCalculator.distance(genomeSig, windowSig))

                fout.write('Windows Size = ' + str(windowSize) + '\n')
                fout.write(','.join(map(str, deltaTDs)) + '\n')

        endW = time.time()
        print(endW - startW)

        # signal that this genome has been fully processed
        queueOut.put(genomeId)

        end = time.time()
        print(end - start)
def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
    """Consume genome ids from queueIn and write their delta-TD samples to disk.

    Runs until a None sentinel is read from queueIn. For each genome id the
    FASTA file <genomeDir>/<genomeId>/<genomeId>.fna is read, its sequences are
    concatenated with 'NNNN' separators, and the tetranucleotide signature of
    the resulting scaffold is computed. For each entry of windowSizes,
    numWindows windows are drawn at random positions and their distance to the
    genome signature is recorded. Output goes to ./deltaTD/<genomeId>.tsv, and
    the genome id is placed on queueOut when the file is complete.

    Parameters:
      windowSizes: iterable of window lengths, in bp.
      numWindows:  number of random windows to sample per window size.
      genomeDir:   directory holding one sub-directory per genome id.
      queueIn:     queue of genome ids, terminated by None.
      queueOut:    queue that receives each genome id once it is processed.
    """
    while True:
        genomeId = queueIn.get(block=True, timeout=None)
        if genomeId is None:
            # sentinel: no more genomes to process
            break

        start = time.time()

        fastaPath = os.path.join(genomeDir, genomeId, genomeId + '.fna')
        seqs = readGenomicSeqsFromFasta(fastaPath)
        genomeScaffold = 'NNNN'.join(seqs.values())
        scaffoldLen = len(genomeScaffold)

        # calculate tetranucleotide signature of genome
        gsCalculator = GenomicSignatures(4)
        genomeSig = gsCalculator.seqSignature(genomeScaffold)

        # Header and per-window-size results go to the same file, so it is
        # kept open (and reliably closed) by a single 'with' block.
        with open('./deltaTD/' + genomeId + '.tsv', 'w') as fout:
            fout.write('# Tetra signature = ' + str(genomeSig) + '\n')

            # calculate tetranucleotide distance distribution for different window sizes
            startW = time.time()
            for windowSize in windowSizes:
                endWindowPos = scaffoldLen - windowSize
                if endWindowPos <= 0:
                    # This might occur for the largest window sizes and smallest genomes.
                    # NOTE(review): 'break' presumes windowSizes is ascending, so that
                    # all remaining sizes are too large as well - confirm with caller.
                    break

                deltaTDs = []
                for _ in range(numWindows):
                    # pick random window (randint is inclusive at both ends)
                    startWindow = randint(0, endWindowPos)
                    window = genomeScaffold[startWindow:(startWindow + windowSize)]
                    windowSig = gsCalculator.seqSignature(window)
                    deltaTDs.append(gsCalculator.distance(genomeSig, windowSig))

                fout.write('Windows Size = ' + str(windowSize) + '\n')
                fout.write(','.join(map(str, deltaTDs)) + '\n')

        endW = time.time()
        print(endW - startW)

        # signal that this genome has been fully processed
        queueOut.put(genomeId)

        end = time.time()
        print(end - start)
def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDeltaTD):
    """Plot the tetranucleotide distance (TD) profile of a bin on two axes.

    axesHist receives a histogram of the distance between the bin signature
    and each consecutive, non-overlapping window of
    self.options.td_window_size bp. axesDeltaTD receives a scatter plot of
    per-sequence delta-TD against sequence length, overlaid with the requested
    reference distributions read from 'td_dist'.

    Parameters:
      fastaFile:           FASTA file holding the sequences of the bin.
      tetraSigs:           tetranucleotide signatures indexed by sequence id;
                           passed on to BinTools.
      distributionsToPlot: values selecting which reference distributions to
                           draw; each is matched to the nearest available key
                           with findNearest.
      axesHist:            axes for the window histogram.
      axesDeltaTD:         axes for the delta-TD vs. sequence length plot.

    If no sequence is long enough to yield a window, an error message is set
    as the x-label of axesHist and nothing else is drawn.
    """

    def prettify(axes):
        # Shared styling: ticks only on the left/bottom, axes-coloured tick
        # lines and spines, and no right/top spines.
        for axis in (axes.yaxis, axes.xaxis):
            for tick in axis.majorTicks:
                tick.tick1On = True
                tick.tick2On = False

        for axis in (axes.yaxis, axes.xaxis):
            for line in axis.get_ticklines():
                line.set_color(self.axesColour)

        for loc, spine in axes.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

    # Read reference distributions from file
    dist = readDistribution('td_dist')

    # get tetranucleotide signature for bin
    seqs = readFasta(fastaFile)
    binTools = BinTools()
    binSig = binTools.binTetraSig(seqs, tetraSigs)

    # get tetranucleotide distances for windows
    genomicSig = GenomicSignatures(K=4, threads=1)
    windowSize = self.options.td_window_size

    data = []
    seqLens = []
    for seq in seqs.itervalues():
        seqLen = len(seq)
        seqLens.append(seqLen)

        start = 0
        end = windowSize
        # '<=' keeps the final window when it ends exactly at the end of the
        # sequence, so sequences >= the window size always contribute.
        while end <= seqLen:
            windowSig = genomicSig.seqSignature(seq[start:end])
            data.append(genomicSig.distance(windowSig, binSig))

            start = end
            end += windowSize

    if not data:
        axesHist.set_xlabel(
            '[Error] No seqs >= %d, the specified window size' % self.options.td_window_size)
        return

    # Histogram plot
    # NOTE(review): bin edges are accumulated in floating point, so the last
    # edge may fall marginally short of 1.0 for some bin widths.
    bins = [0.0]
    binWidth = self.options.td_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel(r'$\Delta$ TD')
    axesHist.set_ylabel('% windows (' + str(self.options.td_window_size) + ' bp)')

    # Prettify plot
    prettify(axesHist)

    # get CD bin statistics
    meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

    # Delta-TD vs Sequence length plot
    axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(deltaTDs), s=10, lw=0.5, cmap='gray_r')
    axesDeltaTD.set_xlabel(r'$\Delta$ TD (mean TD = %.2f)' % meanTD)
    axesDeltaTD.set_ylabel('Sequence length (kbp)')

    # remember the limits chosen for the sequences before overlaying references
    _, yMaxSeqs = axesDeltaTD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        boundKey = findNearest(dist[dist.keys()[0]].keys(), distToPlot)

        x = [dist[refWindowSize][boundKey] for refWindowSize in dist]
        y = list(dist)

        # sort by y-values
        sortIndexY = np.argsort(y)
        x = np.array(x)[sortIndexY]
        y = np.array(y)[sortIndexY]

        # make sure x-values are non-increasing as y increases
        # as this is conservative and visually satisfying
        numPoints = len(x)
        for i in xrange(0, numPoints - 1):
            for j in xrange(i + 1, numPoints):
                if x[j] > x[i]:
                    if j == numPoints - 1:
                        x[j] = x[i]
                    else:
                        # interpolate values from neighbours
                        x[j] = (x[j - 1] + x[j + 1]) / 2

                    # clamp if interpolation still exceeds the earlier value
                    if x[j] > x[i]:
                        x[j] = x[i]

        axesDeltaTD.plot(x, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaTD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaTD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    kbpLabels = []
    for tickPos in axesDeltaTD.get_yticks():
        label = '%.1f' % (float(tickPos) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaTD.set_yticklabels(kbpLabels)

    # Prettify plot
    prettify(axesDeltaTD)
def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDeltaTD):
    """Draw the tetranucleotide distance (TD) plots for a bin.

    Two plots are produced:
      * axesHist: histogram of the distance between the bin signature and
        each consecutive, non-overlapping window of
        self.options.td_window_size bp taken from the bin's sequences.
      * axesDeltaTD: scatter plot of per-sequence delta-TD against sequence
        length, overlaid with reference distributions read from "td_dist".

    Parameters:
      fastaFile:           FASTA file holding the sequences of the bin.
      tetraSigs:           tetranucleotide signatures indexed by sequence id;
                           passed on to BinTools.
      distributionsToPlot: values selecting which reference distributions to
                           draw; each is matched to the nearest available key
                           with findNearest.
      axesHist:            axes for the window histogram.
      axesDeltaTD:         axes for the delta-TD vs. sequence length plot.

    If no sequence is long enough to yield a window, an error message is set
    as the x-label of axesHist and nothing else is drawn.
    """

    def prettify(axes):
        # Shared styling: ticks only on the left/bottom, axes-coloured tick
        # lines and spines, and no right/top spines.
        for axis in (axes.yaxis, axes.xaxis):
            for tick in axis.majorTicks:
                tick.tick1On = True
                tick.tick2On = False

        for axis in (axes.yaxis, axes.xaxis):
            for line in axis.get_ticklines():
                line.set_color(self.axesColour)

        for loc, spine in axes.spines.iteritems():
            if loc in ["right", "top"]:
                spine.set_color("none")
            else:
                spine.set_color(self.axesColour)

    # Read reference distributions from file
    dist = readDistribution("td_dist")

    # get tetranucleotide signature for bin
    seqs = readFasta(fastaFile)
    binTools = BinTools()
    binSig = binTools.binTetraSig(seqs, tetraSigs)

    # get tetranucleotide distances for windows
    genomicSig = GenomicSignatures(K=4, threads=1)
    windowSize = self.options.td_window_size

    data = []
    seqLens = []
    for seq in seqs.itervalues():
        seqLen = len(seq)
        seqLens.append(seqLen)

        start = 0
        end = windowSize
        # "<=" keeps the final window when it ends exactly at the end of the
        # sequence, so sequences >= the window size always contribute.
        while end <= seqLen:
            windowSig = genomicSig.seqSignature(seq[start:end])
            data.append(genomicSig.distance(windowSig, binSig))

            start = end
            end += windowSize

    if not data:
        axesHist.set_xlabel("[Error] No seqs >= %d, the specified window size" % self.options.td_window_size)
        return

    # Histogram plot
    # NOTE(review): bin edges are accumulated in floating point, so the last
    # edge may fall marginally short of 1.0 for some bin widths.
    bins = [0.0]
    binWidth = self.options.td_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel(r"$\Delta$ TD")
    axesHist.set_ylabel("% windows (" + str(self.options.td_window_size) + " bp)")

    # Prettify plot
    prettify(axesHist)

    # get CD bin statistics
    meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

    # Delta-TD vs Sequence length plot
    axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(deltaTDs), s=10, lw=0.5, cmap="gray_r")
    axesDeltaTD.set_xlabel(r"$\Delta$ TD (mean TD = %.2f)" % meanTD)
    axesDeltaTD.set_ylabel("Sequence length (kbp)")

    # remember the limits chosen for the sequences before overlaying references
    _, yMaxSeqs = axesDeltaTD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        boundKey = findNearest(dist[dist.keys()[0]].keys(), distToPlot)

        x = [dist[refWindowSize][boundKey] for refWindowSize in dist]
        y = list(dist)

        # sort by y-values
        sortIndexY = np.argsort(y)
        x = np.array(x)[sortIndexY]
        y = np.array(y)[sortIndexY]

        # make sure x-values are non-increasing as y increases
        # as this is conservative and visually satisfying
        numPoints = len(x)
        for i in xrange(0, numPoints - 1):
            for j in xrange(i + 1, numPoints):
                if x[j] > x[i]:
                    if j == numPoints - 1:
                        x[j] = x[i]
                    else:
                        # interpolate values from neighbours
                        x[j] = (x[j - 1] + x[j + 1]) / 2

                    # clamp if interpolation still exceeds the earlier value
                    if x[j] > x[i]:
                        x[j] = x[i]

        axesDeltaTD.plot(x, y, "r--", lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaTD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaTD.vlines(0, 0, yMaxSeqs, linestyle="dashed", color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    kbpLabels = []
    for tickPos in axesDeltaTD.get_yticks():
        label = "%.1f" % (float(tickPos) / 1000)
        label = label.replace(".0", "")  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaTD.set_yticklabels(kbpLabels)

    # Prettify plot
    prettify(axesDeltaTD)