def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
    """Plot the coding density (CD) of a bin on the supplied axes.

    A histogram of per-window coding density is drawn on axesHist, and a
    scatter plot of per-sequence delta-CD versus sequence length is drawn
    on axesDeltaCD together with bounds taken from the 'cd_dist'
    reference distribution.

    Parameters:
      fastaFile           - FASTA file of the bin; its bin id is also used
                            to locate the Prodigal gene feature file under
                            <out_folder>/bins/<bin id>/.
      distributionsToPlot - iterable of values; each selects the reference
                            bounds nearest to (100 - value) / 2 and
                            (100 + value) / 2 (presumably percentages --
                            confirm against the caller).
      axesHist            - axes receiving the histogram.
      axesDeltaCD         - axes receiving the scatter plot.

    Exits the program with status 1 if the gene feature file is missing.
    If no window could be evaluated, an error label is set on axesHist
    and nothing else is drawn.
    """
    # parse Prodigal output
    gffFile = os.path.join(self.options.out_folder, 'bins',
                           binIdFromFilename(fastaFile),
                           DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffFile):
        # single-argument print(...) behaves the same on Python 2 and 3
        print('Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
        sys.exit(1)  # non-zero status: this is an error exit

    prodigalParser = ProdigalGeneFeatureParser(gffFile)

    # Read reference distributions from file
    dist = readDistribution('cd_dist')

    # get coding density for windows
    seqs = readFasta(fastaFile)
    windowSize = self.options.cd_window_size

    data = []
    seqLens = []
    for seqId, seq in seqs.items():
        seqLen = len(seq)
        seqLens.append(seqLen)

        start = 0
        end = windowSize
        # NOTE(review): a window ending exactly at the end of the sequence
        # is not evaluated (end < seqLen); kept as in the original.
        while end < seqLen:
            codingBases = prodigalParser.codingBases(seqId, start, end)
            a, c, g, t = baseCount(seq[start:end])

            # skip windows for which no A, C, G or T was counted (e.g. a
            # long stretch of N's) instead of dividing by zero
            numBases = a + c + g + t
            if numBases > 0:
                data.append(float(codingBases) / numBases)

            start = end
            end += windowSize

    if len(data) == 0:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % windowSize)
        return

    # Histogram plot
    # Bin edges are obtained by multiplication rather than repeated
    # addition: accumulating the width can overshoot 1.0 through rounding
    # error (e.g. for a width of 0.01), which dropped the final edge and
    # with it every window falling into the last bin.
    binWidth = self.options.cd_bin_width
    numBins = int(1.0 / binWidth + 1e-9)
    bins = [i * binWidth for i in range(numBins + 1)]

    # NOTE(review): newer matplotlib releases replace `normed` with
    # `density` -- confirm which matplotlib versions must be supported.
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% coding density')
    axesHist.set_ylabel('% windows (' + str(windowSize) + ' bp)')

    # Prettify plot: ticks on the left/bottom only, coloured like the axes
    for axis in (axesHist.yaxis, axesHist.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get CD bin statistics
    binTools = BinTools()
    meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

    # Delta-CD vs sequence length plot
    axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap=pylab.cm.Greys)
    axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
    axesDeltaCD.set_ylabel('Sequence length (kbp)')

    # remember the limits chosen for the sequences so that the reference
    # curves drawn below cannot change them
    _, yMaxSeqs = axesDeltaCD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # list(...) keeps the key lookups working with Python 3 dict views
        closestCD = findNearest(np.array(list(dist.keys())), meanCD)
        refDist = dist[closestCD]

        # find closest distribution values
        sampleSeqLen = list(refDist.keys())[0]
        d = refDist[sampleSeqLen]
        cdLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
        cdUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for refLen in refDist:
            xL.append(refDist[refLen][cdLowerBoundKey])
            xU.append(refDist[refLen][cdUpperBoundKey])
            y.append(refLen)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaCD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp ('.0' is stripped so that
    # whole numbers are shown without a trailing zero)
    yticks = axesDeltaCD.get_yticks()
    kbpLabels = [('%.1f' % (float(tick) / 1000)).replace('.0', '') for tick in yticks]
    axesDeltaCD.set_yticklabels(kbpLabels)

    # Prettify plot
    for axis in (axesDeltaCD.yaxis, axesDeltaCD.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesDeltaCD.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
    """Plot the GC content of a bin on the supplied axes.

    A histogram of per-window GC is drawn on axesHist, and a scatter plot
    of per-sequence delta-GC versus sequence length is drawn on
    axesDeltaGC together with bounds taken from the 'gc_dist' reference
    distribution.

    Parameters:
      fastaFile           - FASTA file of the bin.
      distributionsToPlot - iterable of values; each selects the reference
                            bounds nearest to (100 - value) / 2 and
                            (100 + value) / 2 (presumably percentages --
                            confirm against the caller).
      axesHist            - axes receiving the histogram.
      axesDeltaGC         - axes receiving the scatter plot.

    If no window could be evaluated, an error label is set on axesHist
    and nothing else is drawn.
    """
    # Read reference distributions from file
    dist = readDistribution('gc_dist')

    # get GC for windows
    seqs = readFasta(fastaFile)
    windowSize = self.options.gc_window_size

    data = []
    seqLens = []
    for seq in seqs.values():
        seqLen = len(seq)
        seqLens.append(seqLen)

        start = 0
        end = windowSize
        # NOTE(review): a window ending exactly at the end of the sequence
        # is not evaluated (end < seqLen); kept as in the original.
        while end < seqLen:
            a, c, g, t = baseCount(seq[start:end])

            # it is possible to reach a long stretch of N's, leaving no
            # bases to divide by; such windows are skipped explicitly
            # rather than by swallowing every exception
            numBases = a + c + g + t
            if numBases > 0:
                data.append(float(g + c) / numBases)

            start = end
            end += windowSize

    if len(data) == 0:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % windowSize)
        return

    # Histogram plot
    # Bin edges are obtained by multiplication rather than repeated
    # addition: accumulating the width can overshoot 1.0 through rounding
    # error (e.g. for a width of 0.01), which dropped the final edge and
    # with it every window falling into the last bin.
    binWidth = self.options.gc_bin_width
    numBins = int(1.0 / binWidth + 1e-9)
    bins = [i * binWidth for i in range(numBins + 1)]

    # NOTE(review): newer matplotlib releases replace `normed` with
    # `density` -- confirm which matplotlib versions must be supported.
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% GC')
    axesHist.set_ylabel('% windows (' + str(windowSize) + ' bp)')

    # Prettify plot: ticks on the left/bottom only, coloured like the axes
    for axis in (axesHist.yaxis, axesHist.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get GC bin statistics
    binTools = BinTools()
    meanGC, deltaGCs, _ = binTools.gcDist(seqs)

    # Delta-GC vs Sequence length plot
    axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(deltaGCs), s=10, lw=0.5, cmap='gray_r')
    axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100))
    axesDeltaGC.set_ylabel('Sequence length (kbp)')

    # remember the limits chosen for the sequences so that the reference
    # curves drawn below cannot change them
    _, yMaxSeqs = axesDeltaGC.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # list(...) keeps the key lookups working with Python 3 dict views
        closestGC = findNearest(np.array(list(dist.keys())), meanGC)
        refDist = dist[closestGC]

        # find closest distribution values
        sampleSeqLen = list(refDist.keys())[0]
        d = refDist[sampleSeqLen]
        gcLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
        gcUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for refLen in refDist:
            xL.append(refDist[refLen][gcLowerBoundKey])
            xU.append(refDist[refLen][gcUpperBoundKey])
            y.append(refLen)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaGC.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaGC.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp ('.0' is stripped so that
    # whole numbers are shown without a trailing zero)
    yticks = axesDeltaGC.get_yticks()
    kbpLabels = [('%.1f' % (float(tick) / 1000)).replace('.0', '') for tick in yticks]
    axesDeltaGC.set_yticklabels(kbpLabels)

    # Prettify plot
    for axis in (axesDeltaGC.yaxis, axesDeltaGC.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesDeltaGC.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDeltaTD):
    """Plot the tetranucleotide distance (TD) of a bin on the supplied axes.

    A histogram of the distance between each window's tetranucleotide
    signature and the bin signature is drawn on axesHist, and a scatter
    plot of per-sequence delta-TD versus sequence length is drawn on
    axesDeltaTD together with bounds taken from the 'td_dist' reference
    distribution.

    Parameters:
      fastaFile           - FASTA file of the bin.
      tetraSigs           - tetranucleotide signatures passed on to
                            BinTools.binTetraSig and BinTools.tetraDiffDist
                            (presumably keyed by sequence id -- confirm).
      distributionsToPlot - iterable of values; each selects the nearest
                            key of the reference distribution to plot.
      axesHist            - axes receiving the histogram.
      axesDeltaTD         - axes receiving the scatter plot.

    If no sequence yields a window, an error label is set on axesHist and
    nothing else is drawn.
    """
    # Read reference distributions from file
    dist = readDistribution('td_dist')

    # get tetranucleotide signature for bin
    seqs = readFasta(fastaFile)
    binTools = BinTools()
    binSig = binTools.binTetraSig(seqs, tetraSigs)

    # get tetranucleotide distances for windows
    genomicSig = GenomicSignatures(K=4, threads=1)
    windowSize = self.options.td_window_size

    data = []
    seqLens = []
    # The per-sequence distances formerly gathered in this loop were always
    # overwritten by tetraDiffDist() below before being used, so they are
    # no longer computed here.
    for seq in seqs.values():
        seqLen = len(seq)
        seqLens.append(seqLen)

        start = 0
        end = windowSize
        # NOTE(review): a window ending exactly at the end of the sequence
        # is not evaluated (end < seqLen); kept as in the original.
        while end < seqLen:
            windowSig = genomicSig.seqSignature(seq[start:end])
            data.append(genomicSig.distance(windowSig, binSig))

            start = end
            end += windowSize

    if len(data) == 0:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % windowSize)
        return

    # Histogram plot
    # Bin edges are obtained by multiplication rather than repeated
    # addition: accumulating the width can overshoot 1.0 through rounding
    # error (e.g. for a width of 0.01), which dropped the final edge and
    # with it every window falling into the last bin.
    binWidth = self.options.td_bin_width
    numBins = int(1.0 / binWidth + 1e-9)
    bins = [i * binWidth for i in range(numBins + 1)]

    # NOTE(review): newer matplotlib releases replace `normed` with
    # `density` -- confirm which matplotlib versions must be supported.
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel(r'$\Delta$ TD')
    axesHist.set_ylabel('% windows (' + str(windowSize) + ' bp)')

    # Prettify plot: ticks on the left/bottom only, coloured like the axes
    for axis in (axesHist.yaxis, axesHist.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get TD bin statistics
    meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

    # Delta-TD vs Sequence length plot
    axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(deltaTDs), s=10, lw=0.5, cmap='gray_r')
    axesDeltaTD.set_xlabel(r'$\Delta$ TD (mean TD = %.2f)' % meanTD)
    axesDeltaTD.set_ylabel('Sequence length (kbp)')

    # remember the limits chosen for the sequences so that the reference
    # curves drawn below cannot change them
    _, yMaxSeqs = axesDeltaTD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # list(...) keeps the key lookups working with Python 3 dict views
        firstKey = list(dist.keys())[0]
        boundKey = findNearest(list(dist[firstKey].keys()), distToPlot)

        x = []
        y = []
        for refLen in dist:
            x.append(dist[refLen][boundKey])
            y.append(refLen)

        # sort by y-values
        sortIndexY = np.argsort(y)
        x = np.array(x)[sortIndexY]
        y = np.array(y)[sortIndexY]

        # make sure x-values never increase as y increases
        # as this is conservative and visually satisfying
        for i in range(0, len(x) - 1):
            for j in range(i + 1, len(x)):
                if x[j] > x[i]:
                    if j == len(x) - 1:
                        x[j] = x[i]
                    else:
                        # interpolate values from neighbours
                        x[j] = (x[j - 1] + x[j + 1]) / 2

                    # the interpolated value may still exceed x[i]
                    if x[j] > x[i]:
                        x[j] = x[i]

        axesDeltaTD.plot(x, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaTD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaTD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp ('.0' is stripped so that
    # whole numbers are shown without a trailing zero)
    yticks = axesDeltaTD.get_yticks()
    kbpLabels = [('%.1f' % (float(tick) / 1000)).replace('.0', '') for tick in yticks]
    axesDeltaTD.set_yticklabels(kbpLabels)

    # Prettify plot
    for axis in (axesDeltaTD.yaxis, axesDeltaTD.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesDeltaTD.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
    """Plot the coding density (CD) of a bin on the supplied axes.

    A histogram of per-window coding density is drawn on axesHist, and a
    scatter plot of per-sequence delta-CD versus sequence length is drawn
    on axesDeltaCD together with bounds taken from the 'cd_dist'
    reference distribution.

    Parameters:
      fastaFile           - FASTA file of the bin; its bin id is also used
                            to locate the Prodigal gene feature file under
                            <results_dir>/bins/<bin id>/.
      distributionsToPlot - iterable of values; each selects the reference
                            bounds nearest to (100 - value) / 2 and
                            (100 + value) / 2 (presumably percentages --
                            confirm against the caller).
      axesHist            - axes receiving the histogram.
      axesDeltaCD         - axes receiving the scatter plot.

    Logs an error and exits the program with status 1 if the gene feature
    file is missing. If no window could be evaluated, an error label is
    set on axesHist and nothing else is drawn.
    """
    # parse Prodigal output
    gffFile = os.path.join(self.options.results_dir, 'bins',
                           binIdFromFilename(fastaFile),
                           DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffFile):
        self.logger.error('Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
        sys.exit(1)  # non-zero status: this is an error exit

    prodigalParser = ProdigalGeneFeatureParser(gffFile)

    # Read reference distributions from file
    dist = readDistribution('cd_dist')

    # get coding density for windows
    seqs = readFasta(fastaFile)
    windowSize = self.options.cd_window_size

    data = []
    seqLens = []
    for seqId, seq in seqs.items():
        seqLen = len(seq)
        seqLens.append(seqLen)

        # NOTE(review): a window ending exactly at the end of the sequence
        # is not evaluated (end < seqLen); kept as in the original.
        start, end = 0, windowSize
        while end < seqLen:
            codingBases = prodigalParser.codingBases(seqId, start, end)
            a, c, g, t = baseCount(seq[start:end])

            # skip windows for which no A, C, G or T was counted (e.g. a
            # long stretch of N's) instead of dividing by zero
            numBases = a + c + g + t
            if numBases > 0:
                data.append(float(codingBases) / numBases)

            start, end = end, end + windowSize

    if len(data) == 0:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % windowSize)
        return

    # Histogram plot
    # Bin edges are obtained by multiplication rather than repeated
    # addition: accumulating the width can overshoot 1.0 through rounding
    # error (e.g. for a width of 0.01), which dropped the final edge and
    # with it every window falling into the last bin.
    binWidth = self.options.cd_bin_width
    numBins = int(1.0 / binWidth + 1e-9)
    bins = [i * binWidth for i in range(numBins + 1)]

    # NOTE(review): newer matplotlib releases replace `normed` with
    # `density` -- confirm which matplotlib versions must be supported.
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% coding density')
    axesHist.set_ylabel('% windows (' + str(windowSize) + ' bp)')

    # Prettify plot: ticks on the left/bottom only, coloured like the axes
    for axis in (axesHist.yaxis, axesHist.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get CD bin statistics
    binTools = BinTools()
    meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

    # Delta-CD vs sequence length plot
    axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap='gray_r')
    axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
    axesDeltaCD.set_ylabel('Sequence length (kbp)')

    # remember the limits chosen for the sequences so that the reference
    # curves drawn below cannot change them
    _, yMaxSeqs = axesDeltaCD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # list(...) keeps the key lookups working with Python 3 dict views
        closestCD = findNearest(np.array(list(dist.keys())), meanCD)
        refDist = dist[closestCD]

        # find closest distribution values
        sampleSeqLen = list(refDist.keys())[0]
        d = refDist[sampleSeqLen]
        cdLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
        cdUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for refLen in refDist:
            xL.append(refDist[refLen][cdLowerBoundKey])
            xU.append(refDist[refLen][cdUpperBoundKey])
            y.append(refLen)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaCD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp ('.0' is stripped so that
    # whole numbers are shown without a trailing zero)
    yticks = axesDeltaCD.get_yticks()
    kbpLabels = [('%.1f' % (float(tick) / 1000)).replace('.0', '') for tick in yticks]
    axesDeltaCD.set_yticklabels(kbpLabels)

    # Prettify plot
    for axis in (axesDeltaCD.yaxis, axesDeltaCD.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesDeltaCD.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
    """Plot the GC content of a bin on the supplied axes.

    A histogram of per-window GC is drawn on axesHist, and a scatter plot
    of per-sequence delta-GC versus sequence length is drawn on
    axesDeltaGC together with bounds taken from the 'gc_dist' reference
    distribution.

    Parameters:
      fastaFile           - FASTA file of the bin.
      distributionsToPlot - iterable of values; each selects the reference
                            bounds nearest to (100 - value) / 2 and
                            (100 + value) / 2 (presumably percentages --
                            confirm against the caller).
      axesHist            - axes receiving the histogram.
      axesDeltaGC         - axes receiving the scatter plot.

    If no window could be evaluated, an error label is set on axesHist
    and nothing else is drawn.
    """
    # Read reference distributions from file
    dist = readDistribution('gc_dist')

    # get GC for windows
    seqs = readFasta(fastaFile)
    windowSize = self.options.gc_window_size

    data = []
    seqLens = []
    for seq in seqs.values():
        seqLen = len(seq)
        seqLens.append(seqLen)

        # NOTE(review): a window ending exactly at the end of the sequence
        # is not evaluated (end < seqLen); kept as in the original.
        start, end = 0, windowSize
        while end < seqLen:
            a, c, g, t = baseCount(seq[start:end])

            # it is possible to reach a long stretch of N's, leaving no
            # bases to divide by; such windows are skipped explicitly
            # rather than by swallowing every exception
            numBases = a + c + g + t
            if numBases > 0:
                data.append(float(g + c) / numBases)

            start, end = end, end + windowSize

    if len(data) == 0:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % windowSize)
        return

    # Histogram plot
    # Bin edges are obtained by multiplication rather than repeated
    # addition: accumulating the width can overshoot 1.0 through rounding
    # error (e.g. for a width of 0.01), which dropped the final edge and
    # with it every window falling into the last bin.
    binWidth = self.options.gc_bin_width
    numBins = int(1.0 / binWidth + 1e-9)
    bins = [i * binWidth for i in range(numBins + 1)]

    # NOTE(review): newer matplotlib releases replace `normed` with
    # `density` -- confirm which matplotlib versions must be supported.
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% GC')
    axesHist.set_ylabel('% windows (' + str(windowSize) + ' bp)')

    # Prettify plot: ticks on the left/bottom only, coloured like the axes
    for axis in (axesHist.yaxis, axesHist.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get GC bin statistics
    binTools = BinTools()
    meanGC, deltaGCs, _ = binTools.gcDist(seqs)

    # Delta-GC vs Sequence length plot
    axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(deltaGCs), s=10, lw=0.5, cmap=pylab.cm.Greys)
    axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100))
    axesDeltaGC.set_ylabel('Sequence length (kbp)')

    # remember the limits chosen for the sequences so that the reference
    # curves drawn below cannot change them
    _, yMaxSeqs = axesDeltaGC.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # list(...) keeps the key lookups working with Python 3 dict views
        closestGC = findNearest(np.array(list(dist.keys())), meanGC)
        refDist = dist[closestGC]

        # find closest distribution values
        sampleSeqLen = list(refDist.keys())[0]
        d = refDist[sampleSeqLen]
        gcLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
        gcUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for refLen in refDist:
            xL.append(refDist[refLen][gcLowerBoundKey])
            xU.append(refDist[refLen][gcUpperBoundKey])
            y.append(refLen)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaGC.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaGC.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp ('.0' is stripped so that
    # whole numbers are shown without a trailing zero)
    yticks = axesDeltaGC.get_yticks()
    kbpLabels = [('%.1f' % (float(tick) / 1000)).replace('.0', '') for tick in yticks]
    axesDeltaGC.set_yticklabels(kbpLabels)

    # Prettify plot
    for axis in (axesDeltaGC.yaxis, axesDeltaGC.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesDeltaGC.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDeltaTD):
    """Plot the tetranucleotide distance (TD) of a bin on the supplied axes.

    A histogram of the distance between each window's tetranucleotide
    signature and the bin signature is drawn on axesHist, and a scatter
    plot of per-sequence delta-TD versus sequence length is drawn on
    axesDeltaTD together with bounds taken from the "td_dist" reference
    distribution.

    Parameters:
      fastaFile           - FASTA file of the bin.
      tetraSigs           - tetranucleotide signatures passed on to
                            BinTools.binTetraSig and BinTools.tetraDiffDist
                            (presumably keyed by sequence id -- confirm).
      distributionsToPlot - iterable of values; each selects the nearest
                            key of the reference distribution to plot.
      axesHist            - axes receiving the histogram.
      axesDeltaTD         - axes receiving the scatter plot.

    If no sequence yields a window, an error label is set on axesHist and
    nothing else is drawn.
    """
    # Read reference distributions from file
    dist = readDistribution("td_dist")

    # get tetranucleotide signature for bin
    seqs = readFasta(fastaFile)
    binTools = BinTools()
    binSig = binTools.binTetraSig(seqs, tetraSigs)

    # get tetranucleotide distances for windows
    genomicSig = GenomicSignatures(K=4, threads=1)
    windowSize = self.options.td_window_size

    data = []
    seqLens = []
    # The per-sequence distances formerly gathered in this loop were always
    # overwritten by tetraDiffDist() below before being used, so they are
    # no longer computed here.
    for seq in seqs.values():
        seqLen = len(seq)
        seqLens.append(seqLen)

        # NOTE(review): a window ending exactly at the end of the sequence
        # is not evaluated (end < seqLen); kept as in the original.
        start, end = 0, windowSize
        while end < seqLen:
            windowSig = genomicSig.seqSignature(seq[start:end])
            data.append(genomicSig.distance(windowSig, binSig))

            start, end = end, end + windowSize

    if len(data) == 0:
        axesHist.set_xlabel("[Error] No seqs >= %d, the specified window size" % windowSize)
        return

    # Histogram plot
    # Bin edges are obtained by multiplication rather than repeated
    # addition: accumulating the width can overshoot 1.0 through rounding
    # error (e.g. for a width of 0.01), which dropped the final edge and
    # with it every window falling into the last bin.
    binWidth = self.options.td_bin_width
    numBins = int(1.0 / binWidth + 1e-9)
    bins = [i * binWidth for i in range(numBins + 1)]

    # NOTE(review): newer matplotlib releases replace `normed` with
    # `density` -- confirm which matplotlib versions must be supported.
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel(r"$\Delta$ TD")
    axesHist.set_ylabel("% windows (" + str(windowSize) + " bp)")

    # Prettify plot: ticks on the left/bottom only, coloured like the axes
    for axis in (axesHist.yaxis, axesHist.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesHist.spines.items():
        if loc in ["right", "top"]:
            spine.set_color("none")
        else:
            spine.set_color(self.axesColour)

    # get TD bin statistics
    meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

    # Delta-TD vs Sequence length plot
    axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(deltaTDs), s=10, lw=0.5, cmap="gray_r")
    axesDeltaTD.set_xlabel(r"$\Delta$ TD (mean TD = %.2f)" % meanTD)
    axesDeltaTD.set_ylabel("Sequence length (kbp)")

    # remember the limits chosen for the sequences so that the reference
    # curves drawn below cannot change them
    _, yMaxSeqs = axesDeltaTD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaTD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # list(...) keeps the key lookups working with Python 3 dict views
        firstKey = list(dist.keys())[0]
        boundKey = findNearest(list(dist[firstKey].keys()), distToPlot)

        x = []
        y = []
        for refLen in dist:
            x.append(dist[refLen][boundKey])
            y.append(refLen)

        # sort by y-values
        sortIndexY = np.argsort(y)
        x = np.array(x)[sortIndexY]
        y = np.array(y)[sortIndexY]

        # make sure x-values never increase as y increases
        # as this is conservative and visually satisfying
        for i in range(0, len(x) - 1):
            for j in range(i + 1, len(x)):
                if x[j] > x[i]:
                    if j == len(x) - 1:
                        x[j] = x[i]
                    else:
                        # interpolate values from neighbours
                        x[j] = (x[j - 1] + x[j + 1]) / 2

                    # the interpolated value may still exceed x[i]
                    if x[j] > x[i]:
                        x[j] = x[i]

        axesDeltaTD.plot(x, y, "r--", lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaTD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaTD.vlines(0, 0, yMaxSeqs, linestyle="dashed", color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp (".0" is stripped so that
    # whole numbers are shown without a trailing zero)
    yticks = axesDeltaTD.get_yticks()
    kbpLabels = [("%.1f" % (float(tick) / 1000)).replace(".0", "") for tick in yticks]
    axesDeltaTD.set_yticklabels(kbpLabels)

    # Prettify plot
    for axis in (axesDeltaTD.yaxis, axesDeltaTD.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesDeltaTD.spines.items():
        if loc in ["right", "top"]:
            spine.set_color("none")
        else:
            spine.set_color(self.axesColour)
def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, reportType, outputFile):
    """Identify sequences that are outliers.

    For every bin, each sequence is compared against the GC, coding
    density (CD) and tetranucleotide distance (TD) reference
    distributions, and a tab-separated report of outlying sequences is
    written to outputFile.

    Parameters:
      outDir           - directory holding bins/<bin id>/ with the
                         Prodigal gene feature file of each bin.
      binFiles         - FASTA files of the bins to process.
      tetraProfileFile - file read with GenomicSignatures.read() to obtain
                         the tetranucleotide signatures.
      distribution     - value selecting the reference bounds: GC and CD
                         use the keys nearest to (100 -/+ distribution) / 2,
                         TD the key nearest to distribution itself
                         (reported as a percentage in the header).
      reportType       - 'any' reports a sequence outlying in at least one
                         distribution, 'all' only sequences outlying in
                         all three; any other value reports nothing.
      outputFile       - path of the report to write.

    Logs an error and exits the program with status 1 if the gene feature
    file of a bin is missing.
    """
    self.logger.info('Reading reference distributions.')
    gcBounds = readDistribution('gc_dist')
    cdBounds = readDistribution('cd_dist')
    tdBounds = readDistribution('td_dist')

    # the context manager closes the report even when sys.exit() or an
    # exception interrupts processing
    with open(outputFile, 'w') as fout:
        fout.write('Bin Id\tSequence Id\tSequence length\tOutlying distributions')
        fout.write('\tSequence GC\tMean bin GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (distribution, distribution))
        fout.write('\tSequence CD\tMean bin CD\tLower CD bound (%s%%)' % distribution)
        fout.write('\tSequence TD\tMean bin TD\tUpper TD bound (%s%%)\n' % distribution)

        for processedBins, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)

            self.logger.info('Finding outliers in %s (%d of %d).' % (binId, processedBins, len(binFiles)))

            seqs = readFasta(binFile)

            meanGC, deltaGCs, seqGC = self.gcDist(seqs)

            # NOTE(review): the tetranucleotide profile is re-read for
            # every bin; it looks loop-invariant, but confirm that
            # GenomicSignatures.read() is stateless before hoisting it.
            genomicSig = GenomicSignatures(K=4, threads=1)
            tetraSigs = genomicSig.read(tetraProfileFile)
            binSig = self.binTetraSig(seqs, tetraSigs)
            meanTD, deltaTDs = self.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

            gffFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_GFF)
            if not os.path.exists(gffFile):
                self.logger.error('Missing gene feature file (%s). This plot if not compatible with the --genes option.\n' % DefaultValues.PRODIGAL_GFF)
                sys.exit(1)

            prodigalParser = ProdigalGeneFeatureParser(gffFile)
            meanCD, deltaCDs, CDs = self.codingDensityDist(seqs, prodigalParser)

            # find keys into GC and CD distributions
            closestGC = findNearest(np.array(list(gcBounds.keys())), meanGC)
            gcRef = gcBounds[closestGC]
            sampleSeqLen = list(gcRef.keys())[0]
            d = gcRef[sampleSeqLen]
            gcLowerBoundKey = findNearest(list(d.keys()), (100 - distribution) / 2.0)
            gcUpperBoundKey = findNearest(list(d.keys()), (100 + distribution) / 2.0)

            closestCD = findNearest(np.array(list(cdBounds.keys())), meanCD)
            cdRef = cdBounds[closestCD]
            sampleSeqLen = list(cdRef.keys())[0]
            d = cdRef[sampleSeqLen]
            cdLowerBoundKey = findNearest(list(d.keys()), (100 - distribution) / 2.0)

            tdBoundKey = findNearest(list(tdBounds[list(tdBounds.keys())[0]].keys()), distribution)

            # sequence lengths available in each reference distribution;
            # constant for the whole bin, so built once
            gcSeqLens = list(gcRef.keys())
            cdSeqLens = list(cdRef.keys())
            tdSeqLens = list(tdBounds.keys())

            # index follows the iteration order of seqs, which is how the
            # per-sequence statistics above are indexed
            for index, (seqId, seq) in enumerate(seqs.items()):
                seqLen = len(seq)

                # find GC, CD, and TD bounds
                closestSeqLen = findNearest(gcSeqLens, seqLen)
                gcLowerBound = gcRef[closestSeqLen][gcLowerBoundKey]
                gcUpperBound = gcRef[closestSeqLen][gcUpperBoundKey]

                closestSeqLen = findNearest(cdSeqLens, seqLen)
                cdLowerBound = cdRef[closestSeqLen][cdLowerBoundKey]

                closestSeqLen = findNearest(tdSeqLens, seqLen)
                tdBound = tdBounds[closestSeqLen][tdBoundKey]

                outlyingDists = []
                if deltaGCs[index] < gcLowerBound or deltaGCs[index] > gcUpperBound:
                    outlyingDists.append('GC')

                # only an unusually low coding density is flagged
                if deltaCDs[index] < cdLowerBound:
                    outlyingDists.append('CD')

                # only an unusually large tetranucleotide distance is flagged
                if deltaTDs[index] > tdBound:
                    outlyingDists.append('TD')

                numOutlying = len(outlyingDists)
                if (reportType == 'any' and numOutlying >= 1) or (reportType == 'all' and numOutlying == 3):
                    fout.write(binId + '\t' + seqId + '\t%d' % seqLen + '\t' + ','.join(outlyingDists))
                    # bounds are deltas relative to the bin mean, so the
                    # mean is added back before reporting
                    fout.write('\t%.1f\t%.1f\t%.1f\t%.1f' % (seqGC[index] * 100, meanGC * 100, (meanGC + gcLowerBound) * 100, (meanGC + gcUpperBound) * 100))
                    fout.write('\t%.1f\t%.1f\t%.1f' % (CDs[index] * 100, meanCD * 100, (meanCD + cdLowerBound) * 100))
                    fout.write('\t%.3f\t%.3f\t%.3f' % (deltaTDs[index], meanTD, tdBound) + '\n')