def calculateGC(self, seqs, seqStats):
    """Calculate fraction of nucleotides that are G or C.

    Args:
        seqs: mapping of sequence id -> sequence (anything accepted by
            ``baseCount`` and ``len``).
        seqStats: mapping of sequence id -> dict; the GC fraction of each
            sequence is stored under its ``'GC'`` key.

    Returns:
        Tuple ``(GC, stdGC)``: the GC fraction over the pooled A/C/G/T
        counts of all sequences, and the square root of the mean squared
        deviation of per-sequence GC from that pooled value. Only sequences
        longer than ``DefaultValues.MIN_SEQ_LEN_GC_STD`` contribute to the
        deviation, and it is 0 unless at least two such sequences exist.
    """
    totalGC = 0
    totalAT = 0
    gcPerSeq = []
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for seqId, seq in seqs.items():
        a, c, g, t = baseCount(seq)

        gc = g + c
        at = a + t

        totalGC += gc
        totalAT += at

        # sequences without any A/C/G/T are reported with a GC of 0
        if (gc + at) > 0:
            gcContent = float(gc) / (gc + at)
        else:
            gcContent = 0.0

        seqStats[seqId]['GC'] = gcContent

        if len(seq) > DefaultValues.MIN_SEQ_LEN_GC_STD:
            gcPerSeq.append(gcContent)

    if (totalGC + totalAT) > 0:
        GC = float(totalGC) / (totalGC + totalAT)
    else:
        GC = 0.0

    varGC = 0
    if len(gcPerSeq) > 1:
        # a concrete list is required: on Python 3 map() returns an iterator
        varGC = mean([(x - GC) ** 2 for x in gcPerSeq])

    return GC, math.sqrt(varGC)
def calculateGC(self, seqs, seqStats=None):
    """Calculate fraction of nucleotides that are G or C.

    Args:
        seqs: mapping of sequence id -> sequence (anything accepted by
            ``baseCount`` and ``len``).
        seqStats: optional mapping of sequence id -> dict. When truthy, the
            GC fraction of each sequence is stored under its ``'GC'`` key.

    Returns:
        Tuple ``(GC, stdGC)``: the GC fraction over the pooled A/C/G/T
        counts, and the square root of the mean squared deviation of
        per-sequence GC from the pooled value (sequences longer than
        ``DefaultValues.MIN_SEQ_LEN_GC_STD`` only; 0 with fewer than two).
    """
    pooledGC = 0
    pooledAT = 0
    longSeqGCs = []

    for seqId, seq in seqs.items():
        numA, numC, numG, numT = baseCount(seq)
        seqGC = numG + numC
        seqAT = numA + numT
        pooledGC += seqGC
        pooledAT += seqAT

        # a sequence without any A/C/G/T gets a GC of 0
        seqTotal = seqGC + seqAT
        gcContent = float(seqGC) / seqTotal if seqTotal > 0 else 0.0

        if seqStats:
            seqStats[seqId]['GC'] = gcContent

        if len(seq) > DefaultValues.MIN_SEQ_LEN_GC_STD:
            longSeqGCs.append(gcContent)

    pooledTotal = pooledGC + pooledAT
    GC = float(pooledGC) / pooledTotal if pooledTotal > 0 else 0.0

    varGC = 0
    if len(longSeqGCs) > 1:
        varGC = mean([(gcFrac - GC) ** 2 for gcFrac in longSeqGCs])

    return GC, math.sqrt(varGC)
def testBaseCount(self):
    """Verify computation of base count on mixed-case sequence."""
    a, c, g, t = baseCount('ACGTacgtNnUu')

    # checked in a, c, g, t order; the expected t count is 4 while the
    # other three bases are expected to be 2
    observedVsExpected = ((a, 2), (c, 2), (g, 2), (t, 4))
    for observed, expected in observedVsExpected:
        self.assertEqual(observed, expected)
def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen):
    """Write sequences that are not in any bin, plus their length and GC.

    Args:
        binFiles: iterable of FASTA files, one per bin (read with
            ``readFasta``).
        seqFile: FASTA file with all sequences; must exist.
        outSeqFile: path that receives the unbinned sequences in FASTA form.
        outStatsFile: path that receives a tab-separated table with the
            columns ``Sequence Id``, ``Length`` and ``GC`` (percent).
        minSeqLen: minimum length an unbinned sequence needs to be reported.
    """
    checkFileExists(seqFile)

    # get list of sequences in bins
    self.logger.info(' Reading binned sequences.')

    binnedSeqs = {}
    totalBinnedBases = 0
    for binFile in binFiles:
        seqs = readFasta(binFile)
        binnedSeqs.update(seqs)
        totalBinnedBases += sum(len(seq) for seq in seqs.values())

    self.logger.info(' Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6))

    # get list of all sequences
    self.logger.info(' Reading all sequences.')
    allSeqs = readFasta(seqFile)
    totalBases = sum(len(seq) for seq in allSeqs.values())
    self.logger.info(' Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6))

    # write all unbinned sequences
    self.logger.info(' Identifying unbinned sequences >= %d bp.' % minSeqLen)

    unbinnedCount = 0
    unbinnedBases = 0
    # context managers close both files even if writing fails part-way
    with open(outSeqFile, 'w') as seqOut, open(outStatsFile, 'w') as statsOut:
        statsOut.write('Sequence Id\tLength\tGC\n')

        # .items() works on both Python 2 and 3
        for seqId, seq in allSeqs.items():
            if seqId in binnedSeqs or len(seq) < minSeqLen:
                continue

            unbinnedCount += 1
            seqOut.write('>' + seqId + '\n')
            seqOut.write(seq + '\n')
            unbinnedBases += len(seq)

            a, c, g, t = baseCount(seq)
            numBases = a + c + g + t
            # a sequence without any A/C/G/T is reported with 0% GC
            # instead of raising ZeroDivisionError
            gcPercent = float(g + c) * 100 / numBases if numBases > 0 else 0.0
            statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), gcPercent))

    self.logger.info(' Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6))

    # an empty input file yields 0% rather than a division by zero
    percUnbinnedSeqs = unbinnedCount * 100.0 / len(allSeqs) if allSeqs else 0.0
    percUnbinnedBases = unbinnedBases * 100.0 / totalBases if totalBases > 0 else 0.0

    self.logger.info('')
    self.logger.info(' Percentage of unbinned sequences: %.2f%%' % percUnbinnedSeqs)
    self.logger.info(' Percentage of unbinned bases: %.2f%%' % percUnbinnedBases)
def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen):
    """Write sequences that are not in any bin, plus their length and GC.

    Args:
        binFiles: iterable of FASTA files, one per bin (read with
            ``readFasta``).
        seqFile: FASTA file with all sequences; must exist.
        outSeqFile: path that receives the unbinned sequences in FASTA form.
        outStatsFile: path that receives a tab-separated table with the
            columns ``Sequence Id``, ``Length`` and ``GC`` (percent).
        minSeqLen: minimum length an unbinned sequence needs to be reported.
    """
    checkFileExists(seqFile)

    # get list of sequences in bins
    self.logger.info('Reading binned sequences.')

    binnedSeqs = {}
    totalBinnedBases = 0
    for binFile in binFiles:
        seqs = readFasta(binFile)
        binnedSeqs.update(seqs)
        totalBinnedBases += sum(len(seq) for seq in seqs.values())

    self.logger.info(' Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6))

    # get list of all sequences
    self.logger.info('Reading all sequences.')
    allSeqs = readFasta(seqFile)
    totalBases = sum(len(seq) for seq in allSeqs.values())
    self.logger.info(' Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6))

    # write all unbinned sequences
    self.logger.info('Identifying unbinned sequences >= %d bp.' % minSeqLen)

    unbinnedCount = 0
    unbinnedBases = 0
    # context managers close both files even if writing fails part-way
    with open(outSeqFile, 'w') as seqOut, open(outStatsFile, 'w') as statsOut:
        statsOut.write('Sequence Id\tLength\tGC\n')

        # .items() works on both Python 2 and 3
        for seqId, seq in allSeqs.items():
            if seqId in binnedSeqs or len(seq) < minSeqLen:
                continue

            unbinnedCount += 1
            seqOut.write('>' + seqId + '\n')
            seqOut.write(seq + '\n')
            unbinnedBases += len(seq)

            a, c, g, t = baseCount(seq)
            numBases = a + c + g + t
            # a sequence without any A/C/G/T is reported with 0% GC
            # instead of raising ZeroDivisionError
            gcPercent = float(g + c) * 100 / numBases if numBases > 0 else 0.0
            statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), gcPercent))

    self.logger.info(' Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6))

    # an empty input file yields 0% rather than a division by zero
    percUnbinnedSeqs = unbinnedCount * 100.0 / len(allSeqs) if allSeqs else 0.0
    percUnbinnedBases = unbinnedBases * 100.0 / totalBases if totalBases > 0 else 0.0

    self.logger.info('Percentage of unbinned sequences: %.2f%%' % percUnbinnedSeqs)
    self.logger.info('Percentage of unbinned bases: %.2f%%' % percUnbinnedBases)
def gcDist(self, seqs):
    """GC statistics for bin.

    Args:
        seqs: mapping of sequence id -> sequence (passed to ``baseCount``).

    Returns:
        Tuple ``(meanGC, deltaGCs, GCs)``: the GC fraction over the pooled
        A/C/G/T counts of all sequences, a numpy array with each sequence's
        GC minus ``meanGC``, and the list of per-sequence GC fractions (in
        the iteration order of ``seqs``).
    """
    GCs = []
    gcTotal = 0
    basesTotal = 0
    for seq in seqs.values():
        a, c, g, t = baseCount(seq)
        gc = g + c
        bases = a + c + g + t

        # a sequence without any A/C/G/T gets a GC of 0 instead of raising
        # ZeroDivisionError
        GCs.append(float(gc) / bases if bases > 0 else 0.0)

        gcTotal += gc
        basesTotal += bases

    # empty input (or input without countable bases) has a mean GC of 0
    meanGC = float(gcTotal) / basesTotal if basesTotal > 0 else 0.0
    deltaGCs = np.array(GCs) - meanGC

    return meanGC, deltaGCs, GCs
def gcDist(self, seqs):
    """GC statistics for bin.

    Args:
        seqs: mapping of sequence id -> sequence (passed to ``baseCount``).

    Returns:
        Tuple ``(meanGC, deltaGCs, GCs)``: the GC fraction over the pooled
        A/C/G/T counts of all sequences, a numpy array with each sequence's
        GC minus ``meanGC``, and the list of per-sequence GC fractions (in
        the iteration order of ``seqs``).
    """
    GCs = []
    gcTotal = 0
    basesTotal = 0
    # .values() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for seq in seqs.values():
        a, c, g, t = baseCount(seq)
        gc = g + c
        bases = a + c + g + t

        # a sequence without any A/C/G/T gets a GC of 0 instead of raising
        # ZeroDivisionError
        GCs.append(float(gc) / bases if bases > 0 else 0.0)

        gcTotal += gc
        basesTotal += bases

    # empty input (or input without countable bases) has a mean GC of 0
    meanGC = float(gcTotal) / basesTotal if basesTotal > 0 else 0.0
    deltaGCs = np.array(GCs) - meanGC

    return meanGC, deltaGCs, GCs
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
    """Plot a windowed GC histogram and a delta-GC vs. length scatter plot.

    Args:
        fastaFile: FASTA file read with ``readFasta``.
        distributionsToPlot: iterable of percentages; for each value ``p``
            the reference-distribution entries nearest to ``(100 - p) / 2``
            and ``(100 + p) / 2`` are drawn as dashed red lines.
        axesHist: matplotlib axes receiving the histogram of window GC.
        axesDeltaGC: matplotlib axes receiving the scatter plot of
            per-sequence GC deviation against sequence length.
    """
    # Read reference distributions from file
    dist = readDistribution('gc_dist')

    # get GC for windows
    seqs = readFasta(fastaFile)

    data = []
    seqLens = []
    # .values() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for seq in seqs.values():
        start = 0
        end = self.options.gc_window_size

        seqLen = len(seq)
        seqLens.append(seqLen)

        while end < seqLen:
            a, c, g, t = baseCount(seq[start:end])
            numBases = a + c + g + t
            # a long stretch of N's leaves no countable bases in the
            # window; such windows are skipped (explicit check instead of
            # a bare except that would hide unrelated errors)
            if numBases > 0:
                data.append(float(g + c) / numBases)

            start = end
            end += self.options.gc_window_size

    if len(data) == 0:
        axesHist.set_xlabel(
            '[Error] No seqs >= %d, the specified window size' % self.options.gc_window_size)
        return

    # Histogram plot
    bins = [0.0]
    binWidth = self.options.gc_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    # NOTE(review): `normed` is the legacy spelling of `density`; newer
    # matplotlib releases may reject it - confirm against the version in use
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% GC')
    axesHist.set_ylabel('% windows (' + str(self.options.gc_window_size) + ' bp)')

    # Prettify plot
    for axis in (axesHist.yaxis, axesHist.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (axesHist.yaxis, axesHist.xaxis):
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get GC bin statistics
    binTools = BinTools()
    meanGC, deltaGCs, _ = binTools.gcDist(seqs)

    # Delta-GC vs Sequence length plot
    axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(deltaGCs), s=10, lw=0.5, cmap='gray_r')
    axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100))
    axesDeltaGC.set_ylabel('Sequence length (kbp)')

    _, yMaxSeqs = axesDeltaGC.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # list(...) is needed on Python 3, where keys() is a view
        closestGC = findNearest(np.array(list(dist.keys())), meanGC)

        # find closest distribution values
        sampleSeqLen = next(iter(dist[closestGC]))
        d = dist[closestGC][sampleSeqLen]
        boundKeys = list(d.keys())
        gcLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
        gcUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for windowSize in dist[closestGC]:
            xL.append(dist[closestGC][windowSize][gcLowerBoundKey])
            xU.append(dist[closestGC][windowSize][gcUpperBoundKey])
            y.append(windowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaGC.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaGC.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    yticks = axesDeltaGC.get_yticks()
    kbpLabels = []
    for seqLen in yticks:
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaGC.set_yticklabels(kbpLabels)

    # Prettify plot
    for axis in (axesDeltaGC.yaxis, axesDeltaGC.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (axesDeltaGC.yaxis, axesDeltaGC.xaxis):
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesDeltaGC.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
    """Plot a windowed coding-density histogram and a delta-CD scatter plot.

    Exits the program (``sys.exit()``) when the Prodigal gene feature file
    for the bin is missing.

    Args:
        fastaFile: FASTA file of the bin, read with ``readFasta``.
        distributionsToPlot: iterable of percentages; for each value ``p``
            the reference-distribution entries nearest to ``(100 - p) / 2``
            and ``(100 + p) / 2`` are drawn as dashed red lines.
        axesHist: matplotlib axes receiving the histogram of window coding
            density.
        axesDeltaCD: matplotlib axes receiving the scatter plot of
            per-sequence coding-density deviation against sequence length.
    """
    # parse Prodigal output
    gffFile = os.path.join(self.options.out_folder, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffFile):
        # parenthesised form prints the same text on Python 2 and 3
        print('Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
        sys.exit()

    prodigalParser = ProdigalGeneFeatureParser(gffFile)

    # Read reference distributions from file
    dist = readDistribution('cd_dist')

    # get coding density for windows
    seqs = readFasta(fastaFile)

    data = []
    seqLens = []
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for seqId, seq in seqs.items():
        start = 0
        end = self.options.cd_window_size

        seqLen = len(seq)
        seqLens.append(seqLen)

        while end < seqLen:
            codingBases = prodigalParser.codingBases(seqId, start, end)

            a, c, g, t = baseCount(seq[start:end])
            numBases = a + c + g + t
            # a window without any A/C/G/T (e.g. a stretch of N's) is
            # skipped instead of raising ZeroDivisionError
            if numBases > 0:
                data.append(float(codingBases) / numBases)

            start = end
            end += self.options.cd_window_size

    if len(data) == 0:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
        return

    # Histogram plot
    bins = [0.0]
    binWidth = self.options.cd_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    # NOTE(review): `normed` is the legacy spelling of `density`; newer
    # matplotlib releases may reject it - confirm against the version in use
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% coding density')
    axesHist.set_ylabel('% windows (' + str(self.options.cd_window_size) + ' bp)')

    # Prettify plot
    for axis in (axesHist.yaxis, axesHist.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (axesHist.yaxis, axesHist.xaxis):
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get CD bin statistics
    binTools = BinTools()
    meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

    # Delta-CD vs sequence length plot
    axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap=pylab.cm.Greys)
    axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
    axesDeltaCD.set_ylabel('Sequence length (kbp)')

    _, yMaxSeqs = axesDeltaCD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # list(...) is needed on Python 3, where keys() is a view
        closestCD = findNearest(np.array(list(dist.keys())), meanCD)

        # find closest distribution values
        sampleSeqLen = next(iter(dist[closestCD]))
        d = dist[closestCD][sampleSeqLen]
        boundKeys = list(d.keys())
        cdLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
        cdUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for windowSize in dist[closestCD]:
            xL.append(dist[closestCD][windowSize][cdLowerBoundKey])
            xU.append(dist[closestCD][windowSize][cdUpperBoundKey])
            y.append(windowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaCD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    yticks = axesDeltaCD.get_yticks()
    kbpLabels = []
    for seqLen in yticks:
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaCD.set_yticklabels(kbpLabels)

    # Prettify plot
    for axis in (axesDeltaCD.yaxis, axesDeltaCD.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (axesDeltaCD.yaxis, axesDeltaCD.xaxis):
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesDeltaCD.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
    """Plot a windowed coding-density histogram and a delta-CD scatter plot.

    Logs an error and exits the program (``sys.exit()``) when the Prodigal
    gene feature file for the bin is missing.

    Args:
        fastaFile: FASTA file of the bin, read with ``readFasta``.
        distributionsToPlot: iterable of percentages; for each value ``p``
            the reference-distribution entries nearest to ``(100 - p) / 2``
            and ``(100 + p) / 2`` are drawn as dashed red lines.
        axesHist: matplotlib axes receiving the histogram of window coding
            density.
        axesDeltaCD: matplotlib axes receiving the scatter plot of
            per-sequence coding-density deviation against sequence length.
    """
    # parse Prodigal output
    gffFile = os.path.join(self.options.results_dir, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffFile):
        self.logger.error('Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
        sys.exit()

    prodigalParser = ProdigalGeneFeatureParser(gffFile)

    # Read reference distributions from file
    dist = readDistribution('cd_dist')

    # get coding density for windows
    seqs = readFasta(fastaFile)

    data = []
    seqLens = []
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for seqId, seq in seqs.items():
        start = 0
        end = self.options.cd_window_size

        seqLen = len(seq)
        seqLens.append(seqLen)

        while end < seqLen:
            codingBases = prodigalParser.codingBases(seqId, start, end)

            a, c, g, t = baseCount(seq[start:end])
            numBases = a + c + g + t
            # a window without any A/C/G/T (e.g. a stretch of N's) is
            # skipped instead of raising ZeroDivisionError
            if numBases > 0:
                data.append(float(codingBases) / numBases)

            start = end
            end += self.options.cd_window_size

    if len(data) == 0:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
        return

    # Histogram plot
    bins = [0.0]
    binWidth = self.options.cd_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    # NOTE(review): `normed` is the legacy spelling of `density`; newer
    # matplotlib releases may reject it - confirm against the version in use
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% coding density')
    axesHist.set_ylabel('% windows (' + str(self.options.cd_window_size) + ' bp)')

    # Prettify plot
    for axis in (axesHist.yaxis, axesHist.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (axesHist.yaxis, axesHist.xaxis):
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get CD bin statistics
    binTools = BinTools()
    meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

    # Delta-CD vs sequence length plot
    axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap='gray_r')
    axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
    axesDeltaCD.set_ylabel('Sequence length (kbp)')

    _, yMaxSeqs = axesDeltaCD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # list(...) is needed on Python 3, where keys() is a view
        closestCD = findNearest(np.array(list(dist.keys())), meanCD)

        # find closest distribution values
        sampleSeqLen = next(iter(dist[closestCD]))
        d = dist[closestCD][sampleSeqLen]
        boundKeys = list(d.keys())
        cdLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
        cdUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for windowSize in dist[closestCD]:
            xL.append(dist[closestCD][windowSize][cdLowerBoundKey])
            xU.append(dist[closestCD][windowSize][cdUpperBoundKey])
            y.append(windowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaCD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    yticks = axesDeltaCD.get_yticks()
    kbpLabels = []
    for seqLen in yticks:
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaCD.set_yticklabels(kbpLabels)

    # Prettify plot
    for axis in (axesDeltaCD.yaxis, axesDeltaCD.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (axesDeltaCD.yaxis, axesDeltaCD.xaxis):
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesDeltaCD.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def plotOnAxes(self, binFile, coverageProfile, windowAxes, seqAxes):
    """Plot GC against coverage for sequence windows and whole sequences.

    Args:
        binFile: FASTA file of the bin, read with ``readFasta``.
        coverageProfile: mapping of sequence id -> indexable whose element 0
            is the coverage of the whole sequence and element 1 the list of
            per-window coverages (expected to line up with the GC windows
            computed here).
        windowAxes: matplotlib axes receiving the per-window scatter plot
            and its linear regression line.
        seqAxes: matplotlib axes receiving the per-sequence scatter plot.
    """
    # get GC for windows
    seqs = readFasta(binFile)

    gcProfile = {}
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for seqId, seq in seqs.items():
        start = 0
        end = self.options.window_size

        windowGCs = []
        while end < len(seq):
            # NOTE(review): a window (or sequence) without any A/C/G/T
            # raises ZeroDivisionError here; windows cannot simply be
            # skipped because they are paired with coverage windows below
            a, c, g, t = baseCount(seq[start:end])
            windowGCs.append(float(g + c) / (a + c + g + t))

            start = end
            end += self.options.window_size

        a, c, g, t = baseCount(seq)
        seqGC = float(g + c) / (a + c + g + t)

        gcProfile[seqId] = [seqGC, windowGCs]

    # plot GC vs coverage for windows
    gc = []
    coverage = []
    for seqId, gcInfo in gcProfile.items():
        gc += gcInfo[1]
        coverage += coverageProfile[seqId][1]

    windowAxes.scatter(gc, coverage, c=abs(array(coverage)), s=10, lw=0.5, cmap=pylab.cm.Greys)
    windowAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc) * 100))
    windowAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))

    # plot linear regression line
    if len(gc) > 1:
        slope, inter = polyfit(gc, coverage, 1)
        fit_fn = poly1d([slope, inter])  # fit_fn is now a function which takes in x and returns an estimate for y
        windowAxes.plot([min(gc), max(gc)], fit_fn([min(gc), max(gc)]), '--r', lw=0.5)
        windowAxes.set_title('GC vs. Coverage\n(window size = %d bp, slope = %.2f)' % (self.options.window_size, slope))
    else:
        # not possible to calculate best fit line
        windowAxes.set_title('GC vs. Coverage\n(window size = %d bp, no best fit line)' % self.options.window_size)

    # Prettify plot
    for axis in (windowAxes.yaxis, windowAxes.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (windowAxes.yaxis, windowAxes.xaxis):
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in windowAxes.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # plot GC vs coverage for entire sequences
    gc = []
    coverage = []
    seqLen = []
    for seqId, gcInfo in gcProfile.items():
        gc.append(gcInfo[0])
        coverage.append(coverageProfile[seqId][0])
        seqLen.append(len(seqs[seqId]))

    # set marker size proportional to sequence length
    markerSize = log(array(seqLen))  # log-scale
    # shift so the shortest sequence maps to 0, then scale by the largest
    # log-length (results lie in [0, 1])
    markerSize = (markerSize - min(markerSize)) / max(markerSize)
    markerSize = markerSize * 200 + 10  # marker sizes start at 10 and do not exceed 210

    seqAxes.scatter(gc, coverage, c=abs(array(coverage)), s=markerSize, lw=0.5, cmap=pylab.cm.Greys)
    seqAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(gc) * 100))
    seqAxes.set_ylabel('Coverage (mean = %.1f)' % mean(coverage))
    seqAxes.set_title('GC vs. Coverage\nIndividual Sequences')

    # Prettify plot
    for axis in (seqAxes.yaxis, seqAxes.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (seqAxes.yaxis, seqAxes.xaxis):
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in seqAxes.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
    """Plot a windowed GC histogram and a delta-GC vs. length scatter plot.

    Args:
        fastaFile: FASTA file read with ``readFasta``.
        distributionsToPlot: iterable of percentages; for each value ``p``
            the reference-distribution entries nearest to ``(100 - p) / 2``
            and ``(100 + p) / 2`` are drawn as dashed red lines.
        axesHist: matplotlib axes receiving the histogram of window GC.
        axesDeltaGC: matplotlib axes receiving the scatter plot of
            per-sequence GC deviation against sequence length.
    """
    # Read reference distributions from file
    dist = readDistribution('gc_dist')

    # get GC for windows
    seqs = readFasta(fastaFile)

    data = []
    seqLens = []
    # .values() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for seq in seqs.values():
        start = 0
        end = self.options.gc_window_size

        seqLen = len(seq)
        seqLens.append(seqLen)

        while end < seqLen:
            a, c, g, t = baseCount(seq[start:end])
            numBases = a + c + g + t
            # a long stretch of N's leaves no countable bases in the
            # window; such windows are skipped (explicit check instead of
            # a bare except that would hide unrelated errors)
            if numBases > 0:
                data.append(float(g + c) / numBases)

            start = end
            end += self.options.gc_window_size

    if len(data) == 0:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.gc_window_size)
        return

    # Histogram plot
    bins = [0.0]
    binWidth = self.options.gc_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    # NOTE(review): `normed` is the legacy spelling of `density`; newer
    # matplotlib releases may reject it - confirm against the version in use
    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% GC')
    axesHist.set_ylabel('% windows (' + str(self.options.gc_window_size) + ' bp)')

    # Prettify plot
    for axis in (axesHist.yaxis, axesHist.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (axesHist.yaxis, axesHist.xaxis):
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesHist.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    # get GC bin statistics
    binTools = BinTools()
    meanGC, deltaGCs, _ = binTools.gcDist(seqs)

    # Delta-GC vs Sequence length plot
    axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(deltaGCs), s=10, lw=0.5, cmap=pylab.cm.Greys)
    axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100))
    axesDeltaGC.set_ylabel('Sequence length (kbp)')

    _, yMaxSeqs = axesDeltaGC.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaGC.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        # list(...) is needed on Python 3, where keys() is a view
        closestGC = findNearest(np.array(list(dist.keys())), meanGC)

        # find closest distribution values
        sampleSeqLen = next(iter(dist[closestGC]))
        d = dist[closestGC][sampleSeqLen]
        boundKeys = list(d.keys())
        gcLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
        gcUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for windowSize in dist[closestGC]:
            xL.append(dist[closestGC][windowSize][gcLowerBoundKey])
            xU.append(dist[closestGC][windowSize][gcUpperBoundKey])
            y.append(windowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaGC.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaGC.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaGC.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaGC.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    yticks = axesDeltaGC.get_yticks()
    kbpLabels = []
    for seqLen in yticks:
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaGC.set_yticklabels(kbpLabels)

    # Prettify plot
    for axis in (axesDeltaGC.yaxis, axesDeltaGC.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (axesDeltaGC.yaxis, axesDeltaGC.xaxis):
        for line in axis.get_ticklines():
            line.set_color(self.axesColour)

    for loc, spine in axesDeltaGC.spines.items():
        if loc in ['right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)
def plotOnAxes(self, binFile, coverageProfile, windowAxes, seqAxes):
    """Plot GC against coverage for sequence windows and whole sequences.

    Args:
        binFile: FASTA file of the bin, read with ``readFasta``.
        coverageProfile: mapping of sequence id -> indexable whose element 0
            is the coverage of the whole sequence and element 1 the list of
            per-window coverages.
        windowAxes: matplotlib axes receiving the per-window scatter plot
            and its linear regression line.
        seqAxes: matplotlib axes receiving the per-sequence scatter plot.
    """
    seqs = readFasta(binFile)

    # sequence id -> [GC of whole sequence, GC of each window]
    gcProfile = {}
    for seqId, seq in seqs.items():
        windowGCs = []
        winStart = 0
        winEnd = self.options.window_size
        while winEnd < len(seq):
            numA, numC, numG, numT = baseCount(seq[winStart:winEnd])
            windowGCs.append(float(numG + numC) / (numA + numC + numG + numT))
            winStart = winEnd
            winEnd += self.options.window_size

        numA, numC, numG, numT = baseCount(seq)
        wholeSeqGC = float(numG + numC) / (numA + numC + numG + numT)
        gcProfile[seqId] = [wholeSeqGC, windowGCs]

    # --- GC vs coverage for windows ---
    winGC = []
    winCoverage = []
    for seqId, gcInfo in gcProfile.items():
        winGC.extend(gcInfo[1])
        winCoverage.extend(coverageProfile[seqId][1])

    windowAxes.scatter(winGC,
                       winCoverage,
                       c=abs(array(winCoverage)),
                       s=10,
                       lw=0.5,
                       cmap='gray_r')
    windowAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(winGC) * 100))
    windowAxes.set_ylabel('Coverage (mean = %.1f)' % mean(winCoverage))

    if len(winGC) <= 1:
        # a best fit line needs at least two points
        windowAxes.set_title(
            'GC vs. Coverage\n(window size = %d bp, no best fit line)' %
            self.options.window_size)
    else:
        # linear regression; fitLine maps x to the estimated y
        slope, inter = polyfit(winGC, winCoverage, 1)
        fitLine = poly1d([slope, inter])
        windowAxes.plot([min(winGC), max(winGC)],
                        fitLine([min(winGC), max(winGC)]),
                        '--r',
                        lw=0.5)
        windowAxes.set_title(
            'GC vs. Coverage\n(window size = %d bp, slope = %.2f)' %
            (self.options.window_size, slope))

    # Prettify plot
    for axis in (windowAxes.yaxis, windowAxes.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (windowAxes.yaxis, windowAxes.xaxis):
        for tickLine in axis.get_ticklines():
            tickLine.set_color(self.axesColour)

    for loc, spine in windowAxes.spines.items():
        spine.set_color('none' if loc in ['right', 'top'] else self.axesColour)

    # --- GC vs coverage for entire sequences ---
    seqGC = []
    seqCoverage = []
    seqLen = []
    for seqId, gcInfo in gcProfile.items():
        seqGC.append(gcInfo[0])
        seqCoverage.append(coverageProfile[seqId][0])
        seqLen.append(len(seqs[seqId]))

    # marker size grows with the log of the sequence length
    logLen = log(array(seqLen))
    scaledLen = (logLen - min(logLen)) / max(logLen)
    markerSize = scaledLen * 200 + 10

    seqAxes.scatter(seqGC,
                    seqCoverage,
                    c=abs(array(seqCoverage)),
                    s=markerSize,
                    lw=0.5,
                    cmap='gray_r')
    seqAxes.set_xlabel('GC (mean = %.1f%%)' % (mean(seqGC) * 100))
    seqAxes.set_ylabel('Coverage (mean = %.1f)' % mean(seqCoverage))
    seqAxes.set_title('GC vs. Coverage\nIndividual Sequences')

    # Prettify plot
    for axis in (seqAxes.yaxis, seqAxes.xaxis):
        for tick in axis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

    for axis in (seqAxes.yaxis, seqAxes.xaxis):
        for tickLine in axis.get_ticklines():
            tickLine.set_color(self.axesColour)

    for loc, spine in seqAxes.spines.items():
        spine.set_color('none' if loc in ['right', 'top'] else self.axesColour)