def calculateCodingDensity(self, outDir, scaffolds, genomeSize):
    """Calculate coding density of putative genome bin.

    Reads the Prodigal gene feature (GFF) and amino acid FASTA files found
    in `outDir`.

    Parameters
    ----------
    outDir : str
        Directory expected to hold DefaultValues.PRODIGAL_GFF and
        DefaultValues.PRODIGAL_AA.
    scaffolds : mapping
        Object whose keys() are the scaffold ids to sum coding bases over.
    genomeSize : number
        Denominator used to turn the coding base total into a density.

    Returns
    -------
    tuple
        (coding density, translation table reported by the GFF parser,
        number of records read from the amino acid file), or (-1, -1, -1)
        when the gene feature file does not exist.
    """
    gffPath = os.path.join(outDir, DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffPath):
        # there is no gene feature file (perhaps the user specified
        # pre-calculated genes) so calculating the coding density
        # is not possible
        return -1, -1, -1

    geneFeatures = ProdigalGeneFeatureParser(gffPath)

    # use AA file as nucleotide file is optional
    proteins = readFasta(os.path.join(outDir, DefaultValues.PRODIGAL_AA))

    totalCodingBases = sum(geneFeatures.codingBases(scaffoldId)
                           for scaffoldId in scaffolds.keys())

    density = float(totalCodingBases) / genomeSize
    return density, geneFeatures.translationTable, len(proteins)
def calculateCodingDensity(self, outDir, scaffolds, genomeSize):
    """Calculate coding density of putative genome bin.

    The result is a 3-tuple: the fraction of `genomeSize` covered by coding
    bases (summed over every id in `scaffolds.keys()`), the translation
    table exposed by the gene feature parser, and the number of entries
    read from the amino acid file. When the gene feature file is absent
    from `outDir`, every element of the tuple is -1.
    """
    gff = os.path.join(outDir, DefaultValues.PRODIGAL_GFF)
    haveGeneFeatures = os.path.exists(gff)

    if haveGeneFeatures:
        parser = ProdigalGeneFeatureParser(gff)

        # the nucleotide file is optional, so gene records are taken
        # from the amino acid file instead
        aaPath = os.path.join(outDir, DefaultValues.PRODIGAL_AA)
        aaRecords = readFasta(aaPath)

        perScaffold = [parser.codingBases(sid) for sid in scaffolds.keys()]
        codingBp = sum(perScaffold)

        result = (float(codingBp) / genomeSize,
                  parser.translationTable,
                  len(aaRecords))
    else:
        # without a gene feature file (e.g. the user supplied
        # pre-calculated genes) the coding density cannot be determined
        result = (-1, -1, -1)

    return result
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
    """Draw the coding density (CD) histogram and delta-CD scatter plot of a bin.

    Parameters
    ----------
    fastaFile : str
        FASTA file of the bin; its bin id (via binIdFromFilename) locates
        the Prodigal gene feature file under `self.options.out_folder`.
    distributionsToPlot : iterable of numbers
        For each value v, the reference bounds closest to (100 - v) / 2 and
        (100 + v) / 2 are drawn on the delta-CD axes.
    axesHist : axes object
        Receives the histogram of per-window coding density.
    axesDeltaCD : axes object
        Receives the delta-CD versus sequence length scatter plot.

    Notes
    -----
    Terminates the process with exit status 1 when the gene feature file is
    missing. Raises ValueError when `self.options.cd_bin_width` is not
    positive. If no sequence yields a window, only an error label is set on
    `axesHist` and the method returns.
    """

    def prettifyAxes(axes):
        """Keep only left/bottom ticks and spines, drawn in self.axesColour."""
        for axis in (axes.yaxis, axes.xaxis):
            for tick in axis.majorTicks:
                tick.tick1On = True
                tick.tick2On = False
            for line in axis.get_ticklines():
                line.set_color(self.axesColour)

        for loc, spine in axes.spines.items():
            if loc in ('right', 'top'):
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

    # parse Prodigal output
    gffFile = os.path.join(self.options.out_folder, 'bins',
                           binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffFile):
        print('Missing gene feature file (%s). This plot is not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
        # non-zero status so that callers can detect the failure
        sys.exit(1)

    prodigalParser = ProdigalGeneFeatureParser(gffFile)

    # Read reference distributions from file
    dist = readDistribution('cd_dist')

    # get coding density for windows
    seqs = readFasta(fastaFile)
    cdWindowSize = self.options.cd_window_size

    data = []
    seqLens = []
    for seqId, seq in seqs.items():
        seqLen = len(seq)
        seqLens.append(seqLen)

        start = 0
        end = cdWindowSize
        # NOTE(review): a window ending exactly at the sequence end is not
        # evaluated (strict '<'), although the error label below says
        # 'seqs >= window size' - confirm which is intended.
        while end < seqLen:
            a, c, g, t = baseCount(seq[start:end])
            countedBases = a + c + g + t

            # a window without any A/C/G/T has no defined coding density;
            # skipping it avoids a division by zero
            if countedBases > 0:
                codingBases = prodigalParser.codingBases(seqId, start, end)
                data.append(float(codingBases) / countedBases)

            start = end
            end += cdWindowSize

    if not data:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % cdWindowSize)
        return

    # Histogram plot
    binWidth = self.options.cd_bin_width
    if binWidth <= 0:
        raise ValueError('cd_bin_width must be positive')

    # Build the edges by multiplication and close the range at exactly 1.0.
    # Repeated addition accumulates rounding error, so the final edge could
    # land just above 1.0 and be omitted, leaving windows with a coding
    # density close to 1.0 outside of every bin.
    numBins = int(np.ceil(1.0 / binWidth - 1e-9))
    bins = [i * binWidth for i in range(numBins)] + [1.0]

    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% coding density')
    axesHist.set_ylabel('% windows (' + str(cdWindowSize) + ' bp)')

    # Prettify plot
    prettifyAxes(axesHist)

    # get CD bin statistics
    binTools = BinTools()
    meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

    # Delta-CD vs sequence length plot
    axesDeltaCD.scatter(deltaCDs, seqLens, c=np.abs(deltaCDs), s=10, lw=0.5, cmap=pylab.cm.Greys)
    axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
    axesDeltaCD.set_ylabel('Sequence length (kbp)')

    # remember the limits chosen for the sequences before adding references
    _, yMaxSeqs = axesDeltaCD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        closestCD = findNearest(np.array(list(dist.keys())), meanCD)
        refDist = dist[closestCD]

        # find closest distribution values; any entry of the reference
        # distribution is used to look up the available keys
        sampleSeqLen = next(iter(refDist))
        boundKeys = list(refDist[sampleSeqLen].keys())
        cdLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
        cdUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for refWindowSize in refDist:
            xL.append(refDist[refWindowSize][cdLowerBoundKey])
            xU.append(refDist[refWindowSize][cdUpperBoundKey])
            y.append(refWindowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaCD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    kbpLabels = []
    for seqLen in axesDeltaCD.get_yticks():
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaCD.set_yticklabels(kbpLabels)

    # Prettify plot
    prettifyAxes(axesDeltaCD)
def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
    """Worker loop producing delta coding density (CD) samples per genome.

    Genome ids are taken from `queueIn` until a None sentinel is received.
    For each genome the sequences in <genomeDir>/<id>/<id>.fna are joined
    into a single scaffold (separated by runs of ten N's), prodigal is run
    on it through the shell, and the mean coding density is computed. For
    every window size, `numWindows` random windows free of N's are sampled
    and the difference between their coding density and the genome mean is
    written to ./deltaCD/<id>.tsv. The genome id is put on `queueOut` once
    its file is complete.

    Parameters
    ----------
    windowSizes : iterable of int
        Window lengths to sample. Sizes that do not fit the scaffold, or
        for which no N-free window exists, are skipped.
    numWindows : int
        Number of windows sampled per window size.
    genomeDir : str
        Directory containing one sub-directory per genome id.
    queueIn, queueOut : queue objects
        Source of genome ids and sink for processed ids.
    """
    while True:
        genomeId = queueIn.get(block=True, timeout=None)
        if genomeId is None:
            # sentinel: stop this worker
            break

        seqs = readGenomicSeqsFromFasta(os.path.join(genomeDir, genomeId, genomeId + '.fna'))

        # for simplicity, create a single scaffold from all sequences
        genomeFile = os.path.join('./deltaCD/genomes', genomeId + '.single_scaffold.fna')
        genomeScaffold = 'NNNNNNNNNN'.join(list(seqs.values())).upper()
        with open(genomeFile, 'w') as fout:
            fout.write('>' + genomeId + '\n')
            fout.write(genomeScaffold)

        # run prodigal on genome
        ntFile = os.path.join('./deltaCD/prodigal', genomeId + '.genes.fna')
        gffFile = os.path.join('./deltaCD/prodigal', genomeId + '.gff')
        cmd = ('prodigal -q -c -m -f gff -d %s -i %s > %s' % (ntFile, genomeFile, gffFile))
        os.system(cmd)

        # calculate mean coding density of genome
        numericScaffold = self.__createNumericScaffold(genomeScaffold)
        prodigalParser = ProdigalGeneFeatureParser(gffFile)
        codingBases = prodigalParser.codingBases(genomeId)
        counts = np.bincount(numericScaffold)
        totalBases = counts[0]  # positions encoded as 0 are the counted bases
        meanCD = float(codingBases) / totalBases

        # Running total of positions encoded as 0. It is used below to test
        # whether a given window size has at least one window made up
        # entirely of such positions.
        cumCounted = np.concatenate(([0], np.cumsum(np.asarray(numericScaffold) == 0)))

        with open('./deltaCD/' + genomeId + '.tsv', 'w') as fout:
            fout.write('# Mean CD = ' + str(meanCD) + '\n')

            # calculate coding density distribution for different window sizes
            for windowSize in windowSizes:
                endWindowPos = len(genomeScaffold) - windowSize
                if endWindowPos <= 0:
                    # This might occur for the largest window sizes and
                    # smallest genomes. Only this size is skipped so that
                    # smaller sizes later in windowSizes are still processed.
                    continue

                # Random sampling below only terminates if an N-free window
                # of this size exists, so verify that first.
                # NOTE(review): assumes randint can return endWindowPos
                # (inclusive upper bound) - confirm which randint is imported.
                countedPerWindow = cumCounted[windowSize:] - cumCounted[:-windowSize]
                if not np.any(countedPerWindow == windowSize):
                    continue

                deltaCDs = []
                # '<' rather than '!=' so a negative numWindows cannot loop forever
                while len(deltaCDs) < numWindows:
                    # pick random window
                    startWindow = randint(0, endWindowPos)
                    endWindow = startWindow + windowSize

                    counts = np.bincount(numericScaffold[startWindow:endWindow])
                    totalBases = counts[0]
                    if totalBases != windowSize:
                        # there are N's in the window so skip it
                        continue

                    # calculate coding density
                    codingBases = prodigalParser.codingBases(genomeId, startWindow, endWindow)
                    cdPer = float(codingBases) / totalBases
                    deltaCDs.append(cdPer - meanCD)

                fout.write('Windows Size = ' + str(windowSize) + '\n')
                fout.write(','.join(map(str, deltaCDs)) + '\n')

        queueOut.put(genomeId)
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
    """Draw the coding density (CD) histogram and delta-CD scatter plot of a bin.

    Parameters
    ----------
    fastaFile : str
        FASTA file of the bin; its bin id (via binIdFromFilename) locates
        the Prodigal gene feature file under `self.options.results_dir`.
    distributionsToPlot : iterable of numbers
        For each value v, the reference bounds closest to (100 - v) / 2 and
        (100 + v) / 2 are drawn on the delta-CD axes.
    axesHist : axes object
        Receives the histogram of per-window coding density.
    axesDeltaCD : axes object
        Receives the delta-CD versus sequence length scatter plot.

    Notes
    -----
    Logs an error and terminates the process with exit status 1 when the
    gene feature file is missing. Raises ValueError when
    `self.options.cd_bin_width` is not positive. If no sequence yields a
    window, only an error label is set on `axesHist` and the method returns.
    """

    def prettifyAxes(axes):
        """Keep only left/bottom ticks and spines, drawn in self.axesColour."""
        for axis in (axes.yaxis, axes.xaxis):
            for tick in axis.majorTicks:
                tick.tick1On = True
                tick.tick2On = False
            for line in axis.get_ticklines():
                line.set_color(self.axesColour)

        for loc, spine in axes.spines.items():
            if loc in ('right', 'top'):
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

    # parse Prodigal output
    gffFile = os.path.join(self.options.results_dir, 'bins',
                           binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffFile):
        self.logger.error('Missing gene feature file (%s). This plot is not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
        # non-zero status so that callers can detect the failure
        sys.exit(1)

    prodigalParser = ProdigalGeneFeatureParser(gffFile)

    # Read reference distributions from file
    dist = readDistribution('cd_dist')

    # get coding density for windows
    seqs = readFasta(fastaFile)
    cdWindowSize = self.options.cd_window_size

    data = []
    seqLens = []
    for seqId, seq in seqs.items():
        seqLen = len(seq)
        seqLens.append(seqLen)

        start = 0
        end = cdWindowSize
        # NOTE(review): a window ending exactly at the sequence end is not
        # evaluated (strict '<'), although the error label below says
        # 'seqs >= window size' - confirm which is intended.
        while end < seqLen:
            a, c, g, t = baseCount(seq[start:end])
            countedBases = a + c + g + t

            # a window without any A/C/G/T has no defined coding density;
            # skipping it avoids a division by zero
            if countedBases > 0:
                codingBases = prodigalParser.codingBases(seqId, start, end)
                data.append(float(codingBases) / countedBases)

            start = end
            end += cdWindowSize

    if not data:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % cdWindowSize)
        return

    # Histogram plot
    binWidth = self.options.cd_bin_width
    if binWidth <= 0:
        raise ValueError('cd_bin_width must be positive')

    # Build the edges by multiplication and close the range at exactly 1.0.
    # Repeated addition accumulates rounding error, so the final edge could
    # land just above 1.0 and be omitted, leaving windows with a coding
    # density close to 1.0 outside of every bin.
    numBins = int(np.ceil(1.0 / binWidth - 1e-9))
    bins = [i * binWidth for i in range(numBins)] + [1.0]

    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% coding density')
    axesHist.set_ylabel('% windows (' + str(cdWindowSize) + ' bp)')

    # Prettify plot
    prettifyAxes(axesHist)

    # get CD bin statistics
    binTools = BinTools()
    meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

    # Delta-CD vs sequence length plot
    axesDeltaCD.scatter(deltaCDs, seqLens, c=np.abs(deltaCDs), s=10, lw=0.5, cmap='gray_r')
    axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
    axesDeltaCD.set_ylabel('Sequence length (kbp)')

    # remember the limits chosen for the sequences before adding references
    _, yMaxSeqs = axesDeltaCD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

    # plot reference distributions
    for distToPlot in distributionsToPlot:
        closestCD = findNearest(np.array(list(dist.keys())), meanCD)
        refDist = dist[closestCD]

        # find closest distribution values; any entry of the reference
        # distribution is used to look up the available keys
        sampleSeqLen = next(iter(refDist))
        boundKeys = list(refDist[sampleSeqLen].keys())
        cdLowerBoundKey = findNearest(boundKeys, (100 - distToPlot) / 2.0)
        cdUpperBoundKey = findNearest(boundKeys, (100 + distToPlot) / 2.0)

        xL = []
        xU = []
        y = []
        for refWindowSize in refDist:
            xL.append(refDist[refWindowSize][cdLowerBoundKey])
            xU.append(refDist[refWindowSize][cdUpperBoundKey])
            y.append(refWindowSize)

        # sort by y-values
        sortIndexY = np.argsort(y)
        xL = np.array(xL)[sortIndexY]
        xU = np.array(xU)[sortIndexY]
        y = np.array(y)[sortIndexY]

        axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
        axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaCD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    kbpLabels = []
    for seqLen in axesDeltaCD.get_yticks():
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaCD.set_yticklabels(kbpLabels)

    # Prettify plot
    prettifyAxes(axesDeltaCD)
def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
    """Worker loop producing delta coding density (CD) samples per genome.

    Genome ids are taken from `queueIn` until a None sentinel is received.
    For each genome the sequences in <genomeDir>/<id>/<id>.fna are joined
    into a single scaffold (separated by runs of ten N's), prodigal is run
    on it through the shell, and the mean coding density is computed. For
    every window size, `numWindows` random windows free of N's are sampled
    and the difference between their coding density and the genome mean is
    written to ./deltaCD/<id>.tsv. The genome id is put on `queueOut` once
    its file is complete.

    Parameters
    ----------
    windowSizes : iterable of int
        Window lengths to sample. Sizes that do not fit the scaffold, or
        for which no N-free window exists, are skipped.
    numWindows : int
        Number of windows sampled per window size.
    genomeDir : str
        Directory containing one sub-directory per genome id.
    queueIn, queueOut : queue objects
        Source of genome ids and sink for processed ids.
    """
    while True:
        genomeId = queueIn.get(block=True, timeout=None)
        if genomeId is None:
            # sentinel: stop this worker
            break

        seqs = readGenomicSeqsFromFasta(os.path.join(genomeDir, genomeId, genomeId + '.fna'))

        # for simplicity, create a single scaffold from all sequences
        genomeFile = os.path.join('./deltaCD/genomes', genomeId + '.single_scaffold.fna')
        genomeScaffold = 'NNNNNNNNNN'.join(seqs.values()).upper()
        with open(genomeFile, 'w') as fout:
            fout.write('>' + genomeId + '\n')
            fout.write(genomeScaffold)

        # run prodigal on genome
        ntFile = os.path.join('./deltaCD/prodigal', genomeId + '.genes.fna')
        gffFile = os.path.join('./deltaCD/prodigal', genomeId + '.gff')
        cmd = ('prodigal -q -c -m -f gff -d %s -i %s > %s' % (ntFile, genomeFile, gffFile))
        os.system(cmd)

        # calculate mean coding density of genome
        numericScaffold = self.__createNumericScaffold(genomeScaffold)
        prodigalParser = ProdigalGeneFeatureParser(gffFile)
        codingBases = prodigalParser.codingBases(genomeId)
        counts = np.bincount(numericScaffold)
        totalBases = counts[0]  # positions encoded as 0 are the counted bases
        meanCD = float(codingBases) / totalBases

        # Running total of positions encoded as 0. It is used below to test
        # whether a given window size has at least one window made up
        # entirely of such positions.
        cumCounted = np.concatenate(([0], np.cumsum(np.asarray(numericScaffold) == 0)))

        with open('./deltaCD/' + genomeId + '.tsv', 'w') as fout:
            fout.write('# Mean CD = ' + str(meanCD) + '\n')

            # calculate coding density distribution for different window sizes
            for windowSize in windowSizes:
                endWindowPos = len(genomeScaffold) - windowSize
                if endWindowPos <= 0:
                    # This might occur for the largest window sizes and
                    # smallest genomes. Only this size is skipped so that
                    # smaller sizes later in windowSizes are still processed.
                    continue

                # Random sampling below only terminates if an N-free window
                # of this size exists, so verify that first.
                # NOTE(review): assumes randint can return endWindowPos
                # (inclusive upper bound) - confirm which randint is imported.
                countedPerWindow = cumCounted[windowSize:] - cumCounted[:-windowSize]
                if not np.any(countedPerWindow == windowSize):
                    continue

                deltaCDs = []
                # '<' rather than '!=' so a negative numWindows cannot loop forever
                while len(deltaCDs) < numWindows:
                    # pick random window
                    startWindow = randint(0, endWindowPos)
                    endWindow = startWindow + windowSize

                    counts = np.bincount(numericScaffold[startWindow:endWindow])
                    totalBases = counts[0]
                    if totalBases != windowSize:
                        # there are N's in the window so skip it
                        continue

                    # calculate coding density
                    codingBases = prodigalParser.codingBases(genomeId, startWindow, endWindow)
                    cdPer = float(codingBases) / totalBases
                    deltaCDs.append(cdPer - meanCD)

                fout.write('Windows Size = ' + str(windowSize) + '\n')
                fout.write(','.join(map(str, deltaCDs)) + '\n')

        queueOut.put(genomeId)