def distributionPlots(self, options):
    """Create one reference distribution plot image per bin file.

    Reads the tetranucleotide profile named by ``options.tetra_profile``
    once, then for every bin file found in ``options.bin_dir`` (filtered by
    ``options.extension``) draws the distributions selected by
    ``options.distributions`` and saves the figure as
    ``<output_dir>/<bin id>.ref_dist_plots.<image_type>`` at
    ``options.dpi``. A time stamp is printed when all bins are done.
    """
    self.logger.info(
        '[CheckM - dist_plot] Creating GC, CD, and TD distribution plots.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    genomeFiles = self.binFiles(options.bin_dir, options.extension)

    # the signature profile is shared by every bin, so load it up front
    signatures = GenomicSignatures(K=4, threads=1)
    tetraProfiles = signatures.read(options.tetra_profile)

    plotter = DistributionPlots(options)
    for fileNum, genomeFile in enumerate(genomeFiles, 1):
        progress = 'Plotting reference distribution plots for %s (%d of %d)' % (
            genomeFile, fileNum, len(genomeFiles))
        self.logger.info(progress)

        binId = binIdFromFilename(genomeFile)
        plotter.plot(genomeFile, tetraProfiles, options.distributions)

        # output name: <output_dir>/<binId>.ref_dist_plots.<image_type>
        plotStem = os.path.join(options.output_dir, binId)
        plotPath = plotStem + '.ref_dist_plots.' + options.image_type
        plotter.savePlot(plotPath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + plotPath)

    self.timeKeeper.printTimeStamp()
def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, reportType, outputFile):
    """Identify sequences that are outliers.

    For every bin, each sequence's GC, coding density (CD) and
    tetranucleotide distance (TD) statistic is compared against bounds
    looked up in the reference distributions ('gc_dist', 'cd_dist',
    'td_dist'); qualifying sequences are written to a tab-separated report.

    Args:
        outDir: directory under which each bin's gene feature file is
            expected at ``bins/<bin id>/<DefaultValues.PRODIGAL_GFF>``.
        binFiles: bin files, each read with ``readFasta``.
        tetraProfileFile: file handed to ``GenomicSignatures.read``.
        distribution: percentage selecting the bounds. GC uses the keys
            nearest to ``(100 - distribution) / 2`` and
            ``(100 + distribution) / 2``, CD the key nearest to
            ``(100 - distribution) / 2``, TD the key nearest to
            ``distribution``.
        reportType: ``'any'`` reports a sequence outlying in at least one
            distribution, ``'all'`` only those outlying in all three. Any
            other value yields a report containing just the header.
        outputFile: path of the report to (over)write.

    Raises:
        SystemExit: with status 1 when a bin's gene feature file is missing.
    """
    self.logger.info('Reading reference distributions.')
    gcBounds = readDistribution('gc_dist')
    cdBounds = readDistribution('cd_dist')
    tdBounds = readDistribution('td_dist')

    # sequence-length keys of the TD table do not depend on the bin
    tdSeqLens = list(tdBounds.keys())

    # Context manager: the report is flushed and closed even when a bin
    # aborts the run through sys.exit() or a helper raises.
    with open(outputFile, 'w') as fout:
        fout.write(
            'Bin Id\tSequence Id\tSequence length\tOutlying distributions')
        fout.write(
            '\tSequence GC\tMean bin GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)'
            % (distribution, distribution))
        fout.write('\tSequence CD\tMean bin CD\tLower CD bound (%s%%)'
                   % distribution)
        fout.write('\tSequence TD\tMean bin TD\tUpper TD bound (%s%%)\n'
                   % distribution)

        processedBins = 0
        for binFile in binFiles:
            binId = binIdFromFilename(binFile)

            processedBins += 1
            self.logger.info('Finding outliers in %s (%d of %d).'
                             % (binId, processedBins, len(binFiles)))

            seqs = readFasta(binFile)

            meanGC, deltaGCs, seqGC = self.gcDist(seqs)

            # NOTE(review): the profile is re-read for every bin; it looks
            # loop-invariant, but confirm the helpers below do not mutate
            # tetraSigs before hoisting it out of the loop.
            genomicSig = GenomicSignatures(K=4, threads=1)
            tetraSigs = genomicSig.read(tetraProfileFile)
            binSig = self.binTetraSig(seqs, tetraSigs)
            meanTD, deltaTDs = self.tetraDiffDist(
                seqs, genomicSig, tetraSigs, binSig)

            gffFile = os.path.join(
                outDir, 'bins', binId, DefaultValues.PRODIGAL_GFF)
            if not os.path.exists(gffFile):
                self.logger.error(
                    'Missing gene feature file (%s). This plot if not compatible with the --genes option.\n'
                    % DefaultValues.PRODIGAL_GFF)
                sys.exit(1)

            prodigalParser = ProdigalGeneFeatureParser(gffFile)
            meanCD, deltaCDs, CDs = self.codingDensityDist(
                seqs, prodigalParser)

            # find keys into GC and CD distributions
            closestGC = findNearest(np.array(list(gcBounds.keys())), meanGC)
            gcSeqLens = list(gcBounds[closestGC].keys())
            # any sequence-length entry will do for reading percentile keys
            d = gcBounds[closestGC][gcSeqLens[0]]
            gcLowerBoundKey = findNearest(
                list(d.keys()), (100 - distribution) / 2.0)
            gcUpperBoundKey = findNearest(
                list(d.keys()), (100 + distribution) / 2.0)

            closestCD = findNearest(np.array(list(cdBounds.keys())), meanCD)
            cdSeqLens = list(cdBounds[closestCD].keys())
            d = cdBounds[closestCD][cdSeqLens[0]]
            cdLowerBoundKey = findNearest(
                list(d.keys()), (100 - distribution) / 2.0)

            tdBoundKey = findNearest(
                list(tdBounds[tdSeqLens[0]].keys()), distribution)

            # assumes deltaGCs/deltaCDs/deltaTDs/seqGC/CDs are ordered like
            # seqs.items() -- TODO confirm against the *Dist helpers
            for index, (seqId, seq) in enumerate(seqs.items()):
                seqLen = len(seq)

                # find GC, CD, and TD bounds for the nearest tabulated length
                closestSeqLen = findNearest(gcSeqLens, seqLen)
                gcLowerBound = gcBounds[closestGC][closestSeqLen][gcLowerBoundKey]
                gcUpperBound = gcBounds[closestGC][closestSeqLen][gcUpperBoundKey]

                closestSeqLen = findNearest(cdSeqLens, seqLen)
                cdLowerBound = cdBounds[closestCD][closestSeqLen][cdLowerBoundKey]

                closestSeqLen = findNearest(tdSeqLens, seqLen)
                tdBound = tdBounds[closestSeqLen][tdBoundKey]

                outlyingDists = []
                if deltaGCs[index] < gcLowerBound or deltaGCs[index] > gcUpperBound:
                    outlyingDists.append('GC')

                if deltaCDs[index] < cdLowerBound:
                    outlyingDists.append('CD')

                if deltaTDs[index] > tdBound:
                    outlyingDists.append('TD')

                if (reportType == 'any' and len(outlyingDists) >= 1) or (
                        reportType == 'all' and len(outlyingDists) == 3):
                    fout.write(binId + '\t' + seqId + '\t%d' % seqLen
                               + '\t' + ','.join(outlyingDists))
                    # GC/CD bounds are offsets from the bin mean, so the mean
                    # is added back before reporting them as percentages
                    fout.write('\t%.1f\t%.1f\t%.1f\t%.1f'
                               % (seqGC[index] * 100, meanGC * 100,
                                  (meanGC + gcLowerBound) * 100,
                                  (meanGC + gcUpperBound) * 100))
                    fout.write('\t%.1f\t%.1f\t%.1f'
                               % (CDs[index] * 100, meanCD * 100,
                                  (meanCD + cdLowerBound) * 100))
                    fout.write('\t%.3f\t%.3f\t%.3f'
                               % (deltaTDs[index], meanTD, tdBound) + '\n')
def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, reportType, outputFile):
    """Identify sequences that are outliers.

    For every bin, each sequence's GC, coding density (CD) and
    tetranucleotide distance (TD) statistic is compared against bounds
    looked up in the reference distributions ('gc_dist', 'cd_dist',
    'td_dist'); qualifying sequences are written to a tab-separated report.

    Dictionary views are materialised with ``list()`` and iteration uses
    ``items()`` so the method runs on both Python 2 and Python 3.

    Args:
        outDir: directory under which each bin's gene feature file is
            expected at ``bins/<bin id>/<DefaultValues.PRODIGAL_GFF>``.
        binFiles: bin files, each read with ``readFasta``.
        tetraProfileFile: file handed to ``GenomicSignatures.read``.
        distribution: percentage selecting the bounds. GC uses the keys
            nearest to ``(100 - distribution) / 2`` and
            ``(100 + distribution) / 2``, CD the key nearest to
            ``(100 - distribution) / 2``, TD the key nearest to
            ``distribution``.
        reportType: ``'any'`` reports a sequence outlying in at least one
            distribution, ``'all'`` only those outlying in all three. Any
            other value yields a report containing just the header.
        outputFile: path of the report to (over)write.

    Raises:
        SystemExit: with status 1 when a bin's gene feature file is missing.
    """
    # NOTE(review): another definition named identifyOutliers appears
    # earlier in this source; if both live in the same scope, this later
    # one is the definition that takes effect -- confirm which is intended.
    self.logger.info(' Reading reference distributions.')
    gcBounds = readDistribution('gc_dist')
    cdBounds = readDistribution('cd_dist')
    tdBounds = readDistribution('td_dist')

    # sequence-length keys of the TD table do not depend on the bin
    tdSeqLens = list(tdBounds.keys())

    # Context manager: the report is flushed and closed even when a bin
    # aborts the run through sys.exit() or a helper raises.
    with open(outputFile, 'w') as fout:
        fout.write('Bin Id\tSequence Id\tSequence length\tOutlying distributions')
        fout.write('\tSequence GC\tMean bin GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)'
                   % (distribution, distribution))
        fout.write('\tSequence CD\tMean bin CD\tLower CD bound (%s%%)'
                   % distribution)
        fout.write('\tSequence TD\tMean bin TD\tUpper TD bound (%s%%)\n'
                   % distribution)

        self.logger.info('')
        processedBins = 0
        for binFile in binFiles:
            binId = binIdFromFilename(binFile)

            processedBins += 1
            self.logger.info(' Finding outliers in %s (%d of %d).'
                             % (binId, processedBins, len(binFiles)))

            seqs = readFasta(binFile)

            meanGC, deltaGCs, seqGC = self.gcDist(seqs)

            # NOTE(review): the profile is re-read for every bin; it looks
            # loop-invariant, but confirm the helpers below do not mutate
            # tetraSigs before hoisting it out of the loop.
            genomicSig = GenomicSignatures(K=4, threads=1)
            tetraSigs = genomicSig.read(tetraProfileFile)
            binSig = self.binTetraSig(seqs, tetraSigs)
            meanTD, deltaTDs = self.tetraDiffDist(
                seqs, genomicSig, tetraSigs, binSig)

            gffFile = os.path.join(
                outDir, 'bins', binId, DefaultValues.PRODIGAL_GFF)
            if not os.path.exists(gffFile):
                self.logger.error(
                    ' [Error] Missing gene feature file (%s). This plot if not compatible with the --genes option.\n'
                    % DefaultValues.PRODIGAL_GFF)
                sys.exit(1)

            prodigalParser = ProdigalGeneFeatureParser(gffFile)
            meanCD, deltaCDs, CDs = self.codingDensityDist(
                seqs, prodigalParser)

            # find keys into GC and CD distributions
            # (list() is required: on Python 3 a dict view is neither
            # subscriptable nor convertible to a proper numpy array)
            closestGC = findNearest(np.array(list(gcBounds.keys())), meanGC)
            gcSeqLens = list(gcBounds[closestGC].keys())
            # any sequence-length entry will do for reading percentile keys
            d = gcBounds[closestGC][gcSeqLens[0]]
            gcLowerBoundKey = findNearest(
                list(d.keys()), (100 - distribution) / 2.0)
            gcUpperBoundKey = findNearest(
                list(d.keys()), (100 + distribution) / 2.0)

            closestCD = findNearest(np.array(list(cdBounds.keys())), meanCD)
            cdSeqLens = list(cdBounds[closestCD].keys())
            d = cdBounds[closestCD][cdSeqLens[0]]
            cdLowerBoundKey = findNearest(
                list(d.keys()), (100 - distribution) / 2.0)

            tdBoundKey = findNearest(
                list(tdBounds[tdSeqLens[0]].keys()), distribution)

            # assumes deltaGCs/deltaCDs/deltaTDs/seqGC/CDs are ordered like
            # seqs.items() -- TODO confirm against the *Dist helpers
            for index, (seqId, seq) in enumerate(seqs.items()):
                seqLen = len(seq)

                # find GC, CD, and TD bounds for the nearest tabulated length
                closestSeqLen = findNearest(gcSeqLens, seqLen)
                gcLowerBound = gcBounds[closestGC][closestSeqLen][gcLowerBoundKey]
                gcUpperBound = gcBounds[closestGC][closestSeqLen][gcUpperBoundKey]

                closestSeqLen = findNearest(cdSeqLens, seqLen)
                cdLowerBound = cdBounds[closestCD][closestSeqLen][cdLowerBoundKey]

                closestSeqLen = findNearest(tdSeqLens, seqLen)
                tdBound = tdBounds[closestSeqLen][tdBoundKey]

                outlyingDists = []
                if deltaGCs[index] < gcLowerBound or deltaGCs[index] > gcUpperBound:
                    outlyingDists.append('GC')

                if deltaCDs[index] < cdLowerBound:
                    outlyingDists.append('CD')

                if deltaTDs[index] > tdBound:
                    outlyingDists.append('TD')

                if (reportType == 'any' and len(outlyingDists) >= 1) or (
                        reportType == 'all' and len(outlyingDists) == 3):
                    fout.write(binId + '\t' + seqId + '\t%d' % seqLen
                               + '\t' + ','.join(outlyingDists))
                    # GC/CD bounds are offsets from the bin mean, so the mean
                    # is added back before reporting them as percentages
                    fout.write('\t%.1f\t%.1f\t%.1f\t%.1f'
                               % (seqGC[index] * 100, meanGC * 100,
                                  (meanGC + gcLowerBound) * 100,
                                  (meanGC + gcUpperBound) * 100))
                    fout.write('\t%.1f\t%.1f\t%.1f'
                               % (CDs[index] * 100, meanCD * 100,
                                  (meanCD + cdLowerBound) * 100))
                    fout.write('\t%.3f\t%.3f\t%.3f'
                               % (deltaTDs[index], meanTD, tdBound) + '\n')