def nxPlot(self, options):
    """Create an Nx-plot image for every bin file.

    Bin files are collected from options.bin_dir using options.extension.
    For each bin one image named binId + '.nx_plot.' + options.image_type
    is saved into options.output_dir at options.dpi.
    """
    self.logger.info('[CheckM - nx_plot] Creating Nx-plots.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    plotter = NxPlot(options)
    for fileNum, binFile in enumerate(binFiles, start=1):
        binId = binIdFromFilename(binFile)
        self.logger.info('Plotting Nx-plot for %s (%d of %d)'
                         % (binId, fileNum, len(binFiles)))

        plotter.plot(binFile)

        plotPath = os.path.join(options.output_dir, binId) + '.nx_plot.' + options.image_type
        plotter.savePlot(plotPath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + plotPath)

    self.timeKeeper.printTimeStamp()
def distributionPlots(self, options):
    """Create GC, CD and TD reference distribution plots for every bin file.

    Tetranucleotide signatures are read once from options.tetra_profile and
    handed to the plotter for each bin. One image named
    binId + '.ref_dist_plots.' + options.image_type is saved into
    options.output_dir per bin.
    """
    self.logger.info('[CheckM - dist_plot] Creating GC, CD, and TD distribution plots.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    signatureReader = GenomicSignatures(K=4, threads=1)
    tetraSigs = signatureReader.read(options.tetra_profile)

    plotter = DistributionPlots(options)
    for fileNum, binFile in enumerate(binFiles, start=1):
        # progress is reported with the bin file entry itself, not the bin id
        self.logger.info('Plotting reference distribution plots for %s (%d of %d)'
                         % (binFile, fileNum, len(binFiles)))

        binId = binIdFromFilename(binFile)
        plotter.plot(binFile, tetraSigs, options.distributions)

        plotPath = os.path.join(options.output_dir, binId) + '.ref_dist_plots.' + options.image_type
        plotter.savePlot(plotPath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + plotPath)

    self.timeKeeper.printTimeStamp()
def modify(self, options):
    """Modify the sequences in a bin and write the result to a new file.

    Exactly one kind of modification must be requested: either adding and/or
    removing sequences (options.add / options.remove, taken from
    options.seq_file), or removing the sequences listed in
    options.outlier_file. The modified bin is written to options.output_file.

    Exits with status 1 when no modification is requested, or when
    'outlier_file' is combined with 'add' or 'remove'.
    """
    self.logger.info('[CheckM - modify] Modifying sequences in bin.')

    bAddOrRemove = bool(options.add or options.remove)

    # validate the request before touching the file system so an invalid
    # invocation does not leave an empty output directory behind
    if not (bAddOrRemove or options.outlier_file):
        self.logger.error('No modification to bin requested.\n')
        sys.exit(1)

    if bAddOrRemove and options.outlier_file:
        self.logger.error(
            "The 'outlier_file' option cannot be specified with 'add' or 'remove'.\n")
        sys.exit(1)

    # dirname is '' for a bare file name; only create a directory when the
    # output path actually names one (same guard taxonSet applies)
    outputDir = os.path.dirname(options.output_file)
    if outputDir:
        makeSurePathExists(outputDir)

    binTools = BinTools()
    if bAddOrRemove:
        binTools.modify(options.bin_file, options.seq_file,
                        options.add, options.remove, options.output_file)
    elif options.outlier_file:
        binTools.removeOutliers(options.bin_file, options.outlier_file,
                                options.output_file)

    self.logger.info('Modified bin written to: ' + options.output_file)

    self.timeKeeper.printTimeStamp()
def __createMSA(self, resultsParser, binIdToBinMarkerSets, hmmModelFile, outDir, alignOutputDir, queueIn, queueOut):
    """Create multiple sequence alignments for markers with multiple hits in a bin.

    Worker loop: bin ids are taken from queueIn until a None sentinel is
    received. For every marker with multiple hits in a bin, the marker's HMM
    is fetched from hmmModelFile into a temporary file, the hits are aligned
    into alignOutputDir/binId, and the temporary file is removed. Each
    processed bin id is put on queueOut.
    """
    HF = HMMERRunner(mode='fetch')

    while True:
        binId = queueIn.get(block=True, timeout=None)
        if binId is None:
            # sentinel: no more bins to process
            break

        markersWithMultipleHits = self.__extractMarkersWithMultipleHits(
            outDir, binId, resultsParser, binIdToBinMarkerSets[binId])

        if len(markersWithMultipleHits) != 0:
            # create multiple sequence alignments for markers with multiple hits
            binAlignOutputDir = os.path.join(alignOutputDir, binId)
            makeSurePathExists(binAlignOutputDir)

            for markerId in markersWithMultipleHits:
                tempModelFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
                try:
                    HF.fetch(hmmModelFile, markerId, tempModelFile)
                    self.__alignMarker(markerId, markersWithMultipleHits[markerId],
                                       None, False, binAlignOutputDir, tempModelFile,
                                       bKeepUnmaskedAlign=False)
                finally:
                    # always clean up, even if fetching or aligning failed;
                    # the file may not exist if the fetch itself failed
                    if os.path.exists(tempModelFile):
                        os.remove(tempModelFile)

        queueOut.put(binId)
def codingDensityPlot(self, options):
    """Create a coding density histogram and delta-CD plot for every bin file.

    Bin files are collected from options.bin_dir using options.extension.
    One image named binId + '.coding_density_plots.' + options.image_type
    is saved into options.output_dir per bin.
    """
    self.logger.info('[CheckM - coding_plot] Creating coding density histogram and delta-CD plot.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    plotter = CodingDensityPlots(options)
    for fileNum, binFile in enumerate(binFiles, start=1):
        # progress is reported with the bin file entry itself, not the bin id
        self.logger.info('Plotting coding density plots for %s (%d of %d)'
                         % (binFile, fileNum, len(binFiles)))

        plotter.plot(binFile, options.distributions)

        binId = binIdFromFilename(binFile)
        plotPath = os.path.join(options.output_dir, binId) + '.coding_density_plots.' + options.image_type
        plotter.savePlot(plotPath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + plotPath)

    self.timeKeeper.printTimeStamp()
def __createMSA(self, resultsParser, binIdToBinMarkerSets, hmmModelFile, outDir, alignOutputDir, queueIn, queueOut):
    """Create multiple sequence alignments for markers with multiple hits in a bin.

    Worker loop: bin ids are taken from queueIn until a None sentinel is
    received. For every marker with multiple hits in a bin, the marker's HMM
    is fetched from hmmModelFile into a temporary file, the hits are aligned
    into alignOutputDir/binId, and the temporary file is removed. Each
    processed bin id is put on queueOut.
    """
    HF = HMMERRunner(mode='fetch')

    while True:
        binId = queueIn.get(block=True, timeout=None)
        if binId is None:
            # sentinel: no more bins to process
            break

        markersWithMultipleHits = self.__extractMarkersWithMultipleHits(
            outDir, binId, resultsParser, binIdToBinMarkerSets[binId])

        if len(markersWithMultipleHits) != 0:
            # create multiple sequence alignments for markers with multiple hits
            binAlignOutputDir = os.path.join(alignOutputDir, binId)
            makeSurePathExists(binAlignOutputDir)

            for markerId in markersWithMultipleHits:
                tempModelFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
                try:
                    HF.fetch(hmmModelFile, markerId, tempModelFile)
                    self.__alignMarker(markerId, markersWithMultipleHits[markerId],
                                       None, False, binAlignOutputDir, tempModelFile,
                                       bKeepUnmaskedAlign=False)
                finally:
                    # always clean up, even if fetching or aligning failed;
                    # the file may not exist if the fetch itself failed
                    if os.path.exists(tempModelFile):
                        os.remove(tempModelFile)

        queueOut.put(binId)
def lengthHistogram(self, options):
    """Create a sequence length histogram for every bin file.

    Bin files are collected from options.bin_dir using options.extension.
    One image named binId + '.len_hist.' + options.image_type is saved
    into options.output_dir per bin.
    """
    self.logger.info('[CheckM - len_hist] Creating sequence length histogram.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    histogram = LengthHistogram(options)
    for fileNum, binFile in enumerate(binFiles, start=1):
        binId = binIdFromFilename(binFile)
        self.logger.info('Plotting sequence length histogram for %s (%d of %d)'
                         % (binId, fileNum, len(binFiles)))

        histogram.plot(binFile)

        plotPath = os.path.join(options.output_dir, binId) + '.len_hist.' + options.image_type
        histogram.savePlot(plotPath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + plotPath)

    self.timeKeeper.printTimeStamp()
def coveragePcaPlot(self, options):
    """Create a PCA plot of coverage profiles for every bin file.

    The coverage table in options.coverage_file is parsed into one coverage
    profile per sequence (one value per entry of that sequence's per-BAM
    dictionary). A single PCA is computed over all profiles and then plotted
    once per bin; each image is saved as
    binId + '.cov_pca_plots.' + options.image_type in options.output_dir.

    Exits with status 1 when the coverage file yields no profiles or when
    the profiles have fewer than two dimensions.
    """
    self.logger.info('[CheckM - cov_pca] Creating PCA plot of coverage profiles.')

    checkDirExists(options.bin_dir)
    checkFileExists(options.coverage_file)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    coverageParser = Coverage(threads=1)
    coverageStats = coverageParser.parseCoverage(options.coverage_file)

    # flatten the bin -> sequence -> BAM nesting into parallel lists
    seqIds = []
    coverageProfiles = []
    for seqDict in coverageStats.values():
        for seqId, bamDict in seqDict.items():
            seqIds.append(seqId)
            coverageProfiles.append(list(bamDict.values()))

    coverageProfiles = np.array(coverageProfiles)

    # an empty table produces a 1-D array, so test the size and rank before
    # indexing shape[1]
    if coverageProfiles.size == 0:
        self.logger.error('No coverage profiles found in coverage file.')
        sys.exit(1)

    if coverageProfiles.ndim < 2 or coverageProfiles.shape[1] < 2:
        self.logger.error(
            'Coverage profile is 1 dimensional. PCA requires at least 2 dimensions.')
        sys.exit(1)

    self.logger.info('Computing PCA of coverage profiles.\n')
    pca = PCA()
    pc, variance = pca.pcaMatrix(coverageProfiles, fraction=1.0,
                                 bCenter=True, bScale=False)

    plots = PcaPlot(options)
    for fileNum, binFile in enumerate(binFiles, start=1):
        self.logger.info('Plotting PCA of coverage profiles for %s (%d of %d)'
                         % (binFile, fileNum, len(binFiles)))

        plots.plot(binFile, seqIds, pc, variance)

        binId = binIdFromFilename(binFile)
        outputFile = os.path.join(options.output_dir, binId) + '.cov_pca_plots.' + options.image_type
        plots.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def tree(self, options):
    """Place bins in the reference genome tree.

    Steps, in order: validate the bin sequences, find phylogenetically
    informative marker genes, record the per-bin HMM model information,
    calculate bin statistics, align the identified marker genes, and run
    pplacer. Returns early (None) if the sequence check fails.
    """
    self.logger.info('[CheckM - tree] Placing bins in reference genome tree.')

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # called genes are checked as protein sequences, otherwise as nucleotide
    seqCheck = checkProteinSeqs if options.bCalledGenes else checkNuclotideSeqs
    if not seqCheck(binFiles):
        return

    # setup directory structure
    storageDir = os.path.join(options.output_dir, 'storage')
    makeSurePathExists(os.path.join(options.output_dir, 'bins'))
    makeSurePathExists(storageDir)

    # find phylogenetically informative genes in genome bins
    geneFinder = MarkerGeneFinder(options.threads)
    binIdToModels = geneFinder.find(
        binFiles,
        options.output_dir,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
        DefaultValues.HMMER_PHYLO_OUT,
        DefaultValues.PHYLO_HMM_MODELS,
        options.bKeepAlignment,
        options.bNucORFs,
        options.bCalledGenes)

    # write model information to file
    markerSetParser = MarkerSetParser(options.threads)
    modelInfoPath = os.path.join(options.output_dir, 'storage',
                                 DefaultValues.PHYLO_HMM_MODEL_INFO)
    markerSetParser.writeBinModels(binIdToModels, modelInfoPath)

    # calculate statistics for each genome bin
    statsCalculator = BinStatistics(options.threads)
    statsCalculator.calculate(binFiles, options.output_dir,
                              DefaultValues.BIN_STATS_PHYLO_OUT)

    # align identified marker genes
    aligner = HmmerAligner(options.threads)
    treeAlignDir = os.path.join(options.output_dir, 'storage', 'tree')
    resultsParser = aligner.makeAlignmentToPhyloMarkers(
        options.output_dir,
        DefaultValues.PHYLO_HMM_MODELS,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
        binIdToModels,
        False,
        DefaultValues.E_VAL,
        DefaultValues.LENGTH,
        False,
        treeAlignDir)

    # place bins into genome tree; pplacer gets its own thread count
    # (original note: kept low to keep memory requirements reasonable)
    placer = PplacerRunner(threads=options.pplacer_threads)
    placer.run(binFiles, resultsParser, options.output_dir, options.bReducedTree)

    self.timeKeeper.printTimeStamp()
def makeAlignmentsOfMultipleHits(self, outDir, markerFile, hmmTableFile, binIdToModels, binIdToBinMarkerSets, bIgnoreThresholds, evalueThreshold, lengthThreshold, alignOutputDir):
    """Align markers with multiple hits within a bin.

    HMM hits are parsed for every bin in binIdToModels, then the bins are
    distributed over self.totalThreads worker processes which write the
    alignments below alignOutputDir. A separate process reports progress.

    If anything goes wrong while the processes run (including an interrupt),
    all live child processes are terminated and the error is re-raised so
    the caller does not continue with incomplete alignments.
    """
    makeSurePathExists(alignOutputDir)

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds,
                               evalueThreshold, lengthThreshold)

    # align any markers with multiple hits in a bin
    self.logger.info(' Aligning marker genes with multiple hits in a single bin:')

    # process each bin in parallel
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for binId in binIdToModels:
        workerQueue.put(binId)

    # one None sentinel per worker so every worker loop terminates
    for _ in range(self.totalThreads):
        workerQueue.put(None)

    # build the process objects before entering the try block so the
    # clean-up code below can always refer to them
    calcProc = [mp.Process(target=self.__createMSA,
                           args=(resultsParser, binIdToBinMarkerSets, markerFile,
                                 outDir, alignOutputDir, workerQueue, writerQueue))
                for _ in range(self.totalThreads)]
    writeProc = mp.Process(target=self.__reportBinProgress,
                           args=(len(binIdToModels), writerQueue))

    try:
        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # tell the progress reporter that all workers are done
        writerQueue.put(None)
        writeProc.join()
    except BaseException:
        # make sure all processes are terminated; only processes that were
        # actually started and are still running can be terminated
        for p in calcProc + [writeProc]:
            if p.is_alive():
                p.terminate()
        raise
def merge(self, options):
    """Identify bins with complementary sets of marker genes.

    Returns early (None) if the bin sequences fail validation or if
    options.marker_file is a tree marker set. Otherwise marker genes are
    located in every bin, the per-bin marker sets are loaded, and Merger
    compares them; the path of the resulting report is logged.
    """
    self.logger.info('[CheckM - merge] Identifying bins with complementary sets of marker genes.')

    checkDirExists(options.bin_dir)
    binFiles = self.binFiles(options.bin_dir, options.extension)

    # called genes are checked as protein sequences, otherwise as nucleotide
    seqCheck = checkProteinSeqs if options.bCalledGenes else checkNuclotideSeqs
    if not seqCheck(binFiles):
        return

    markerSetParser = MarkerSetParser()
    markerFileType = markerSetParser.markerFileType(options.marker_file)
    if markerFileType == BinMarkerSets.TREE_MARKER_SET:
        self.logger.error('Merge command requires a taxonomic-specific marker set or a user-defined HMM file.\n')
        return

    # setup directory structure
    requiredDirs = (
        options.output_dir,
        os.path.join(options.output_dir, 'bins'),
        os.path.join(options.output_dir, 'storage'),
        os.path.join(options.output_dir, 'storage', 'hmms'),
    )
    for requiredDir in requiredDirs:
        makeSurePathExists(requiredDir)

    binIds = [binIdFromFilename(binFile) for binFile in binFiles]

    # find marker genes in genome bins
    hitTableName = "merger.table.txt"
    geneFinder = MarkerGeneFinder(options.threads)
    binIdToModels = geneFinder.find(binFiles, options.output_dir,
                                    hitTableName, "merger.hmmer3",
                                    options.marker_file, False, False,
                                    options.bCalledGenes)

    # get HMM file for each bin
    markerSetParser = MarkerSetParser()
    binIdToBinMarkerSets = markerSetParser.getMarkerSets(
        options.output_dir, binIds, options.marker_file)

    # compare markers found in each bin
    merger = Merger()
    reportPath = merger.run(binFiles, options.output_dir, hitTableName,
                            binIdToModels, binIdToBinMarkerSets,
                            options.delta_comp, options.delta_cont,
                            options.merged_comp, options.merged_cont)

    self.logger.info('Merger information written to: ' + reportPath)

    self.timeKeeper.printTimeStamp()
def binUnion(self, options):
    """Redundancy reduce multiple sets of bins into a single set.

    options.bin_or_checkm_qa_table is read as alternating entries: bin
    folder, QA table, bin folder, QA table, ... At least two bin folders
    are required and each needs a matching QA table; otherwise the process
    exits with status 1. The report is written to 'union.txt' and
    'contigConflicts.csv' inside options.output_dir.
    """
    self.logger.info('[CheckM - bin_union] Redundancy reduce multiple sets of bins into a single set.')

    output_dir = options.output_dir
    makeSurePathExists(output_dir)

    # even positions are bin folders, odd positions are QA tables
    bin_dirs = []
    checkmQaTsvs = []
    for position, entry in enumerate(options.bin_or_checkm_qa_table):
        if position % 2:
            checkFileExists(entry)
            checkmQaTsvs.append(entry)
        else:
            checkDirExists(entry)
            bin_dirs.append(entry)

    numBinDirs = len(bin_dirs)
    if numBinDirs < 2:
        self.logger.error("Need to specify at least two bin folders, found %i: " % numBinDirs)
        sys.exit(1)

    numQaTables = len(checkmQaTsvs)
    if numBinDirs != numQaTables:
        self.logger.error(
            "Need to specify the same number of bin folders as checkm_qa_tsv files, found %i and %i, respectively: "
            % (numBinDirs, numQaTables))
        sys.exit(1)

    binFileSets = []
    for bin_dir in bin_dirs:
        self.logger.info("Reading fasta files with extension %s from bin folder %s"
                         % (options.extension, bin_dir))
        binFileSets.append(self.binFiles(bin_dir, options.extension))

    binUnion = BinUnion()

    contigConflictsOutputFile = os.path.join(output_dir, 'contigConflicts.csv')
    unionBinOutputFile = os.path.join(output_dir, 'union.txt')

    binUnion.report(bin_dirs, binFileSets, checkmQaTsvs,
                    unionBinOutputFile, contigConflictsOutputFile,
                    options.min_completeness, options.max_contamination)
def tetraSignatures(self, options):
    """Calculate the tetranucleotide signature of the sequences in a file.

    Reads options.seq_file and writes the signatures to options.output_file
    using options.threads threads.
    """
    self.logger.info('[CheckM - tetra] Calculating tetranucleotide signature of sequences.')

    checkFileExists(options.seq_file)

    # dirname is '' for a bare file name; only create a directory when the
    # output path actually names one (same guard taxonSet applies)
    outputDir = os.path.dirname(options.output_file)
    if outputDir:
        makeSurePathExists(outputDir)

    tetraSig = GenomicSignatures(4, options.threads)
    tetraSig.calculate(options.seq_file, options.output_file)

    self.logger.info('Tetranucleotide signatures written to: ' + options.output_file)

    self.timeKeeper.printTimeStamp()
def ssuFinder(self, options):
    """Identify SSU (16S/18S) rRNAs in sequences.

    Runs SSU_Finder on options.seq_file together with the bin files found
    in options.bin_dir, writing results into options.output_dir.
    """
    self.logger.info('[CheckM - ssu_finder] Identifying SSU (16S/18S) rRNAs in sequences.')

    binFiles = self.binFiles(options.bin_dir, options.extension)

    checkFileExists(options.seq_file)
    makeSurePathExists(options.output_dir)

    finder = SSU_Finder(options.threads)
    finder.run(options.seq_file,
               binFiles,
               options.output_dir,
               options.evalue,
               options.concatenate)

    self.timeKeeper.printTimeStamp()
def __processBin(self, outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes, queueIn, queueOut):
    """Worker loop that searches each queued bin against its marker HMMs.

    Bin files are taken from queueIn until a None sentinel is received. For
    each bin: genes are called with Prodigal unless bCalledGenes is set (in
    which case the bin file itself is copied in as the amino acid gene
    file), an HMM model file is created for the bin from markerFile, and
    HMMER is run with results written below outDir/bins/binId. A
    (binId, hmmModelFile) tuple is put on queueOut for every bin.
    """
    markerSetParser = MarkerSetParser(self.threadsPerSearch)

    while True:
        binFile = queueIn.get(block=True, timeout=None)
        if binFile is None:
            # sentinel: no more bins to process
            break

        binId = binIdFromFilename(binFile)
        binDir = os.path.join(outDir, 'bins', binId)
        makeSurePathExists(binDir)

        # run Prodigal
        if not bCalledGenes:
            prodigal = ProdigalRunner(binDir)
            if not prodigal.areORFsCalled(bNucORFs):
                prodigal.run(binFile, bNucORFs)
            aaGeneFile = prodigal.aaGeneFile
        else:
            # genes are already called: use the bin file directly and place a
            # copy where the Prodigal output would otherwise be
            aaGeneFile = binFile
            shutil.copyfile(aaGeneFile, os.path.join(binDir, DefaultValues.PRODIGAL_AA))

        # extract HMMs into temporary file
        hmmModelFile = markerSetParser.createHmmModelFile(binId, markerFile)

        # run HMMER
        hmmer = HMMERRunner()
        tableOutPath = os.path.join(binDir, tableOut)
        hmmerOutPath = os.path.join(binDir, hmmerOut)

        keepAlignStr = '' if bKeepAlignment else '--noali'
        hmmer.search(hmmModelFile, aaGeneFile, tableOutPath, hmmerOutPath,
                     '--cpu ' + str(self.threadsPerSearch) + ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr,
                     bKeepAlignment)

        queueOut.put((binId, hmmModelFile))
def parallelCoordPlot(self, options):
    """Create a parallel coordinate plot of GC and coverage for every bin.

    Coverage statistics are parsed from options.coverage_file and sequence
    statistics are gathered for all bins up front; both are then passed to
    the plotter for each bin. Images are saved into options.output_dir as
    binId + '.paralel_coord_plot.' + options.image_type.
    """
    self.logger.info('[CheckM - par_plot] Creating parallel coordinate plot of GC and coverage.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)
    checkFileExists(options.coverage_file)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # read coverage stats file
    coverageParser = Coverage(threads=1)
    coverageStats = coverageParser.parseCoverage(options.coverage_file)

    # calculate sequence stats for all bins
    self.logger.info('Calculating sequence statistics for each bin.')
    statsReader = BinStatistics()
    seqStats = {
        binIdFromFilename(binFile): statsReader.sequenceStats(options.results_dir, binFile)
        for binFile in binFiles
    }

    # create plot for each bin
    plotter = ParallelCoordPlot(options)
    for fileNum, binFile in enumerate(binFiles, start=1):
        binId = binIdFromFilename(binFile)
        self.logger.info('Plotting parallel coordinates for %s (%d of %d)'
                         % (binId, fileNum, len(binFiles)))

        plotter.plot(binId, seqStats, coverageStats)

        plotPath = os.path.join(options.output_dir, binId) + '.paralel_coord_plot.' + options.image_type
        plotter.savePlot(plotPath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + plotPath)

    self.timeKeeper.printTimeStamp()
def taxonSet(self, options, db=None):
    """Generate a taxonomic-specific marker set.

    The marker set for options.rank / options.taxon is written to
    options.marker_file. The output location is only logged when
    TaxonParser reports a valid set. The db argument is not used here.
    """
    self.logger.info('[CheckM - taxon_set] Generate taxonomic-specific marker set.')

    # a bare file name has no directory component, so there is nothing to create
    markerDir = os.path.dirname(options.marker_file)
    if markerDir:
        makeSurePathExists(markerDir)

    parser = TaxonParser()
    if parser.markerSet(options.rank, options.taxon, options.marker_file):
        self.logger.info('Marker set written to: ' + options.marker_file)

    self.timeKeeper.printTimeStamp()
def makeAlignmentToPhyloMarkers(self, outDir, hmmModelFile, hmmTableFile, binIdToModels, bIgnoreThresholds, evalueThreshold, lengthThreshold, bReportHitStats, alignOutputDir, bKeepUnmaskedAlign=False):
    """Align hits to a set of common marker genes.

    HMM hits are parsed for every bin, the marker sequences to align are
    extracted, per-marker HMM files are generated, and the marker genes are
    aligned into alignOutputDir. The temporary per-marker HMM files are
    removed afterwards, also when alignment fails.

    Returns the ResultsParser holding the parsed hits.
    """
    self.logger.info("Extracting marker genes to align.")

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds,
                               evalueThreshold, lengthThreshold)

    # extract the ORFs to align
    markerSeqs, markerStats = self.__extractMarkerSeqsUnique(outDir, resultsParser)

    # generate individual HMMs required to create multiple sequence alignments;
    # the models of the first bin are used for all bins
    binId = list(binIdToModels.keys())[0]
    hmmModelFiles = {}
    try:
        self.__makeAlignmentModels(hmmModelFile, binIdToModels[binId], hmmModelFiles)

        # align each of the marker genes
        makeSurePathExists(alignOutputDir)
        self.__alignMarkerGenes(markerSeqs, markerStats, bReportHitStats,
                                hmmModelFiles, alignOutputDir, bKeepUnmaskedAlign)
    finally:
        # remove the temporary HMM files, including any created before a failure
        for fileName in hmmModelFiles:
            if os.path.exists(hmmModelFiles[fileName]):
                os.remove(hmmModelFiles[fileName])

    return resultsParser
def __processBin(self, outDir, queueIn, queueOut):
    """Worker loop that computes summary statistics for each queued bin.

    Bin files are taken from queueIn until a None sentinel is received. For
    each bin the scaffolds are read and GC, sequence length and coding
    density statistics are collected into a dictionary, which is put on
    queueOut as a (binId, binStats) tuple.
    """
    while True:
        binFile = queueIn.get(block=True, timeout=None)
        if binFile is None:
            # sentinel: no more bins to process
            break

        binStats = {}

        binId = binIdFromFilename(binFile)
        binDir = os.path.join(outDir, 'bins', binId)
        makeSurePathExists(binDir)

        # read scaffolds
        scaffolds = readFasta(binFile)

        # calculate GC statistics
        GC, stdGC = self.calculateGC(scaffolds)
        binStats['GC'] = GC
        binStats['GC std'] = stdGC

        # calculate statistics related to contigs and scaffolds
        (maxScaffoldLen, maxContigLen, genomeSize,
         scaffold_N50, contig_N50,
         scaffoldAvgLen, contigAvgLen,
         numContigs, numAmbiguousBases) = self.calculateSeqStats(scaffolds)

        binStats['Genome size'] = genomeSize
        binStats['# ambiguous bases'] = numAmbiguousBases
        binStats['# scaffolds'] = len(scaffolds)
        binStats['# contigs'] = numContigs
        binStats['Longest scaffold'] = maxScaffoldLen
        binStats['Longest contig'] = maxContigLen
        binStats['N50 (scaffolds)'] = scaffold_N50
        binStats['N50 (contigs)'] = contig_N50
        binStats['Mean scaffold length'] = scaffoldAvgLen
        binStats['Mean contig length'] = contigAvgLen

        # calculate coding density statistics
        codingDensity, translationTable, numORFs = self.calculateCodingDensity(
            binDir, scaffolds, genomeSize)
        binStats['Coding density'] = codingDensity
        binStats['Translation table'] = translationTable
        binStats['# predicted genes'] = numORFs

        queueOut.put((binId, binStats))
def __processBin(self, outDir, queueIn, queueOut):
    """Worker loop that computes bin and per-scaffold statistics for each queued bin.

    Bin files are taken from queueIn until a None sentinel is received. For
    each bin the scaffolds are read and GC, sequence length and coding
    density statistics are collected. A (binId, binStats, scaffoldStats)
    tuple is put on queueOut; scaffoldStats holds one dictionary per
    sequence id and is handed to each calculate* call (presumably so they
    can fill in per-scaffold values -- confirm against those methods).
    """
    while True:
        binFile = queueIn.get(block=True, timeout=None)
        if binFile is None:
            # sentinel: no more bins to process
            break

        binStats = {}
        scaffoldStats = {}

        binId = binIdFromFilename(binFile)
        binDir = os.path.join(outDir, 'bins', binId)
        makeSurePathExists(binDir)

        # read scaffolds
        scaffolds = readFasta(binFile)
        for seqId in scaffolds:
            scaffoldStats[seqId] = {}

        # calculate GC statistics
        GC, stdGC = self.calculateGC(scaffolds, scaffoldStats)
        binStats['GC'] = GC
        binStats['GC std'] = stdGC

        # calculate statistics related to scaffold lengths
        (maxScaffoldLen, maxContigLen, genomeSize,
         scaffold_N50, contig_N50,
         numContigs, numAmbiguousBases) = self.calculateSeqStats(scaffolds, scaffoldStats)

        binStats['Genome size'] = genomeSize
        binStats['# ambiguous bases'] = numAmbiguousBases
        binStats['# scaffolds'] = len(scaffolds)
        binStats['# contigs'] = numContigs
        binStats['Longest scaffold'] = maxScaffoldLen
        binStats['Longest contig'] = maxContigLen
        binStats['N50 (scaffolds)'] = scaffold_N50
        binStats['N50 (contigs)'] = contig_N50

        # calculate coding density statistics
        codingDensity, translationTable, numORFs = self.calculateCodingDensity(
            binDir, genomeSize, scaffoldStats)
        binStats['Coding density'] = codingDensity
        binStats['Translation table'] = translationTable
        binStats['# predicted genes'] = numORFs

        queueOut.put((binId, binStats, scaffoldStats))
def coverage(self, options):
    """Calculate the coverage of the sequences in every bin file.

    Runs Coverage over the bin files found in options.bin_dir and the BAM
    files in options.bam_files, writing the result to options.output_file.
    """
    self.logger.info('[CheckM - coverage] Calculating coverage of sequences.')

    checkDirExists(options.bin_dir)

    # dirname is '' for a bare file name; only create a directory when the
    # output path actually names one (same guard taxonSet applies)
    outputDir = os.path.dirname(options.output_file)
    if outputDir:
        makeSurePathExists(outputDir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    coverageCalc = Coverage(options.threads)
    coverageCalc.run(binFiles, options.bam_files, options.output_file,
                     options.all_reads, options.min_align,
                     options.max_edit_dist, options.min_qc)

    self.logger.info('Coverage information written to: ' + options.output_file)

    self.timeKeeper.printTimeStamp()
def outliers(self, options):
    """Identify outlier sequences in every bin file.

    Runs BinTools.identifyOutliers over the bin files found in
    options.bin_dir using the results in options.results_dir and the
    tetranucleotide profile in options.tetra_profile; the report is
    written to options.output_file.
    """
    self.logger.info('[CheckM - outlier] Identifying outliers in bins.')

    checkDirExists(options.bin_dir)
    checkFileExists(options.tetra_profile)

    # dirname is '' for a bare file name; only create a directory when the
    # output path actually names one (same guard taxonSet applies)
    outputDir = os.path.dirname(options.output_file)
    if outputDir:
        makeSurePathExists(outputDir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    binTools = BinTools()
    binTools.identifyOutliers(options.results_dir, binFiles,
                              options.tetra_profile, options.distributions,
                              options.report_type, options.output_file)

    self.logger.info('Outlier information written to: ' + options.output_file)

    self.timeKeeper.printTimeStamp()
def markerPlot(self, options):
    """Create a marker gene position plot for every bin file.

    Marker gene statistics and extended bin statistics are parsed from
    options.results_dir. Bins missing from either table are skipped. When
    the plotter reports that it drew something, the image is saved as
    binId + '.marker_pos_plot.' + options.image_type in options.output_dir.
    """
    self.logger.info('[CheckM - marker_plot] Creating marker gene position plot.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    # generate plot for each bin
    binFiles = self.binFiles(options.bin_dir, options.extension)

    resultsParser = ResultsParser(None)
    markerGeneStats = resultsParser.parseMarkerGeneStats(options.results_dir)
    binStats = resultsParser.parseBinStatsExt(options.results_dir)

    plotter = MarkerGenePosPlot(options)
    for fileNum, binFile in enumerate(binFiles, start=1):
        binId = binIdFromFilename(binFile)
        self.logger.info('Plotting marker gene position plot for %s (%d of %d)'
                         % (binId, fileNum, len(binFiles)))

        if not (binId in markerGeneStats and binId in binStats):
            continue  # bin has no marker genes

        if plotter.plot(binFile, markerGeneStats[binId], binStats[binId]):
            plotPath = os.path.join(options.output_dir, binId) + '.marker_pos_plot.' + options.image_type
            plotter.savePlot(plotPath, dpi=options.dpi)
            self.logger.info('Plot written to: ' + plotPath)
        else:
            self.logger.info('No marker genes found in bin.')

    self.timeKeeper.printTimeStamp()
def __processBin(self, outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes, queueIn, queueOut):
    """Worker loop that searches each queued bin against its marker HMMs.

    Bin files are taken from queueIn until a None sentinel is received. For
    each bin: genes are called with Prodigal unless bCalledGenes is set (in
    which case the bin file itself is copied in as the amino acid gene
    file), an HMM model file is created for the bin from markerFile, and
    HMMER is run with results written below outDir/bins/binId. A
    (binId, hmmModelFile) tuple is put on queueOut for every bin.
    """
    markerSetParser = MarkerSetParser(self.threadsPerSearch)

    while True:
        binFile = queueIn.get(block=True, timeout=None)
        if binFile is None:
            # sentinel: no more bins to process
            break

        binId = binIdFromFilename(binFile)
        binDir = os.path.join(outDir, 'bins', binId)
        makeSurePathExists(binDir)

        # run Prodigal
        if not bCalledGenes:
            prodigal = ProdigalRunner(binDir)
            if not prodigal.areORFsCalled(bNucORFs):
                prodigal.run(binFile, bNucORFs)
            aaGeneFile = prodigal.aaGeneFile
        else:
            # genes are already called: use the bin file directly and place a
            # copy where the Prodigal output would otherwise be
            aaGeneFile = binFile
            shutil.copyfile(aaGeneFile, os.path.join(binDir, DefaultValues.PRODIGAL_AA))

        # extract HMMs into temporary file
        hmmModelFile = markerSetParser.createHmmModelFile(binId, markerFile)

        # run HMMER
        hmmer = HMMERRunner()
        tableOutPath = os.path.join(binDir, tableOut)
        hmmerOutPath = os.path.join(binDir, hmmerOut)

        keepAlignStr = '' if bKeepAlignment else '--noali'
        hmmer.search(hmmModelFile, aaGeneFile, tableOutPath, hmmerOutPath,
                     '--cpu ' + str(self.threadsPerSearch) + ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr,
                     bKeepAlignment)

        queueOut.put((binId, hmmModelFile))
def makeAlignmentToPhyloMarkers(self, outDir, hmmModelFile, hmmTableFile, binIdToModels, bIgnoreThresholds, evalueThreshold, lengthThreshold, bReportHitStats, alignOutputDir, bKeepUnmaskedAlign=False):
    """Align hits to a set of common marker genes.

    HMM hits are parsed for every bin, the marker sequences to align are
    extracted, per-marker HMM files are generated, and the marker genes are
    aligned into alignOutputDir. The temporary per-marker HMM files are
    removed afterwards, also when alignment fails.

    Returns the ResultsParser holding the parsed hits.
    """
    self.logger.info(" Extracting marker genes to align.")

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds,
                               evalueThreshold, lengthThreshold)

    # extract the ORFs to align
    markerSeqs, markerStats = self.__extractMarkerSeqsUnique(outDir, resultsParser)

    # generate individual HMMs required to create multiple sequence alignments;
    # the models of the first bin are used for all bins. list() is needed
    # because dict views cannot be indexed on Python 3.
    binId = list(binIdToModels.keys())[0]
    hmmModelFiles = {}
    try:
        self.__makeAlignmentModels(hmmModelFile, binIdToModels[binId], hmmModelFiles)

        # align each of the marker genes
        makeSurePathExists(alignOutputDir)
        self.__alignMarkerGenes(markerSeqs, markerStats, bReportHitStats,
                                hmmModelFiles, alignOutputDir, bKeepUnmaskedAlign)
    finally:
        # remove the temporary HMM files, including any created before a failure
        for fileName in hmmModelFiles:
            if os.path.exists(hmmModelFiles[fileName]):
                os.remove(hmmModelFiles[fileName])

    return resultsParser
def binQAPlot(self, options):
    """Create a bar plot of bin quality covering all bin files.

    Extended bin statistics are parsed from options.results_dir. Unless
    options.bIgnoreHetero is set, amino acid identity is computed first and
    its aaiHetero result is passed to the plotter. When the plotter reports
    success, a single image 'bin_qa_plot.' + options.image_type is saved
    into options.output_dir.
    """
    self.logger.info('[CheckM - bin_qa_plot] Creating bar plot of bin quality.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # read sequence stats file
    resultsParser = ResultsParser(None)
    binStatsExt = resultsParser.parseBinStatsExt(options.results_dir)

    # create one plot covering all bins
    plotter = BinQAPlot(options)

    if options.bIgnoreHetero:
        bMakePlot = plotter.plot(binFiles, binStatsExt, options.bIgnoreHetero, None)
    else:
        aai = AminoAcidIdentity()
        aai.run(options.aai_strain, options.results_dir, None)
        bMakePlot = plotter.plot(binFiles, binStatsExt, options.bIgnoreHetero,
                                 aai.aaiHetero)

    if bMakePlot:
        plotPath = os.path.join(options.output_dir, 'bin_qa_plot.' + options.image_type)
        plotter.savePlot(plotPath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + plotPath)

    self.timeKeeper.printTimeStamp()
def tetraPcaPlot(self, options):
    """Create a PCA plot of tetranucleotide signatures for every bin file.

    A single PCA is computed from the signatures in options.tetra_profile
    and then plotted once per bin; each image is saved as
    binId + '.tetra_pca_plots.' + options.image_type in options.output_dir.
    """
    self.logger.info('[CheckM - tetra_pca] Creating PCA plot of tetranucleotide signatures.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    self.logger.info('Computing PCA of tetranucleotide signatures.\n')
    pca = PCA()
    seqIds, pc, variance = pca.pcaFile(options.tetra_profile, fraction=1.0,
                                       bCenter=True, bScale=False)

    plots = PcaPlot(options)
    for fileNum, binFile in enumerate(binFiles, start=1):
        self.logger.info('Plotting PCA of tetranucleotide signatures for %s (%d of %d)'
                         % (binFile, fileNum, len(binFiles)))

        plots.plot(binFile, seqIds, pc, variance)

        binId = binIdFromFilename(binFile)
        outputFile = os.path.join(options.output_dir, binId) + '.tetra_pca_plots.' + options.image_type
        plots.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def gcBiasPlot(self, options):
    """Plot bin coverage as a function of GC for every bin file.

    A coverage profile is computed once by CoverageWindows from the bin
    files and options.bam_file, then plotted per bin. Each image is saved
    as binId + '.gc_bias_plot.' + options.image_type in options.output_dir.
    """
    self.logger.info('[CheckM - gc_bias_plot] Plotting bin coverage as a function of GC.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    windowCalc = CoverageWindows(options.threads)
    coverageProfile = windowCalc.run(binFiles,
                                     options.bam_file,
                                     options.all_reads,
                                     options.min_align,
                                     options.max_edit_dist,
                                     options.window_size)

    plotter = GcBiasPlot(options)
    for fileNum, binFile in enumerate(binFiles, start=1):
        # progress is reported with the bin file entry itself, not the bin id
        self.logger.info('Plotting GC plots for %s (%d of %d)'
                         % (binFile, fileNum, len(binFiles)))

        plotter.plot(binFile, coverageProfile)

        binId = binIdFromFilename(binFile)
        plotPath = os.path.join(options.output_dir, binId) + '.gc_bias_plot.' + options.image_type
        plotter.savePlot(plotPath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + plotPath)

    self.timeKeeper.printTimeStamp()
def run(self, parser, outputDir):
    """Run standard E. coli genome to verify operation of CheckM.

    Executes the tree, tree_qa, lineage_set, analyze and qa commands of
    `parser` in sequence on the bundled test genome, verifying the output
    of each step. Results are written to outputDir/results, which is
    deleted first if it already exists.

    A single Options object is shared by all steps; each step only sets
    the attributes it adds or changes.
    """
    ecoliFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'test_data', '637000110.fna')
    checkFileExists(ecoliFile)

    options = Options()
    options.threads = 1
    options.extension = 'fna'
    options.bQuiet = True
    options.out_folder = os.path.join(outputDir, 'results')

    # start from a clean results folder
    if os.path.exists(options.out_folder):
        shutil.rmtree(options.out_folder)
    makeSurePathExists(options.out_folder)

    # single-argument print(...) behaves identically on Python 2 and 3
    print('[Step 1]: Verifying tree command.')
    options.bKeepAlignment = False
    options.bNucORFs = False
    options.bCalledGenes = False
    options.bReducedTree = True
    options.bin_folder = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'test_data')
    parser.tree(options)
    self.verifyTree(options.out_folder)
    print('\n [Passed]')

    print('\n')
    print('[Step 2]: Verifying tree_qa command.')
    options.tree_folder = options.out_folder
    options.out_format = 1
    options.file = os.path.join(options.out_folder, 'tree_qa_test.tsv')
    options.bTabTable = True
    parser.treeQA(options)
    self.verifyTreeQA(options.file)
    print('\n [Passed]')

    print('\n')
    print('[Step 3]: Verifying lineage_set command.')
    options.marker_file = os.path.join(options.out_folder, 'lineage_set_test.tsv')
    options.bForceDomain = False
    options.bootstrap = 0
    options.num_genomes_markers = 30
    options.num_genomes_refine = 5
    options.bNoLineageSpecificRefinement = False
    options.bRequireTaxonomy = False
    options.unique = 10
    options.multi = 10
    parser.lineageSet(options)
    self.verifyLineageSet(options.marker_file, options.bRequireTaxonomy)

    # repeat with taxonomy required
    options.bRequireTaxonomy = True
    parser.lineageSet(options)
    self.verifyLineageSet(options.marker_file, options.bRequireTaxonomy)
    print('\n [Passed]')

    print('\n')
    print('[Step 4]: Verifying analyze command.')
    options.bAlignTopHit = False
    parser.analyze(options)
    self.verifyAnalyze(options.out_folder)
    print('\n [Passed]')

    print('\n')
    print('[Step 5]: Verifying qa command.')
    options.alignment_file = None
    options.analyze_folder = options.out_folder
    options.out_format = 1
    options.exclude_markers = None
    options.bSkipPseudoGeneCorrection = False
    options.bSkipAdjCorrection = False
    options.file = os.path.join(options.out_folder, 'qa_test.tsv')
    options.bIndividualMarkers = False
    options.bIgnoreThresholds = False
    options.aai_strain = 0.9
    options.e_value = 1e-10
    options.length = 0.7
    options.coverage_file = None
    options.bTabTable = True
    parser.qa(options)
    self.verifyQA(options.file)
    print('\n [Passed]')
def analyze(self, options, db=None):
    """Analyze command

    Identifies marker genes in the bins found in options.bin_dir, aligns
    marker genes with multiple hits within a bin, and calculates statistics
    for each bin. Results are written beneath options.output_dir. When
    options.bAlignTopHit is set, the top hit to each marker is also aligned
    and 'alignment_info.tsv' is written to the 'storage/alignments' folder.

    options: command options; this method reads bin_dir, extension,
        bCalledGenes, output_dir, threads, marker_file, bKeepAlignment,
        bNucORFs and bAlignTopHit.
    db: not used by this method.

    Returns None. Returns early, before creating any output, when the
    sequence check of the bin files fails.
    """
    self.logger.info(
        '[CheckM - analyze] Identifying marker genes in bins.')

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # called genes are checked as protein sequences, otherwise as nucleotide
    if options.bCalledGenes:
        bValidSeqs = checkProteinSeqs(binFiles)
    else:
        bValidSeqs = checkNuclotideSeqs(binFiles)
    if not bValidSeqs:
        return

    # setup directory structure
    storageDir = os.path.join(options.output_dir, 'storage')
    aaiQaDir = os.path.join(storageDir, 'aai_qa')
    makeSurePathExists(options.output_dir)
    makeSurePathExists(os.path.join(options.output_dir, 'bins'))
    makeSurePathExists(storageDir)
    makeSurePathExists(aaiQaDir)

    # find marker genes in genome bins
    mgf = MarkerGeneFinder(options.threads)
    binIdToModels = mgf.find(binFiles,
                             options.output_dir,
                             DefaultValues.HMMER_TABLE_OUT,
                             DefaultValues.HMMER_OUT,
                             options.marker_file,
                             options.bKeepAlignment,
                             options.bNucORFs,
                             options.bCalledGenes)

    markerSetParser = MarkerSetParser(options.threads)
    binIdToBinMarkerSets = markerSetParser.getMarkerSets(
        options.output_dir,
        getBinIdsFromOutDir(options.output_dir),
        options.marker_file)

    hmmModelInfoFile = os.path.join(options.output_dir, 'storage',
                                    DefaultValues.CHECKM_HMM_MODEL_INFO)
    markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)

    self.timeKeeper.printTimeStamp()

    # HMM model file: the marker file itself when it is a set of HMM
    # models, otherwise the default models
    if (markerSetParser.markerFileType(options.marker_file)
            == BinMarkerSets.HMM_MODELS_SET):
        markerFile = options.marker_file
    else:
        markerFile = DefaultValues.HMM_MODELS

    # align marker genes with multiple hits within a bin
    HA = HmmerAligner(options.threads)
    HA.makeAlignmentsOfMultipleHits(options.output_dir,
                                    markerFile,
                                    DefaultValues.HMMER_TABLE_OUT,
                                    binIdToModels,
                                    binIdToBinMarkerSets,
                                    False,
                                    DefaultValues.E_VAL,
                                    DefaultValues.LENGTH,
                                    aaiQaDir)

    self.timeKeeper.printTimeStamp()

    # calculate statistics for each genome bin
    binStats = BinStatistics(options.threads)
    binStats.calculate(binFiles, options.output_dir,
                       DefaultValues.BIN_STATS_OUT)

    self.timeKeeper.printTimeStamp()

    # align top hit to each marker if requested
    if options.bAlignTopHit:
        alignmentOutputFolder = os.path.join(options.output_dir, 'storage',
                                             'alignments')
        makeSurePathExists(alignmentOutputFolder)

        HA = HmmerAligner(options.threads)
        resultsParser = HA.makeAlignmentTopHit(options.output_dir,
                                               options.marker_file,
                                               DefaultValues.HMMER_TABLE_OUT,
                                               binIdToModels,
                                               False,
                                               DefaultValues.E_VAL,
                                               DefaultValues.LENGTH,
                                               True,
                                               alignmentOutputFolder)

        # report marker gene data; the context manager closes the file even
        # if a lookup or write below raises
        alignmentInfoFile = os.path.join(alignmentOutputFolder,
                                         'alignment_info.tsv')
        with open(alignmentInfoFile, 'w') as fout:
            fout.write('Marker Id\tLength (bp)\n')

            # marker ids and lengths are reported from the models of the
            # first key in resultsParser.models; look them up once
            firstKey = list(resultsParser.models.keys())[0]
            firstModels = resultsParser.models[firstKey]
            for markerId in firstModels.keys():
                fout.write('%s\t%d\n' % (markerId,
                                         firstModels[markerId].leng))

        self.logger.info('Alignments to top hits stored in: '
                         + alignmentOutputFolder)

        self.timeKeeper.printTimeStamp()
def run(self, parser, outputDir):
    """Run standard E. coli genome to verify operation of CheckM.

    Calls the tree, tree_qa, lineage_set, analyze and qa commands of
    `parser` in turn, sharing one Options object between them, and checks
    the output of each step with the matching verify* method of this object.

    parser: object providing tree(), treeQA(), lineageSet(), analyze()
        and qa(); each is called with the shared Options object.
    outputDir: directory under which a 'results' folder is created. An
        existing 'results' folder is deleted first.
    """
    ecoliFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'test_data',
                             '637000110.fna')
    checkFileExists(ecoliFile)

    options = Options()
    options.threads = 1
    options.pplacer_threads = 1
    options.extension = 'fna'
    options.bQuiet = True

    options.out_folder = os.path.join(outputDir, 'results')
    # The analyze command defined in this file reads options.output_dir and
    # options.bin_dir, so the same locations are exposed under those names
    # too (presumably parser.analyze is that command -- verify).
    options.output_dir = options.out_folder

    # start from a clean results folder
    if os.path.exists(options.out_folder):
        shutil.rmtree(options.out_folder)
    makeSurePathExists(options.out_folder)

    # Status messages use single-argument print(...) so the output is the
    # same under Python 2's print statement and Python 3's print function.
    print('[Step 1]: Verifying tree command.')
    options.bKeepAlignment = False
    options.bNucORFs = False
    options.bCalledGenes = False
    options.bReducedTree = True
    options.bin_folder = os.path.join(DefaultValues.CHECKM_DATA_DIR,
                                      'test_data')
    options.bin_dir = options.bin_folder
    parser.tree(options)
    self.verifyTree(options.out_folder)
    print('\n [Passed]')
    print('\n')

    print('[Step 2]: Verifying tree_qa command.')
    options.tree_folder = options.out_folder
    options.out_format = 1
    options.file = os.path.join(options.out_folder, 'tree_qa_test.tsv')
    options.bTabTable = True
    parser.treeQA(options)
    self.verifyTreeQA(options.file)
    print('\n [Passed]')
    print('\n')

    print('[Step 3]: Verifying lineage_set command.')
    options.marker_file = os.path.join(options.out_folder,
                                       'lineage_set_test.tsv')
    options.bForceDomain = False
    options.bootstrap = 0
    options.num_genomes_markers = 30
    options.num_genomes_refine = 5
    options.bNoLineageSpecificRefinement = False
    options.unique = 10
    options.multi = 10
    # the command is verified first without, then with, required taxonomy
    for bRequireTaxonomy in (False, True):
        options.bRequireTaxonomy = bRequireTaxonomy
        parser.lineageSet(options)
        self.verifyLineageSet(options.marker_file,
                              options.bRequireTaxonomy)
    print('\n [Passed]')
    print('\n')

    print('[Step 4]: Verifying analyze command.')
    options.bAlignTopHit = False
    parser.analyze(options)
    self.verifyAnalyze(options.out_folder)
    print('\n [Passed]')
    print('\n')

    print('[Step 5]: Verifying qa command.')
    options.alignment_file = None
    options.analyze_folder = options.out_folder
    options.out_format = 1
    options.exclude_markers = None
    options.bSkipPseudoGeneCorrection = False
    options.bSkipAdjCorrection = False
    options.file = os.path.join(options.out_folder, 'qa_test.tsv')
    options.bIndividualMarkers = False
    options.bIgnoreThresholds = False
    options.aai_strain = 0.9
    options.e_value = 1e-10
    options.length = 0.7
    options.coverage_file = None
    options.bTabTable = True
    parser.qa(options)
    self.verifyQA(options.file)
    print('\n [Passed]')