def merge(self, options):
    """Merge command.

    Compares the marker genes found in each bin and reports, through
    ``Merger.run``, on bins with complementary marker sets.

    Attributes read from ``options``: ``bin_dir``, ``extension``,
    ``bCalledGenes``, ``marker_file``, ``output_dir``, ``threads``,
    ``delta_comp``, ``delta_cont``, ``merged_comp`` and ``merged_cont``.

    Returns ``None`` in every case. The command stops before any output
    directory is set up when the sequence check on the bin files returns a
    falsy value, or when ``options.marker_file`` is a tree marker set (an
    error is logged in that case).
    """
    self.logger.info(
        '[CheckM - merge] Identifying bins with complementary sets of marker genes.'
    )

    checkDirExists(options.bin_dir)
    binFiles = self.binFiles(options.bin_dir, options.extension)

    # use the check matching the input: checkProteinSeqs when genes are
    # already called, checkNuclotideSeqs otherwise; a falsy result aborts
    checkSeqs = checkProteinSeqs if options.bCalledGenes else checkNuclotideSeqs
    if not checkSeqs(binFiles):
        return

    # tree marker sets are rejected by this command
    typeParser = MarkerSetParser()
    if typeParser.markerFileType(
            options.marker_file) == BinMarkerSets.TREE_MARKER_SET:
        self.logger.error(
            'Merge command requires a taxonomic-specific marker set or a user-defined HMM file.\n'
        )
        return

    # setup directory structure
    outDir = options.output_dir
    requiredDirs = (
        outDir,
        os.path.join(outDir, 'bins'),
        os.path.join(outDir, 'storage'),
        os.path.join(outDir, 'storage', 'hmms'),
    )
    for requiredDir in requiredDirs:
        makeSurePathExists(requiredDir)

    binIds = [binIdFromFilename(path) for path in binFiles]

    # the same table name is handed to both the gene finder and the merger
    tableFile = "merger.table.txt"

    # find marker genes in genome bins
    geneFinder = MarkerGeneFinder(options.threads)
    modelsByBin = geneFinder.find(
        binFiles,
        outDir,
        tableFile,
        "merger.hmmer3",
        options.marker_file,
        False,
        False,
        options.bCalledGenes,
    )

    # get HMM file for each bin
    setParser = MarkerSetParser()
    markerSetsByBin = setParser.getMarkerSets(outDir, binIds,
                                              options.marker_file)

    # compare markers found in each bin
    binMerger = Merger()
    reportFile = binMerger.run(
        binFiles,
        outDir,
        tableFile,
        modelsByBin,
        markerSetsByBin,
        options.delta_comp,
        options.delta_cont,
        options.merged_comp,
        options.merged_cont,
    )

    self.logger.info('Merger information written to: ' + reportFile)

    self.timeKeeper.printTimeStamp()
def tree(self, options):
    """Tree command.

    Runs, in order: marker gene search against
    ``DefaultValues.PHYLO_HMM_MODELS``, writing of the per-bin model
    information, bin statistics, alignment of the identified marker genes,
    and placement of the bins with ``PplacerRunner``.

    Attributes read from ``options``: ``bin_dir``, ``extension``,
    ``bCalledGenes``, ``output_dir``, ``threads``, ``bKeepAlignment``,
    ``bNucORFs``, ``pplacer_threads`` and ``bReducedTree``.

    Returns ``None`` in every case. The command stops before any output
    directory is set up when the sequence check on the bin files returns a
    falsy value.
    """
    self.logger.info(
        '[CheckM - tree] Placing bins in reference genome tree.')

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # use the check matching the input: checkProteinSeqs when genes are
    # already called, checkNuclotideSeqs otherwise; a falsy result aborts
    checkSeqs = checkProteinSeqs if options.bCalledGenes else checkNuclotideSeqs
    if not checkSeqs(binFiles):
        return

    # setup directory structure
    outDir = options.output_dir
    for subDir in ('bins', 'storage'):
        makeSurePathExists(os.path.join(outDir, subDir))

    # find phylogenetically informative genes in genome bins
    geneFinder = MarkerGeneFinder(options.threads)
    modelsByBin = geneFinder.find(
        binFiles,
        outDir,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
        DefaultValues.HMMER_PHYLO_OUT,
        DefaultValues.PHYLO_HMM_MODELS,
        options.bKeepAlignment,
        options.bNucORFs,
        options.bCalledGenes,
    )

    # write model information to file
    setParser = MarkerSetParser(options.threads)
    modelInfoPath = os.path.join(outDir, 'storage',
                                 DefaultValues.PHYLO_HMM_MODEL_INFO)
    setParser.writeBinModels(modelsByBin, modelInfoPath)

    # calculate statistics for each genome bin
    statsCalculator = BinStatistics(options.threads)
    statsCalculator.calculate(binFiles, outDir,
                              DefaultValues.BIN_STATS_PHYLO_OUT)

    # align identified marker genes
    aligner = HmmerAligner(options.threads)
    treeAlignDir = os.path.join(outDir, 'storage', 'tree')
    resultsParser = aligner.makeAlignmentToPhyloMarkers(
        outDir,
        DefaultValues.PHYLO_HMM_MODELS,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
        modelsByBin,
        False,
        DefaultValues.E_VAL,
        DefaultValues.LENGTH,
        False,
        treeAlignDir,
    )

    # place bins into genome tree; pplacer gets its own thread count
    # (options.pplacer_threads) rather than options.threads.
    # NOTE(review): the previous comment said this was fixed at one thread
    # to keep memory requirements reasonable - confirm the intended default.
    placer = PplacerRunner(threads=options.pplacer_threads)
    placer.run(binFiles, resultsParser, outDir, options.bReducedTree)

    self.timeKeeper.printTimeStamp()
def analyze(self, options, db=None):
    """Analyze command.

    Runs, in order: marker gene search with ``options.marker_file``,
    retrieval of the per-bin marker sets, writing of the per-bin model
    information, alignment of marker genes with multiple hits within a bin,
    bin statistics and, when ``options.bAlignTopHit`` is set, alignment of
    the top hit to each marker plus an ``alignment_info.tsv`` report.

    Attributes read from ``options``: ``bin_dir``, ``extension``,
    ``bCalledGenes``, ``output_dir``, ``threads``, ``marker_file``,
    ``bKeepAlignment``, ``bNucORFs`` and ``bAlignTopHit``.

    ``db`` is not used by this method; it is kept so existing callers that
    pass it keep working.

    Returns ``None`` in every case. The command stops before any output
    directory is set up when the sequence check on the bin files returns a
    falsy value.
    """
    self.logger.info(
        '[CheckM - analyze] Identifying marker genes in bins.')

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # checkProteinSeqs is used when genes are already called,
    # checkNuclotideSeqs otherwise; a falsy result aborts the command
    if not options.bCalledGenes:
        if not checkNuclotideSeqs(binFiles):
            return
    else:
        if not checkProteinSeqs(binFiles):
            return

    # setup directory structure
    makeSurePathExists(options.output_dir)
    makeSurePathExists(os.path.join(options.output_dir, 'bins'))
    makeSurePathExists(os.path.join(options.output_dir, 'storage'))
    makeSurePathExists(
        os.path.join(options.output_dir, 'storage', 'aai_qa'))

    # find marker genes in genome bins
    mgf = MarkerGeneFinder(options.threads)
    binIdToModels = mgf.find(binFiles, options.output_dir,
                             DefaultValues.HMMER_TABLE_OUT,
                             DefaultValues.HMMER_OUT,
                             options.marker_file,
                             options.bKeepAlignment,
                             options.bNucORFs,
                             options.bCalledGenes)

    markerSetParser = MarkerSetParser(options.threads)
    binIdToBinMarkerSets = markerSetParser.getMarkerSets(
        options.output_dir,
        getBinIdsFromOutDir(options.output_dir),
        options.marker_file)

    hmmModelInfoFile = os.path.join(options.output_dir, 'storage',
                                    DefaultValues.CHECKM_HMM_MODEL_INFO)
    markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)

    self.timeKeeper.printTimeStamp()

    # HMM model file: a user-supplied HMM model set is used as-is, any
    # other marker file type falls back to DefaultValues.HMM_MODELS
    if markerSetParser.markerFileType(
            options.marker_file) == BinMarkerSets.HMM_MODELS_SET:
        markerFile = options.marker_file
    else:
        markerFile = DefaultValues.HMM_MODELS

    # align marker genes with multiple hits within a bin
    HA = HmmerAligner(options.threads)
    HA.makeAlignmentsOfMultipleHits(
        options.output_dir, markerFile, DefaultValues.HMMER_TABLE_OUT,
        binIdToModels, binIdToBinMarkerSets, False, DefaultValues.E_VAL,
        DefaultValues.LENGTH,
        os.path.join(options.output_dir, 'storage', 'aai_qa'))

    self.timeKeeper.printTimeStamp()

    # calculate statistics for each genome bin
    binStats = BinStatistics(options.threads)
    binStats.calculate(binFiles, options.output_dir,
                       DefaultValues.BIN_STATS_OUT)

    self.timeKeeper.printTimeStamp()

    # align top hit to each marker if requested
    if options.bAlignTopHit:
        alignmentOutputFolder = os.path.join(options.output_dir, 'storage',
                                             'alignments')
        makeSurePathExists(alignmentOutputFolder)

        HA = HmmerAligner(options.threads)
        resultsParser = HA.makeAlignmentTopHit(
            options.output_dir, options.marker_file,
            DefaultValues.HMMER_TABLE_OUT, binIdToModels, False,
            DefaultValues.E_VAL, DefaultValues.LENGTH, True,
            alignmentOutputFolder)

        # report marker gene data
        # Marker ids and lengths are read from the first bin's models only;
        # the lookup is done once instead of once per marker.
        # NOTE(review): presumably every bin shares the same marker set -
        # confirm against makeAlignmentTopHit.
        models = resultsParser.models
        if models:
            firstBinModels = models[list(models.keys())[0]]
        else:
            # no bins: emit a header-only report instead of raising
            firstBinModels = {}

        alignmentInfoFile = os.path.join(alignmentOutputFolder,
                                         'alignment_info.tsv')
        # the context manager closes the file even if a write fails
        with open(alignmentInfoFile, 'w') as fout:
            fout.write('Marker Id\tLength (bp)\n')
            for markerId, model in firstBinModels.items():
                fout.write('%s\t%d\n' % (markerId, model.leng))

        self.logger.info('Alignments to top hits stored in: ' +
                         alignmentOutputFolder)

        self.timeKeeper.printTimeStamp()