def tree(self, options):
    """Place genome bins in the reference genome tree ('tree' command).

    Reads the bin files found under ``options.bin_dir`` (filtered by
    ``options.extension``), validates them, identifies the phylogenetic
    marker genes, aligns them, and hands the alignment results to pplacer.
    All output is written beneath ``options.output_dir``.

    The method returns early (with ``None``) when sequence validation
    fails; it has no other return value.
    """
    self.logger.info(
        '[CheckM - tree] Placing bins in reference genome tree.')

    genomeFiles = self.binFiles(options.bin_dir, options.extension)

    # Bins holding called genes are validated as protein sequences,
    # everything else as nucleotide sequences.
    if options.bCalledGenes:
        seqsAreValid = checkProteinSeqs(genomeFiles)
    else:
        seqsAreValid = checkNuclotideSeqs(genomeFiles)
    if not seqsAreValid:
        return

    # setup directory structure
    outDir = options.output_dir
    for subDir in ('bins', 'storage'):
        makeSurePathExists(os.path.join(outDir, subDir))
    storageDir = os.path.join(outDir, 'storage')

    # find phylogenetically informative genes in genome bins
    geneFinder = MarkerGeneFinder(options.threads)
    modelsPerBin = geneFinder.find(
        genomeFiles,
        outDir,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
        DefaultValues.HMMER_PHYLO_OUT,
        DefaultValues.PHYLO_HMM_MODELS,
        options.bKeepAlignment,
        options.bNucORFs,
        options.bCalledGenes)

    # write model information to file
    setParser = MarkerSetParser(options.threads)
    modelInfoPath = os.path.join(
        storageDir, DefaultValues.PHYLO_HMM_MODEL_INFO)
    setParser.writeBinModels(modelsPerBin, modelInfoPath)

    # calculate statistics for each genome bin
    statsCalculator = BinStatistics(options.threads)
    statsCalculator.calculate(
        genomeFiles, outDir, DefaultValues.BIN_STATS_PHYLO_OUT)

    # align identified marker genes; alignments go to <output_dir>/storage/tree
    treeAlignmentDir = os.path.join(storageDir, 'tree')
    aligner = HmmerAligner(options.threads)
    alignmentResults = aligner.makeAlignmentToPhyloMarkers(
        outDir,
        DefaultValues.PHYLO_HMM_MODELS,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
        modelsPerBin,
        False,
        DefaultValues.E_VAL,
        DefaultValues.LENGTH,
        False,
        treeAlignmentDir)

    # place bins into genome tree; pplacer gets its own thread setting
    # (options.pplacer_threads) rather than options.threads -- the original
    # note here cites keeping memory requirements reasonable
    placer = PplacerRunner(threads=options.pplacer_threads)
    placer.run(genomeFiles, alignmentResults, outDir, options.bReducedTree)

    self.timeKeeper.printTimeStamp()
def analyze(self, options, db=None):
    """Identify marker genes in genome bins ('analyze' command).

    Steps performed, all writing beneath ``options.output_dir``:

    1. Validate the bin files found under ``options.bin_dir``.
    2. Find marker genes with ``MarkerGeneFinder`` and record the models
       used for each bin.
    3. Align marker genes that have multiple hits within a bin
       (output in ``storage/aai_qa``).
    4. Calculate per-bin statistics.
    5. If ``options.bAlignTopHit`` is set, align the top hit to each
       marker (output in ``storage/alignments``) and write
       ``alignment_info.tsv`` listing each marker id and its length.

    Args:
        options: parsed command options; the attributes read here are
            bin_dir, extension, bCalledGenes, output_dir, threads,
            marker_file, bKeepAlignment, bNucORFs and bAlignTopHit.
        db: not used by this method; retained so existing callers that
            pass it keep working.

    Returns:
        None. Returns early when sequence validation fails.
    """
    self.logger.info(
        '[CheckM - analyze] Identifying marker genes in bins.')

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # Bins holding called genes are validated as protein sequences,
    # everything else as nucleotide sequences.
    if not options.bCalledGenes:
        if not checkNuclotideSeqs(binFiles):
            return
    else:
        if not checkProteinSeqs(binFiles):
            return

    # setup directory structure
    storageDir = os.path.join(options.output_dir, 'storage')
    aaiQaDir = os.path.join(storageDir, 'aai_qa')
    makeSurePathExists(options.output_dir)
    makeSurePathExists(os.path.join(options.output_dir, 'bins'))
    makeSurePathExists(storageDir)
    makeSurePathExists(aaiQaDir)

    # find marker genes in genome bins
    mgf = MarkerGeneFinder(options.threads)
    binIdToModels = mgf.find(binFiles,
                             options.output_dir,
                             DefaultValues.HMMER_TABLE_OUT,
                             DefaultValues.HMMER_OUT,
                             options.marker_file,
                             options.bKeepAlignment,
                             options.bNucORFs,
                             options.bCalledGenes)

    markerSetParser = MarkerSetParser(options.threads)
    binIdToBinMarkerSets = markerSetParser.getMarkerSets(
        options.output_dir,
        getBinIdsFromOutDir(options.output_dir),
        options.marker_file)

    hmmModelInfoFile = os.path.join(storageDir,
                                    DefaultValues.CHECKM_HMM_MODEL_INFO)
    markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)

    self.timeKeeper.printTimeStamp()

    # HMM model file: use the user-supplied marker file only when it is
    # itself a set of HMM models, otherwise fall back to the default models
    if markerSetParser.markerFileType(
            options.marker_file) == BinMarkerSets.HMM_MODELS_SET:
        markerFile = options.marker_file
    else:
        markerFile = DefaultValues.HMM_MODELS

    # align marker genes with multiple hits within a bin
    HA = HmmerAligner(options.threads)
    HA.makeAlignmentsOfMultipleHits(options.output_dir,
                                    markerFile,
                                    DefaultValues.HMMER_TABLE_OUT,
                                    binIdToModels,
                                    binIdToBinMarkerSets,
                                    False,
                                    DefaultValues.E_VAL,
                                    DefaultValues.LENGTH,
                                    aaiQaDir)

    self.timeKeeper.printTimeStamp()

    # calculate statistics for each genome bin
    binStats = BinStatistics(options.threads)
    binStats.calculate(binFiles,
                       options.output_dir,
                       DefaultValues.BIN_STATS_OUT)

    self.timeKeeper.printTimeStamp()

    # align top hit to each marker if requested
    if options.bAlignTopHit:
        alignmentOutputFolder = os.path.join(storageDir, 'alignments')
        makeSurePathExists(alignmentOutputFolder)

        # NOTE(review): this passes options.marker_file, not the
        # `markerFile` resolved above -- confirm that is intentional.
        HA = HmmerAligner(options.threads)
        resultsParser = HA.makeAlignmentTopHit(
            options.output_dir,
            options.marker_file,
            DefaultValues.HMMER_TABLE_OUT,
            binIdToModels,
            False,
            DefaultValues.E_VAL,
            DefaultValues.LENGTH,
            True,
            alignmentOutputFolder)

        # report marker gene data, taking the marker models from the first
        # bin in the results; an empty result set yields a header-only file
        # instead of raising IndexError
        firstBinModels = next(iter(resultsParser.models.values()), {})

        alignmentInfoFile = os.path.join(alignmentOutputFolder,
                                         'alignment_info.tsv')
        # context manager guarantees the file is closed even if a write fails
        with open(alignmentInfoFile, 'w') as fout:
            fout.write('Marker Id\tLength (bp)\n')
            for markerId in firstBinModels.keys():
                fout.write('%s\t%d\n'
                           % (markerId, firstBinModels[markerId].leng))

        self.logger.info('Alignments to top hits stored in: '
                         + alignmentOutputFolder)

        self.timeKeeper.printTimeStamp()