def run(self, aaiStrainThreshold, outDir, alignmentOutputFile):
    """Calculate AAI between input alignments.

    For every bin id returned by getBinIdsFromOutDir(outDir), each
    '*.masked.faa' file in <outDir>/storage/aai_qa/<binId> is read with
    readFasta() and self.aai() is evaluated for every pair of sequences in
    that file. Scores are appended to self.aaiRawScores[binId][markerId],
    where markerId is the file name up to its first '.' and binId is the
    prefix of the sequence id preceding DefaultValues.SEQ_CONCAT_CHAR.
    Afterwards self.aaiHetero and self.aaiMeanBinHetero are set from
    self.strainHetero().

    Args:
        aaiStrainThreshold: forwarded unchanged to self.strainHetero().
        outDir: directory containing the 'storage/aai_qa' folder.
        alignmentOutputFile: path of a text report listing each compared
            pair and its AAI; a falsy value disables the report.

    Raises:
        SystemExit: with status 1 if two sequences in the same file carry
            different bin id prefixes.
    """
    self.logger.info('Calculating AAI between multi-copy marker genes.')

    # The report is optional; the handle is released in the finally clause
    # so it is closed even if sys.exit() or an exception interrupts the scan.
    fout = open(alignmentOutputFile, 'w') if alignmentOutputFile else None
    try:
        # calculate AAI for duplicate marker genes
        binIds = getBinIdsFromOutDir(outDir)
        aaiOutputDir = os.path.join(outDir, 'storage', 'aai_qa')
        for binId in binIds:
            binPath = os.path.join(aaiOutputDir, binId)
            if not os.path.exists(binPath):
                continue

            for f in os.listdir(binPath):
                if not f.endswith('.masked.faa'):
                    continue

                markerId = f[0:f.find('.')]
                seqs = readFasta(os.path.join(binPath, f))

                # snapshot the ids once instead of rebuilding the key list
                # on every iteration of the pairwise loops
                seqIds = list(seqs.keys())

                # calculate AAI between all pairs of seqs
                for i, seqIdI in enumerate(seqIds):
                    binIdI = seqIdI[0:seqIdI.find(DefaultValues.SEQ_CONCAT_CHAR)]
                    seqI = seqs[seqIdI]

                    for seqIdJ in seqIds[i + 1:]:
                        binIdJ = seqIdJ[0:seqIdJ.find(DefaultValues.SEQ_CONCAT_CHAR)]
                        seqJ = seqs[seqIdJ]

                        if binIdI != binIdJ:
                            # something is wrong as the bin Ids should always be the same
                            self.logger.error(' [Error] Bin ids do not match.')
                            sys.exit(1)

                        aai = self.aai(seqI, seqJ)

                        if fout is not None:
                            fout.write(binId + ',' + markerId + '\n')
                            fout.write(seqIdI + '\t' + seqI + '\n')
                            fout.write(seqIdJ + '\t' + seqJ + '\n')
                            fout.write('AAI: %.3f\n' % aai)
                            fout.write('\n')

                        if binIdI not in self.aaiRawScores:
                            self.aaiRawScores[binIdI] = defaultdict(list)
                        self.aaiRawScores[binIdI][markerId].append(aai)
    finally:
        if fout is not None:
            fout.close()

    # calculate strain heterogeneity for each marker gene in each bin
    self.aaiHetero, self.aaiMeanBinHetero = self.strainHetero(self.aaiRawScores, aaiStrainThreshold)
def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics):
    """Write a taxonomy table for every bin found in outDir.

    When bLineageStatistics is false only the simple summary table is
    produced; otherwise sister-lineage taxonomy, insertion branch ids and
    lineage metadata are gathered as well and the full table is written.
    """
    # both the output directory and its tree storage folder are checked
    checkDirExists(outDir)
    checkDirExists(os.path.join(outDir, 'storage', 'tree'))

    # taxonomy is required by either table layout
    allBinIds = getBinIdsFromOutDir(outDir)
    taxonomyByBin = self.getBinTaxonomy(outDir, allBinIds)

    if bLineageStatistics:
        # extended report: collect the additional per-bin information
        sisterTaxonomyByBin = self.getBinSisterTaxonomy(outDir, allBinIds)
        branchIdByBin = self.getInsertionBranchId(outDir, allBinIds)
        lineageStatsByBin = self.readLineageMetadata(outDir, allBinIds)
        self.__printFullTable(
            branchIdByBin,
            taxonomyByBin,
            sisterTaxonomyByBin,
            lineageStatsByBin,
            resultsParser,
            binStats,
            bTabTable,
            outFile)
    else:
        self.__printSimpleSummaryTable(taxonomyByBin, resultsParser, bTabTable, outFile)
def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics):
    """Write a taxonomy table for every bin found in outDir.

    When bLineageStatistics is false only the simple summary table is
    produced; otherwise sister-lineage taxonomy and lineage metadata are
    gathered as well and the full table is written.
    """
    # both the output directory and its tree storage folder are checked
    checkDirExists(outDir)
    checkDirExists(os.path.join(outDir, 'storage', 'tree'))

    # taxonomy is required by either table layout
    allBinIds = getBinIdsFromOutDir(outDir)
    taxonomyByBin = self.getBinTaxonomy(outDir, allBinIds)

    # NOTE: reading the weighted ML likelihood from
    # storage/tree/concatenated.pplacer.json is currently disabled.

    if bLineageStatistics:
        # extended report: collect the additional per-bin information
        sisterTaxonomyByBin = self.getBinSisterTaxonomy(outDir, allBinIds)
        lineageStatsByBin = self.readLineageMetadata(outDir, allBinIds)
        self.__printFullTable(
            taxonomyByBin,
            sisterTaxonomyByBin,
            lineageStatsByBin,
            resultsParser,
            binStats,
            bTabTable,
            outFile)
    else:
        self.__printSimpleSummaryTable(taxonomyByBin, resultsParser, bTabTable, outFile)
def run(self, aaiStrainThreshold, outDir, alignmentOutputFile):
    """Calculate AAI between input alignments.

    For every bin id returned by getBinIdsFromOutDir(outDir), each
    '*.masked.faa' file in <outDir>/storage/aai_qa/<binId> is read with
    readFasta() and self.aai() is evaluated for every pair of sequences in
    that file. Scores are appended to self.aaiRawScores[binId][markerId],
    where markerId is the file name up to its first '.' and binId is the
    prefix of the sequence id preceding DefaultValues.SEQ_CONCAT_CHAR.
    Afterwards self.aaiHetero and self.aaiMeanBinHetero are set from
    self.strainHetero().

    Args:
        aaiStrainThreshold: forwarded unchanged to self.strainHetero().
        outDir: directory containing the 'storage/aai_qa' folder.
        alignmentOutputFile: path of a text report listing each compared
            pair and its AAI; a falsy value disables the report.

    Raises:
        SystemExit: with a non-zero status if two sequences in the same
            file carry different bin id prefixes.
    """
    self.logger.info(' Calculating AAI between multi-copy marker genes.')

    fout = None
    if alignmentOutputFile:
        fout = open(alignmentOutputFile, 'w')

    try:
        # calculate AAI for duplicate marker genes
        binIds = getBinIdsFromOutDir(outDir)
        aaiOutputDir = os.path.join(outDir, 'storage', 'aai_qa')
        for binId in binIds:
            binPath = os.path.join(aaiOutputDir, binId)
            if not os.path.exists(binPath):
                continue

            for f in os.listdir(binPath):
                if not f.endswith('.masked.faa'):
                    continue

                markerId = f[0:f.find('.')]
                seqs = readFasta(os.path.join(binPath, f))

                # dict.keys() is not indexable on Python 3, so materialise
                # the ids once; range() replaces the Python 2 only xrange()
                seqIds = list(seqs.keys())
                numSeqs = len(seqIds)

                # calculate AAI between all pairs of seqs
                for i in range(0, numSeqs):
                    seqIdI = seqIds[i]
                    binIdI = seqIdI[0:seqIdI.find(DefaultValues.SEQ_CONCAT_CHAR)]
                    seqI = seqs[seqIdI]

                    for j in range(i + 1, numSeqs):
                        seqIdJ = seqIds[j]
                        binIdJ = seqIdJ[0:seqIdJ.find(DefaultValues.SEQ_CONCAT_CHAR)]
                        seqJ = seqs[seqIdJ]

                        if binIdI == binIdJ:
                            aai = self.aai(seqI, seqJ)

                            if fout is not None:
                                fout.write(binId + ',' + markerId + '\n')
                                fout.write(seqIdI + '\t' + seqI + '\n')
                                fout.write(seqIdJ + '\t' + seqJ + '\n')
                                fout.write('AAI: %.3f\n' % aai)
                                fout.write('\n')

                            if binIdI not in self.aaiRawScores:
                                self.aaiRawScores[binIdI] = defaultdict(list)
                            self.aaiRawScores[binIdI][markerId].append(aai)
                        else:
                            # something is wrong as the bin Ids should always be the same
                            self.logger.error(' [Error] Bin ids do not match.')
                            # a bare sys.exit() reports success (status 0);
                            # signal the failure to the calling shell instead
                            sys.exit(1)
    finally:
        # release the report even when the scan is aborted
        if fout is not None:
            fout.close()

    # calculate strain heterogeneity for each marker gene in each bin
    self.aaiHetero, self.aaiMeanBinHetero = self.strainHetero(self.aaiRawScores, aaiStrainThreshold)
def qa(self, options):
    """QA command.

    Computes AAI for multi-hit markers, loads the per-bin HMM models and
    marker sets from options.analyze_dir, parses the stored results, and
    prints and caches the resulting summary.
    """
    self.logger.info('[CheckM - qa] Tabulating genome statistics.')

    analyzeDir = options.analyze_dir
    checkDirExists(analyzeDir)

    if options.exclude_markers:
        checkFileExists(options.exclude_markers)

    # AAI between markers hit more than once within a single bin
    aaiCalculator = AminoAcidIdentity()
    aaiCalculator.run(options.aai_strain, analyzeDir, options.alignment_file)

    # HMM models and marker sets associated with each bin
    markerSetParser = MarkerSetParser(options.threads)
    modelInfoPath = os.path.join(analyzeDir, 'storage', DefaultValues.CHECKM_HMM_MODEL_INFO)
    modelsByBin = markerSetParser.loadBinModels(modelInfoPath)
    markerSetsByBin = markerSetParser.getMarkerSets(
        analyzeDir,
        getBinIdsFromOutDir(analyzeDir),
        options.marker_file,
        options.exclude_markers)

    # parse, report and cache the results of every bin
    parser = ResultsParser(modelsByBin)
    parser.analyseResults(
        analyzeDir,
        DefaultValues.BIN_STATS_OUT,
        DefaultValues.HMMER_TABLE_OUT,
        bIgnoreThresholds=options.bIgnoreThresholds,
        evalueThreshold=options.e_value,
        lengthThreshold=options.length,
        bSkipPseudoGeneCorrection=options.bSkipPseudoGeneCorrection,
        bSkipAdjCorrection=options.bSkipAdjCorrection)

    parser.printSummary(
        options.out_format,
        aaiCalculator,
        markerSetsByBin,
        options.bIndividualMarkers,
        options.coverage_file,
        options.bTabTable,
        options.file,
        anaFolder=analyzeDir)
    parser.cacheResults(analyzeDir, markerSetsByBin, options.bIndividualMarkers)

    # only mention the output file when one was actually given
    if options.file != '':
        self.logger.info('QA information written to: ' + options.file)

    self.timeKeeper.printTimeStamp()
def getBinMarkerSets(self, outDir, markerFile, numGenomesMarkers, bootstrap, bNoLineageSpecificRefinement, bForceDomain, bRequireTaxonomy, resultsParser, minUnique, maxMulti):
    """Determine marker sets for each bin.

    Loads the tree stored under <outDir>/storage/tree, and for every bin id
    returned by getBinIdsFromOutDir(outDir) collects marker sets while
    ascending from the bin's position in the tree to the root. Bins absent
    from the tree receive the single marker set obtained from the root.
    Each bin's marker sets are written to markerFile after a header line.

    Args:
        outDir: directory holding the 'storage/tree' folder.
        markerFile: path of the marker file to create (overwritten).
        numGenomesMarkers, bootstrap, bRequireTaxonomy: forwarded unchanged
            to the private marker set selection helper.
        bNoLineageSpecificRefinement: when true, lineage-specific removal of
            marker genes is skipped.
        bForceDomain: forwarded to the selection helper; also forced on for
            a bin whose unique hit count is below minUnique or whose
            multi-copy hit count exceeds maxMulti.
        resultsParser: object whose results[binId].countUniqueHits() yields
            (uniqueHits, multiCopyHits).
        minUnique: lower bound on unique hits (see bForceDomain).
        maxMulti: upper bound on multi-copy hits (see bForceDomain).
    """
    self.logger.info(' Determining marker sets for each genome bin.')

    # get all bin ids
    binIds = getBinIdsFromOutDir(outDir)

    # get statistics for internal nodes
    uniqueIdToLineageStatistics = self.readNodeMetadata()

    # determine marker set for each bin
    treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
    tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True)
    rootNode = tree.find_node(filter_fn=lambda n: n.parent_node is None)

    # the context manager closes the marker file even if a bin fails
    with open(markerFile, 'w') as fout:
        fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

        numProcessedBins = 0
        statusStr = ''
        for binId in binIds:
            if self.logger.getEffectiveLevel() <= logging.INFO:
                numProcessedBins += 1
                sys.stderr.write(' ' * len(statusStr) + '\r')  # clear previous line
                statusStr = ' Finished processing %d of %d (%.2f%%) bins (current: %s).' % (
                    numProcessedBins,
                    len(binIds),
                    float(numProcessedBins) * 100 / len(binIds),
                    binId)
                sys.stderr.write('%s\r' % statusStr)
                sys.stderr.flush()

            node = tree.find_node_with_taxon_label(binId)
            binMarkerSets = BinMarkerSets(binId, BinMarkerSets.TREE_MARKER_SET)
            if node is None:
                # bin is not in tree
                node, markerSet = self.__getMarkerSet(
                    rootNode, tree, uniqueIdToLineageStatistics,
                    numGenomesMarkers, bootstrap, bForceDomain, bRequireTaxonomy)
                binMarkerSets.addMarkerSet(markerSet)
            else:
                # special case: if node is on the bacterial or archaeal branch descendant from the root,
                # then move down the tree to include the domain-specific marker set

                # reset per bin so the flag can neither be unbound nor carry
                # over from a previous bin when no labelled ancestor exists
                bRoot = False
                parentNode = node.parent_node
                while parentNode is not None:
                    if parentNode.label:
                        bRoot = (parentNode.parent_node is None)
                        break

                    parentNode = parentNode.parent_node

                if bRoot:
                    # since the root is the first labeled node, we need to descend the
                    # tree to incorporate the domain-specific marker set
                    domainNode = self.__findDomainNode(node)
                    curNode = domainNode.child_nodes()[0]
                else:
                    curNode = node

                # get lineage specific refinement for first node with an id
                # NOTE(review): parentNode is None here if no ancestor carries
                # a label; presumably the root is always labelled - confirm.
                if not bNoLineageSpecificRefinement:
                    uniqueId = parentNode.label.split('|')[0]
                    self.__readLineageSpecificGenesToRemove()
                    lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]

                # ascend tree to root, recording all marker sets meeting selection criteria
                while curNode.parent_node is not None:
                    uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
                    tempForceDomain = bForceDomain or (uniqueHits < minUnique) or (multiCopyHits > maxMulti)

                    curNode, markerSet = self.__getMarkerSet(
                        curNode.parent_node, tree, uniqueIdToLineageStatistics,
                        numGenomesMarkers, bootstrap, tempForceDomain, bRequireTaxonomy)

                    if not bNoLineageSpecificRefinement:
                        markerSet = self.__removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)

                    binMarkerSets.addMarkerSet(markerSet)

            binMarkerSets.write(fout)

        # terminate the in-place status line
        if self.logger.getEffectiveLevel() <= logging.INFO:
            sys.stderr.write('\n')
def getBinMarkerSets(self, outDir, markerFile, numGenomesMarkers, bootstrap, bNoLineageSpecificRefinement, bForceDomain, bRequireTaxonomy, resultsParser, minUnique, maxMulti):
    """Determine marker sets for each bin.

    Loads the tree stored under <outDir>/storage/tree, and for every bin id
    returned by getBinIdsFromOutDir(outDir) collects marker sets while
    ascending from the bin's position in the tree to the root. Bins absent
    from the tree receive the single marker set obtained from the root.
    Each bin's marker sets are written to markerFile after a header line.

    Args:
        outDir: directory holding the 'storage/tree' folder.
        markerFile: path of the marker file to create (overwritten).
        numGenomesMarkers, bootstrap, bRequireTaxonomy: forwarded unchanged
            to the private marker set selection helper.
        bNoLineageSpecificRefinement: when true, lineage-specific removal of
            marker genes is skipped.
        bForceDomain: forwarded to the selection helper; also forced on for
            a bin whose unique hit count is below minUnique or whose
            multi-copy hit count exceeds maxMulti.
        resultsParser: object whose results[binId].countUniqueHits() yields
            (uniqueHits, multiCopyHits).
        minUnique: lower bound on unique hits (see bForceDomain).
        maxMulti: upper bound on multi-copy hits (see bForceDomain).
    """
    self.logger.info(' Determining marker sets for each genome bin.')

    # get all bin ids
    binIds = getBinIdsFromOutDir(outDir)

    # get statistics for internal nodes
    uniqueIdToLineageStatistics = self.readNodeMetadata()

    # determine marker set for each bin
    treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
    tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
    rootNode = tree.find_node(filter_fn=lambda n: n.parent_node is None)

    # the context manager closes the marker file even if a bin fails
    with open(markerFile, 'w') as fout:
        fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

        numProcessedBins = 0
        for binId in binIds:
            if self.logger.getEffectiveLevel() <= logging.INFO:
                numProcessedBins += 1
                statusStr = ' Finished processing %d of %d (%.2f%%) bins.' % (
                    numProcessedBins,
                    len(binIds),
                    float(numProcessedBins) * 100 / len(binIds))
                sys.stderr.write('%s\r' % statusStr)
                sys.stderr.flush()

            node = tree.find_node_with_taxon_label(binId)
            binMarkerSets = BinMarkerSets(binId, BinMarkerSets.TREE_MARKER_SET)
            if node is None:
                # bin is not in tree
                node, markerSet = self.__getMarkerSet(
                    rootNode, tree, uniqueIdToLineageStatistics,
                    numGenomesMarkers, bootstrap, bForceDomain, bRequireTaxonomy)
                binMarkerSets.addMarkerSet(markerSet)
            else:
                # special case: if node is on the bacterial or archaeal branch descendant from the root,
                # then move down the tree to include the domain-specific marker set

                # reset per bin so the flag can neither be unbound nor carry
                # over from a previous bin when no labelled ancestor exists
                bRoot = False
                parentNode = node.parent_node
                while parentNode is not None:
                    if parentNode.label:
                        bRoot = (parentNode.parent_node is None)
                        break

                    parentNode = parentNode.parent_node

                if bRoot:
                    # since the root is the first labeled node, we need to descend the
                    # tree to incorporate the domain-specific marker set
                    domainNode = self.__findDomainNode(node)
                    curNode = domainNode.child_nodes()[0]
                else:
                    curNode = node

                # get lineage specific refinement for first node with an id
                # NOTE(review): parentNode is None here if no ancestor carries
                # a label; presumably the root is always labelled - confirm.
                if not bNoLineageSpecificRefinement:
                    uniqueId = parentNode.label.split('|')[0]
                    self.__readLineageSpecificGenesToRemove()
                    lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]

                # ascend tree to root, recording all marker sets meeting selection criteria
                while curNode.parent_node is not None:
                    uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
                    tempForceDomain = bForceDomain or (uniqueHits < minUnique) or (multiCopyHits > maxMulti)

                    curNode, markerSet = self.__getMarkerSet(
                        curNode.parent_node, tree, uniqueIdToLineageStatistics,
                        numGenomesMarkers, bootstrap, tempForceDomain, bRequireTaxonomy)

                    if not bNoLineageSpecificRefinement:
                        markerSet = self.__removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)

                    binMarkerSets.addMarkerSet(markerSet)

            binMarkerSets.write(fout)

        # terminate the in-place status line
        if self.logger.getEffectiveLevel() <= logging.INFO:
            sys.stderr.write('\n')
def analyze(self, options, db=None):
    """Analyze command.

    Validates the bin files, creates the output directory layout, finds
    marker genes in each bin, stores the per-bin HMM model information,
    aligns markers with multiple hits within a bin, computes bin
    statistics and, when options.bAlignTopHit is set, aligns the top hit
    to each marker and writes 'alignment_info.tsv' next to the alignments.

    Args:
        options: parsed command-line options; the attributes read are
            those referenced in the body below.
        db: not used by this method.

    Returns:
        None. Returns early, before any output is created, when the
        sequence check for the bin files fails.
    """
    self.logger.info('[CheckM - analyze] Identifying marker genes in bins.')

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # bins must hold nucleotide sequences unless genes were already called
    if not options.bCalledGenes:
        if not checkNuclotideSeqs(binFiles):
            return
    else:
        if not checkProteinSeqs(binFiles):
            return

    # setup directory structure
    makeSurePathExists(options.output_dir)
    makeSurePathExists(os.path.join(options.output_dir, 'bins'))
    makeSurePathExists(os.path.join(options.output_dir, 'storage'))
    makeSurePathExists(os.path.join(options.output_dir, 'storage', 'aai_qa'))

    # find marker genes in genome bins
    mgf = MarkerGeneFinder(options.threads)
    binIdToModels = mgf.find(binFiles,
                             options.output_dir,
                             DefaultValues.HMMER_TABLE_OUT,
                             DefaultValues.HMMER_OUT,
                             options.marker_file,
                             options.bKeepAlignment,
                             options.bNucORFs,
                             options.bCalledGenes)

    markerSetParser = MarkerSetParser(options.threads)
    binIdToBinMarkerSets = markerSetParser.getMarkerSets(
        options.output_dir,
        getBinIdsFromOutDir(options.output_dir),
        options.marker_file)

    hmmModelInfoFile = os.path.join(options.output_dir, 'storage', DefaultValues.CHECKM_HMM_MODEL_INFO)
    markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)

    self.timeKeeper.printTimeStamp()

    # HMM model file
    if markerSetParser.markerFileType(options.marker_file) == BinMarkerSets.HMM_MODELS_SET:
        markerFile = options.marker_file
    else:
        markerFile = DefaultValues.HMM_MODELS

    # align marker genes with multiple hits within a bin
    HA = HmmerAligner(options.threads)
    HA.makeAlignmentsOfMultipleHits(
        options.output_dir,
        markerFile,
        DefaultValues.HMMER_TABLE_OUT,
        binIdToModels,
        binIdToBinMarkerSets,
        False,
        DefaultValues.E_VAL,
        DefaultValues.LENGTH,
        os.path.join(options.output_dir, 'storage', 'aai_qa'))

    self.timeKeeper.printTimeStamp()

    # calculate statistics for each genome bin
    binStats = BinStatistics(options.threads)
    binStats.calculate(binFiles, options.output_dir, DefaultValues.BIN_STATS_OUT)

    self.timeKeeper.printTimeStamp()

    # align top hit to each marker if requested
    if options.bAlignTopHit:
        alignmentOutputFolder = os.path.join(options.output_dir, 'storage', 'alignments')
        makeSurePathExists(alignmentOutputFolder)

        HA = HmmerAligner(options.threads)
        resultsParser = HA.makeAlignmentTopHit(
            options.output_dir,
            options.marker_file,
            DefaultValues.HMMER_TABLE_OUT,
            binIdToModels,
            False,
            DefaultValues.E_VAL,
            DefaultValues.LENGTH,
            True,
            alignmentOutputFolder)

        # report marker gene data; the models of the first bin are looked
        # up once rather than for every marker
        firstBinId = list(resultsParser.models.keys())[0]
        firstBinModels = resultsParser.models[firstBinId]

        # the context manager closes the table even if a write fails
        with open(os.path.join(alignmentOutputFolder, 'alignment_info.tsv'), 'w') as fout:
            fout.write('Marker Id\tLength (bp)\n')
            for markerId in firstBinModels.keys():
                fout.write('%s\t%d\n' % (markerId, firstBinModels[markerId].leng))

        self.logger.info('Alignments to top hits stored in: ' + alignmentOutputFolder)

        self.timeKeeper.printTimeStamp()