def markerSet(self, rank, taxon, markerFile):
    """Obtain specified taxonomic-specific marker set.

    Looks up the marker set stored for `taxon` under `rank`, splits that
    marker set's `lineageStr` on ';', and visits the entries from the last
    one to the first. The marker set of every visited entry is added to a
    single BinMarkerSets object, which is then written to `markerFile`
    after the taxon-marker file header.

    Parameters:
        rank: key into the mapping returned by self.readMarkerSets().
        taxon: key into the per-rank mapping for `rank`.
        markerFile: path of the file to (over)write with the marker sets.

    Returns:
        True once the file has been written; False (after logging an
        error, and without touching `markerFile`) when `rank` or `taxon`
        is not present in the marker-set table.

    Side effect: the `lineageStr` attribute of every collected marker set
    is overwritten with the name used to look it up.
    """
    taxonMarkerSets = self.readMarkerSets()

    if rank not in taxonMarkerSets:
        self.logger.error(' Unrecognized taxonomic rank: ' + rank)
        return False
    if taxon not in taxonMarkerSets[rank]:
        self.logger.error(' Unrecognized taxon: %s (in rank %s): ' % (taxon, rank))
        return False

    requestedSet = taxonMarkerSets[rank][taxon]

    # reversed so that iteration starts at the final entry of the lineage
    taxonomy = requestedSet.lineageStr.split(';')[::-1]

    binMarkerSets = BinMarkerSets(taxon, BinMarkerSets.TAXONOMIC_MARKER_SET)

    # Separate loop-local names keep the caller's `rank`/`taxon` intact.
    # Once the current rank is 'life' it is never recomputed, exactly as
    # when the parameter itself was being rebound.
    curRank = rank
    for i, lineageTaxon in enumerate(taxonomy):
        if curRank != 'life':
            curRank = ranksByLevel[len(taxonomy) - i - 1]

        if curRank == 'species':
            # species sets are keyed by the two lowest lineage entries
            # joined with a space (presumably "genus species" -- confirm)
            lineageTaxon = taxonomy[1] + ' ' + taxonomy[0]

        lineageSet = taxonMarkerSets[curRank][lineageTaxon]
        numMarkers, numMarkerSets = lineageSet.size()
        self.logger.info(
            ' Marker set for %s contains %d marker genes arranged in %d sets.'
            % (lineageTaxon, numMarkers, numMarkerSets))
        self.logger.info(
            ' Marker set inferred from %d reference genomes.'
            % lineageSet.numGenomes)

        lineageSet.lineageStr = lineageTaxon
        binMarkerSets.addMarkerSet(lineageSet)

    # context manager guarantees the handle is closed even if writing fails
    with open(markerFile, 'w') as fout:
        fout.write(DefaultValues.TAXON_MARKER_FILE_HEADER + '\n')
        binMarkerSets.write(fout)

    return True
def markerSet(self, rank, taxon, markerFile):
    """Obtain specified taxonomic-specific marker set.

    Looks up the marker set stored for `taxon` under `rank`, splits that
    marker set's `lineageStr` on ';', and visits the entries from the last
    one to the first. The marker set of every visited entry is added to a
    single BinMarkerSets object, which is then written to `markerFile`
    after the taxon-marker file header.

    Parameters:
        rank: key into the mapping returned by self.readMarkerSets().
        taxon: key into the per-rank mapping for `rank`.
        markerFile: path of the file to (over)write with the marker sets.

    Returns:
        True once the file has been written; False (after logging an
        error, and without touching `markerFile`) when `rank` or `taxon`
        is not present in the marker-set table.

    Side effect: the `lineageStr` attribute of every collected marker set
    is overwritten with the name used to look it up.
    """
    taxonMarkerSets = self.readMarkerSets()

    if rank not in taxonMarkerSets:
        self.logger.error(' Unrecognized taxonomic rank: ' + rank)
        return False
    if taxon not in taxonMarkerSets[rank]:
        self.logger.error(' Unrecognized taxon: %s (in rank %s): ' % (taxon, rank))
        return False

    requestedSet = taxonMarkerSets[rank][taxon]

    # reversed so that iteration starts at the final entry of the lineage
    taxonomy = requestedSet.lineageStr.split(';')[::-1]

    binMarkerSets = BinMarkerSets(taxon, BinMarkerSets.TAXONOMIC_MARKER_SET)

    # Separate loop-local names keep the caller's `rank`/`taxon` intact.
    # Once the current rank is 'life' it is never recomputed, exactly as
    # when the parameter itself was being rebound.
    curRank = rank
    for i, lineageTaxon in enumerate(taxonomy):
        if curRank != 'life':
            curRank = ranksByLevel[len(taxonomy) - i - 1]

        if curRank == 'species':
            # species sets are keyed by the two lowest lineage entries
            # joined with a space (presumably "genus species" -- confirm)
            lineageTaxon = taxonomy[1] + ' ' + taxonomy[0]

        lineageSet = taxonMarkerSets[curRank][lineageTaxon]
        numMarkers, numMarkerSets = lineageSet.size()
        self.logger.info(' Marker set for %s contains %d marker genes arranged in %d sets.' % (lineageTaxon, numMarkers, numMarkerSets))
        self.logger.info(' Marker set inferred from %d reference genomes.' % lineageSet.numGenomes)

        lineageSet.lineageStr = lineageTaxon
        binMarkerSets.addMarkerSet(lineageSet)

    # context manager guarantees the handle is closed even if writing fails
    with open(markerFile, 'w') as fout:
        fout.write(DefaultValues.TAXON_MARKER_FILE_HEADER + '\n')
        binMarkerSets.write(fout)

    return True
def getBinMarkerSets(self, outDir, markerFile,
                     numGenomesMarkers,
                     bootstrap, bNoLineageSpecificRefinement,
                     bForceDomain, bRequireTaxonomy,
                     resultsParser, minUnique, maxMulti):
    """Determine marker sets for each bin.

    Loads the placement tree stored under `outDir`, and for every bin id
    found in `outDir` collects one or more marker sets into a
    BinMarkerSets object that is written to `markerFile` (after the
    lineage-marker file header).

    For a bin that is absent from the tree a single marker set is obtained
    starting from the root node. For a bin that is in the tree, the tree
    is ascended towards the root and the marker set returned at every step
    is recorded.

    Parameters:
        outDir: output directory; the tree is read from
            <outDir>/storage/tree/<DefaultValues.PPLACER_TREE_OUT>.
        markerFile: path of the file to (over)write.
        numGenomesMarkers, bootstrap, bRequireTaxonomy: forwarded
            unchanged to the marker-set selection helper.
        bNoLineageSpecificRefinement: when false, genes listed for the
            first labelled ancestor of the bin are removed from every
            marker set recorded for that bin.
        bForceDomain: forwarded to the selection helper; for bins in the
            tree it is additionally forced on when the bin has fewer than
            `minUnique` unique hits or more than `maxMulti` multi-copy hits.
        resultsParser: object whose `results[binId].countUniqueHits()`
            yields a (uniqueHits, multiCopyHits) pair.
        minUnique, maxMulti: thresholds described under `bForceDomain`.

    Returns:
        None. Progress is written to stderr when the logger's effective
        level is INFO or lower.
    """
    self.logger.info(' Determining marker sets for each genome bin.')

    # get all bin ids
    binIds = getBinIdsFromOutDir(outDir)

    # get statistics for internal nodes
    uniqueIdToLineageStatistics = self.readNodeMetadata()

    # determine marker set for each bin
    treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
    tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True)
    rootNode = tree.find_node(filter_fn=lambda n: n.parent_node is None)

    # context manager guarantees the handle is closed even if a bin fails
    with open(markerFile, 'w') as fout:
        fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

        numProcessedBins = 0
        statusStr = ''
        for binId in binIds:
            if self.logger.getEffectiveLevel() <= logging.INFO:
                numProcessedBins += 1
                sys.stderr.write(' ' * len(statusStr) + '\r')  # clear previous line
                statusStr = ' Finished processing %d of %d (%.2f%%) bins (current: %s).' % (
                    numProcessedBins,
                    len(binIds),
                    float(numProcessedBins) * 100 / len(binIds),
                    binId)
                sys.stderr.write('%s\r' % statusStr)
                sys.stderr.flush()

            node = tree.find_node_with_taxon_label(binId)
            binMarkerSets = BinMarkerSets(binId, BinMarkerSets.TREE_MARKER_SET)
            if node is None:
                # bin is not in tree
                node, markerSet = self.__getMarkerSet(
                    rootNode, tree, uniqueIdToLineageStatistics,
                    numGenomesMarkers, bootstrap, bForceDomain, bRequireTaxonomy)
                binMarkerSets.addMarkerSet(markerSet)
            else:
                # special case: if node is on the bacterial or archaeal branch descendant from the root,
                # then move down the tree to include the domain-specific marker set

                # reset for every bin: previously this was only assigned inside
                # the loop below, so it was unbound (or carried over from the
                # previous bin) when no labelled ancestor was found
                bRoot = False
                parentNode = node.parent_node
                while parentNode is not None:
                    if parentNode.label:
                        bRoot = (parentNode.parent_node is None)
                        break

                    parentNode = parentNode.parent_node

                if bRoot:
                    # since the root is the first labeled node, we need to descend the
                    # tree to incorporate the domain-specific marker set
                    domainNode = self.__findDomainNode(node)
                    curNode = domainNode.child_nodes()[0]
                else:
                    curNode = node

                # get lineage specific refinement for first node with an id
                # NOTE(review): assumes a labelled ancestor was found above;
                # parentNode is None here otherwise -- confirm the tree always
                # carries a label on the root
                if not bNoLineageSpecificRefinement:
                    uniqueId = parentNode.label.split('|')[0]
                    self.__readLineageSpecificGenesToRemove()
                    lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]

                # the hit counts depend only on the bin, not on the position in
                # the tree, so they are computed once rather than on every step
                uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
                tempForceDomain = bForceDomain or (uniqueHits < minUnique) or (multiCopyHits > maxMulti)

                # ascend tree to root, recording all marker sets meeting selection criteria
                while curNode.parent_node is not None:
                    curNode, markerSet = self.__getMarkerSet(
                        curNode.parent_node, tree, uniqueIdToLineageStatistics,
                        numGenomesMarkers, bootstrap, tempForceDomain, bRequireTaxonomy)

                    if not bNoLineageSpecificRefinement:
                        markerSet = self.__removeInvalidLineageMarkerGenes(
                            markerSet, lineageSpecificRefinement)

                    binMarkerSets.addMarkerSet(markerSet)

            binMarkerSets.write(fout)

        if self.logger.getEffectiveLevel() <= logging.INFO:
            # terminate the carriage-return based status line
            sys.stderr.write('\n')
def getBinMarkerSets(self, outDir, markerFile,
                     numGenomesMarkers,
                     bootstrap, bNoLineageSpecificRefinement,
                     bForceDomain, bRequireTaxonomy,
                     resultsParser, minUnique, maxMulti):
    """Determine marker sets for each bin.

    Loads the placement tree stored under `outDir`, and for every bin id
    found in `outDir` collects one or more marker sets into a
    BinMarkerSets object that is written to `markerFile` (after the
    lineage-marker file header).

    For a bin that is absent from the tree a single marker set is obtained
    starting from the root node. For a bin that is in the tree, the tree
    is ascended towards the root and the marker set returned at every step
    is recorded.

    Parameters:
        outDir: output directory; the tree is read from
            <outDir>/storage/tree/<DefaultValues.PPLACER_TREE_OUT>.
        markerFile: path of the file to (over)write.
        numGenomesMarkers, bootstrap, bRequireTaxonomy: forwarded
            unchanged to the marker-set selection helper.
        bNoLineageSpecificRefinement: when false, genes listed for the
            first labelled ancestor of the bin are removed from every
            marker set recorded for that bin.
        bForceDomain: forwarded to the selection helper; for bins in the
            tree it is additionally forced on when the bin has fewer than
            `minUnique` unique hits or more than `maxMulti` multi-copy hits.
        resultsParser: object whose `results[binId].countUniqueHits()`
            yields a (uniqueHits, multiCopyHits) pair.
        minUnique, maxMulti: thresholds described under `bForceDomain`.

    Returns:
        None. Progress is written to stderr when the logger's effective
        level is INFO or lower.
    """
    self.logger.info(' Determining marker sets for each genome bin.')

    # get all bin ids
    binIds = getBinIdsFromOutDir(outDir)

    # get statistics for internal nodes
    uniqueIdToLineageStatistics = self.readNodeMetadata()

    # determine marker set for each bin
    treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
    # NOTE(review): `as_rooted` is the older DendroPy spelling of the rooting
    # option; kept as-is because the installed DendroPy version is not
    # visible from here -- confirm before upgrading the dependency
    tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
    rootNode = tree.find_node(filter_fn=lambda n: n.parent_node is None)

    # context manager guarantees the handle is closed even if a bin fails
    with open(markerFile, 'w') as fout:
        fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

        numProcessedBins = 0
        for binId in binIds:
            if self.logger.getEffectiveLevel() <= logging.INFO:
                numProcessedBins += 1
                statusStr = ' Finished processing %d of %d (%.2f%%) bins.' % (
                    numProcessedBins,
                    len(binIds),
                    float(numProcessedBins) * 100 / len(binIds))
                sys.stderr.write('%s\r' % statusStr)
                sys.stderr.flush()

            node = tree.find_node_with_taxon_label(binId)
            binMarkerSets = BinMarkerSets(binId, BinMarkerSets.TREE_MARKER_SET)
            if node is None:
                # bin is not in tree
                node, markerSet = self.__getMarkerSet(
                    rootNode, tree, uniqueIdToLineageStatistics,
                    numGenomesMarkers, bootstrap, bForceDomain, bRequireTaxonomy)
                binMarkerSets.addMarkerSet(markerSet)
            else:
                # special case: if node is on the bacterial or archaeal branch descendant from the root,
                # then move down the tree to include the domain-specific marker set

                # reset for every bin: previously this was only assigned inside
                # the loop below, so it was unbound (or carried over from the
                # previous bin) when no labelled ancestor was found
                bRoot = False
                parentNode = node.parent_node
                while parentNode is not None:
                    if parentNode.label:
                        bRoot = (parentNode.parent_node is None)
                        break

                    parentNode = parentNode.parent_node

                if bRoot:
                    # since the root is the first labeled node, we need to descend the
                    # tree to incorporate the domain-specific marker set
                    domainNode = self.__findDomainNode(node)
                    curNode = domainNode.child_nodes()[0]
                else:
                    curNode = node

                # get lineage specific refinement for first node with an id
                # NOTE(review): assumes a labelled ancestor was found above;
                # parentNode is None here otherwise -- confirm the tree always
                # carries a label on the root
                if not bNoLineageSpecificRefinement:
                    uniqueId = parentNode.label.split('|')[0]
                    self.__readLineageSpecificGenesToRemove()
                    lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]

                # the hit counts depend only on the bin, not on the position in
                # the tree, so they are computed once rather than on every step
                uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
                tempForceDomain = bForceDomain or (uniqueHits < minUnique) or (multiCopyHits > maxMulti)

                # ascend tree to root, recording all marker sets meeting selection criteria
                while curNode.parent_node is not None:
                    curNode, markerSet = self.__getMarkerSet(
                        curNode.parent_node, tree, uniqueIdToLineageStatistics,
                        numGenomesMarkers, bootstrap, tempForceDomain, bRequireTaxonomy)

                    if not bNoLineageSpecificRefinement:
                        markerSet = self.__removeInvalidLineageMarkerGenes(
                            markerSet, lineageSpecificRefinement)

                    binMarkerSets.addMarkerSet(markerSet)

            binMarkerSets.write(fout)

        if self.logger.getEffectiveLevel() <= logging.INFO:
            # terminate the carriage-return based status line
            sys.stderr.write('\n')