def parse_label(label):
    """Parse a Newick label into support value, taxon, and auxiliary info.

    The label is read as ``<support>:<taxon>|<auxiliary info>``, where each
    part is optional. Without a ':' the remaining text is treated as a support
    value if it is numeric, and as a taxon otherwise.

    Parameters
    ----------
    label : str
        Internal label in a Newick tree.

    Returns
    -------
    float
        Support value specified by label, or None
    str
        Taxon specified by label, or None
    str
        Auxiliary information, or None

    Raises
    ------
    ValueError
        If the label contains a ':' and the text before the first ':' cannot
        be converted to a float.
    """
    support = None
    taxon = None
    auxiliary_info = None

    if label:
        label = label.strip()

        if '|' in label:
            # Split on the first '|' only: the auxiliary information may
            # itself contain '|' and must not break the two-way unpacking.
            label, auxiliary_info = label.split('|', 1)

        if ':' in label:
            # Likewise split once so that a ':' inside the taxon portion is
            # kept as part of the taxon instead of raising on unpacking.
            support, taxon = label.split(':', 1)
            support = float(support)
        elif is_float(label):
            support = float(label)
        elif label != '':
            taxon = label

    return support, taxon, auxiliary_info
def read_from_tree(self, tree, warnings=True):
    """Obtain the taxonomy for each extant taxa as specified by internal tree labels.

    For every leaf, the labels of its ancestors are collected from the leaf's
    parent up to the root and assembled into a taxonomy list ordered from the
    root-most label to the leaf-most label. Labels that are plain numbers are
    treated as support values and ignored.

    Parameters
    ----------
    tree : str or dendropy.Tree
        Filename of newick tree or dendropy tree object.
    warnings : bool
        If True, log a warning for each leaf whose collected taxonomy has
        more than 7 entries.

    Returns
    -------
    dict : d[unique_id] -> [d__<taxon>, ..., s__<taxon>]
        Taxa indexed by unique ids.
    """
    if isinstance(tree, str):
        tree = dendropy.Tree.get_from_path(tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

    # Concatenated ranks of the form p__Crenarchaeota__c__Thermoprotei are
    # separated by rewriting '__<prefix>' to ';<prefix>'. The replacement
    # pairs do not depend on the tree, so build them once.
    rank_splits = [('__' + prefix, ';' + prefix)
                   for prefix in Taxonomy.rank_prefixes]

    taxonomy = {}
    for leaf in tree.leaf_node_iter():
        taxa = []

        node = leaf.parent_node
        while node:
            if node.label:
                taxa_str = node.label
                if ':' in taxa_str:
                    # keep the text following the first ':'
                    taxa_str = taxa_str.split(':')[1]

                # Skip labels with nothing after the ':' (e.g. "95:"), which
                # previously raised IndexError on taxa_str[-1], and labels
                # that are simply a support value.
                if taxa_str and not is_float(taxa_str):
                    if taxa_str.endswith(';'):
                        taxa_str = taxa_str[:-1]

                    for split_str, replacement in rank_splits:
                        taxa_str = taxa_str.replace(split_str, replacement)

                    # ancestors are visited leaf-to-root, so prepend to keep
                    # the list ordered from highest to lowest rank
                    taxa = [x.strip() for x in taxa_str.split(';')] + taxa

            node = node.parent_node

        if warnings and len(taxa) > 7:
            self.logger.warning(
                'Invalid taxonomy string read from tree for taxon %s: %s' % (leaf.taxon.label, taxa))

        # check if genus name should be appended to species label
        if len(taxa) == 7:
            # drop the leading 3 characters (presumably the 'g__' / 's__'
            # rank prefixes) before comparing the names
            genus = taxa[5][3:]
            species = taxa[6][3:]
            if genus not in species:
                taxa[6] = 's__' + genus + ' ' + species

        taxa = self.fill_trailing_ranks(taxa)
        taxonomy[leaf.taxon.label] = taxa

    return taxonomy