def read_from_tree(self, tree, warnings=True):
    """Obtain the taxonomy for each extant taxa as specified by internal tree labels.

    Every leaf is traced back to the root; the taxa named by each labelled
    ancestor are prepended to the running list, so the result is ordered
    from the root-most label to the leaf-most label.

    Parameters
    ----------
    tree : str or dendropy.Tree
        Filename of newick tree or dendropy tree object.
    warnings : bool
        If True, log a warning for each leaf whose collected taxonomy
        contains more than 7 entries.

    Returns
    -------
    dict : d[unique_id] -> [d__<taxon>, ..., s__<taxon>]
        Taxa indexed by unique ids.
    """

    if isinstance(tree, str):
        tree = dendropy.Tree.get_from_path(tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

    taxonomy = {}
    for leaf in tree.leaf_node_iter():
        taxa = []

        node = leaf.parent_node
        while node:
            if node.label:
                taxa_str = node.label

                # labels of the form <support>:<taxa> carry the taxa after the colon
                if ':' in taxa_str:
                    taxa_str = taxa_str.split(':')[1]

                # An empty remainder (e.g. the label '100:') names no taxon;
                # skip it instead of failing on the taxa_str[-1] lookup below.
                if taxa_str and not is_float(taxa_str):
                    # appears to be an internal label and not simply a support value
                    if taxa_str[-1] == ';':
                        taxa_str = taxa_str[:-1]

                    # check for concatenated ranks of the form: p__Crenarchaeota__c__Thermoprotei
                    for prefix in Taxonomy.rank_prefixes:
                        split_str = '__' + prefix
                        if split_str in taxa_str:
                            taxa_str = taxa_str.replace(split_str, ';' + prefix)

                    # ancestors are visited leaf-to-root, so prepend to keep
                    # the list ordered from the root downwards
                    taxa = [x.strip() for x in taxa_str.split(';')] + taxa

            node = node.parent_node

        # over-long taxonomies are reported but processing continues
        if warnings and len(taxa) > 7:
            self.logger.warning(
                'Invalid taxonomy string read from tree for taxon %s: %s' % (leaf.taxon.label, taxa))

        # check if genus name should be appended to species label
        if len(taxa) == 7:
            genus = taxa[5][3:]
            species = taxa[6][3:]
            if genus not in species:
                taxa[6] = 's__' + genus + ' ' + species

        taxa = self.fill_trailing_ranks(taxa)
        taxonomy[leaf.taxon.label] = taxa

    return taxonomy
def read_from_tree(self, tree):
    """Obtain the taxonomy for each extant taxa as specified by internal tree labels.

    Every leaf is traced back to the root; the taxa named by each labelled
    ancestor are prepended to the running list, so the result is ordered
    from the root-most label to the leaf-most label.

    Parameters
    ----------
    tree : str or dendropy.Tree
        Filename of newick tree or dendropy tree object.

    Returns
    -------
    dict : d[unique_id] -> [d__<taxon>, ..., s__<taxon>]
        Taxa indexed by unique ids.

    Raises
    ------
    SystemExit
        If more than 7 taxa are collected for a leaf (an error is logged
        first and the process exits with status -1).
    """

    # `basestring` only exists on Python 2; referencing it directly raises
    # NameError on Python 3, so fall back to `str` there.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    if isinstance(tree, string_types):
        tree = dendropy.Tree.get_from_path(tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

    taxonomy = {}
    for leaf in tree.leaf_node_iter():
        taxa = []

        node = leaf.parent_node
        while node:
            if node.label:
                taxa_str = node.label

                # labels of the form <support>:<taxa> carry the taxa after the colon
                if ':' in taxa_str:
                    taxa_str = taxa_str.split(':')[1]

                if not is_float(taxa_str):
                    # appears to be an internal label and not simply a support value;
                    # prepend so the list stays ordered from the root downwards
                    taxa = [x.strip() for x in taxa_str.split(';')] + taxa

            node = node.parent_node

        if len(taxa) > 7:
            self.logger.error(
                'Invalid taxonomy string read from tree for taxon %s: %s' % (leaf.taxon.label, taxa))
            sys.exit(-1)

        # check if genus name should be appended to species label
        if len(taxa) == 7:
            genus = taxa[5][3:]
            species = taxa[6][3:]
            if genus not in species:
                taxa[6] = 's__' + genus + ' ' + species

        taxa = self.fill_missing_ranks(taxa)
        taxonomy[leaf.taxon.label] = taxa

    return taxonomy
def parse_label(label):
    """Parse a Newick label which may contain a support value, taxon, and/or auxiliary information.

    Recognised layouts are '<support>', '<taxon>' and '<support>:<taxon>',
    each optionally followed by '|<auxiliary information>'.

    Parameters
    ----------
    label : str
        Internal label in a Newick tree.

    Returns
    -------
    float
        Support value specified by label, or None
    str
        Taxon specified by label, or None
    str
        Auxiliary information, or None

    Raises
    ------
    ValueError
        If the label contains ':' and the text before it is not a valid float.
    """

    support = None
    taxon = None
    auxiliary_info = None

    if label:
        label = label.strip()

        if '|' in label:
            # split on the first '|' only so auxiliary information may
            # itself contain '|' without breaking the unpacking
            label, auxiliary_info = label.split('|', 1)

        if ':' in label:
            # split on the first ':' only; anything after it is the taxon
            support, taxon = label.split(':', 1)
            support = float(support)
        elif is_float(label):
            support = float(label)
        elif label != '':
            taxon = label

    return support, taxon, auxiliary_info
def parse_label(label):
    """Parse a Newick label which may contain a support value, taxon, and/or auxiliary information.

    Recognised layouts are "<support>", "<taxon>" and "<support>:<taxon>",
    each optionally followed by "|<auxiliary information>".

    Parameters
    ----------
    label : str
        Internal label in a Newick tree.

    Returns
    -------
    float
        Support value specified by label, or None
    str
        Taxon specified by label, or None
    str
        Auxiliary information, or None

    Raises
    ------
    ValueError
        If the label contains ":" and the text before it is not a valid float.
    """

    support = None
    taxon = None
    auxiliary_info = None

    if label:
        label = label.strip()

        # partition() splits on the first "|" only, so auxiliary information
        # containing further "|" characters no longer breaks the parse
        label, aux_sep, aux_text = label.partition("|")
        if aux_sep:
            auxiliary_info = aux_text

        # likewise only the first ":" separates the support from the taxon
        support_text, taxon_sep, taxon_text = label.partition(":")
        if taxon_sep:
            support = float(support_text)
            taxon = taxon_text
        elif is_float(label):
            support = float(label)
        elif label != "":
            taxon = label

    return support, taxon, auxiliary_info
def _clade_pd(self, tree, ingroup_count, outgroup_count):
    """Calculate PD for named clades.

    A clade is any node whose label names a taxon, either as '<taxon>' or
    as '<support>:<taxon>'. For each clade the edge lengths of all of its
    descendant nodes (the clade's own edge is excluded) are summed.

    Parameters
    ----------
    tree : tree object
        Tree providing preorder_node_iter(); nodes must provide label,
        postorder_iter(), leaf_iter(), is_leaf(), edge.length and, for
        leaves, taxon.label (presumably a dendropy tree; confirm with caller).
    ingroup_count : dict
        d[genome_id] -> count for ingroup genomes.
    outgroup_count : dict
        d[genome_id] -> count for outgroup genomes.

    Returns
    -------
    dict : d[taxon] -> [taxon_pd, in_taxon_pd, in_taxon_count, in_taxon_derep,
                        out_taxon_pd, out_taxon_count, out_taxon_derep]
        Total, ingroup and outgroup branch length sums for each named clade,
        with the summed counts and the number of leaves found in each group.
    """

    pd = {}
    for node in tree.preorder_node_iter():
        if not node.label:
            continue

        taxon = None
        if ':' in node.label:
            # split on the first ':' only so the unpacking cannot fail
            _support, taxon = node.label.split(':', 1)
        elif not is_float(node.label):
            taxon = node.label

        if not taxon:
            continue

        taxon_pd = 0
        taxon_count = 0  # never updated; kept only for the diagnostic line below
        in_taxon_pd = 0
        in_taxon_count = 0
        in_taxon_derep = 0
        out_taxon_pd = 0
        out_taxon_count = 0
        out_taxon_derep = 0

        for nn in node.postorder_iter():
            if nn == node:
                continue

            # check if group contains taxa from
            # the ingroup and/or outgroup
            leaf_ids = [leaf.taxon.label for leaf in nn.leaf_iter()]

            if any(genome_id in ingroup_count for genome_id in leaf_ids):
                in_taxon_pd += nn.edge.length

            if any(genome_id in outgroup_count for genome_id in leaf_ids):
                out_taxon_pd += nn.edge.length

            if nn.is_leaf():
                genome_id = nn.taxon.label

                in_taxon_count += ingroup_count.get(genome_id, 0)
                if genome_id in ingroup_count:
                    in_taxon_derep += 1

                out_taxon_count += outgroup_count.get(genome_id, 0)
                if genome_id in outgroup_count:
                    out_taxon_derep += 1

            taxon_pd += nn.edge.length

        if taxon == 'd__Archaea':
            # Diagnostic output kept from the original. Joining the values
            # into one string prints the same space-separated line on both
            # Python 2 and Python 3 (the bare print statement is a
            # SyntaxError on Python 3).
            print(' '.join(str(value) for value in (taxon_count,
                                                    in_taxon_count,
                                                    out_taxon_count,
                                                    in_taxon_pd,
                                                    out_taxon_pd)))

        pd[taxon] = [taxon_pd,
                     in_taxon_pd, in_taxon_count, in_taxon_derep,
                     out_taxon_pd, out_taxon_count, out_taxon_derep]

    return pd
def read_viral_taxonomy_from_tree(tree):
    """Obtain the taxonomy for each extant taxa as specified by internal tree labels.

    Every leaf is traced back to the root; the taxa named by each labelled
    ancestor are prepended to the running list. Each taxon must start with a
    prefix found in VIRAL_PREFIX_TRANSLATION. Ranks below the last one
    present are appended as bare prefixes taken from VIRAL_RANK_PREFIXES.

    Parameters
    ----------
    tree : str or dendropy.Tree
        Filename of newick tree or dendropy tree object.

    Returns
    -------
    dict : d[unique_id] -> [d__<taxon>, ..., s__<taxon>]
        Taxa indexed by unique ids.

    Raises
    ------
    SystemExit
        With status 1 if a leaf collects more than 7 taxa, a taxon has an
        unrecognized prefix, or the last taxon of a leaf has no known rank
        prefix (including the case where no taxa were found at all).
    """

    if isinstance(tree, str):
        tree = dendropy.Tree.get_from_path(tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

    logger = logging.getLogger()

    taxonomy = {}
    for leaf in tree.leaf_node_iter():
        taxa = []

        node = leaf.parent_node
        while node:
            if node.label:
                taxa_str = node.label

                # labels of the form <support>:<taxa> carry the taxa after the colon
                if ':' in taxa_str:
                    taxa_str = taxa_str.split(':')[1]

                # An empty remainder (e.g. the label '100:') names no taxon;
                # skip it instead of failing on the taxa_str[-1] lookup below.
                if taxa_str and not is_float(taxa_str):
                    # appears to be an internal label and not simply a support value
                    if taxa_str[-1] == ';':
                        taxa_str = taxa_str[:-1]

                    # prepend so the list stays ordered from the root downwards
                    taxa = [x.strip() for x in taxa_str.split(';')] + taxa

            node = node.parent_node

        if len(taxa) > 7:
            logger.error(
                'Invalid taxonomy string read from tree for taxon %s: %s' % (leaf.taxon.label, ';'.join(taxa)))
            sys.exit(1)

        for taxon in taxa:
            prefix = taxon[0:3]
            if prefix not in VIRAL_PREFIX_TRANSLATION:
                print('Unrecognized viral prefix for {}: {}'.format(
                    taxon, prefix))
                sys.exit(1)

        # fill missing ranks
        try:
            last_rank = VIRAL_RANK_PREFIXES.index(taxa[-1][0:3])
        except (ValueError, IndexError):
            # IndexError: no taxa were collected for this leaf.
            # ValueError: prefix is absent from VIRAL_RANK_PREFIXES
            # (assumes it is a list or tuple -- confirm at its definition).
            logger.error('Taxon {} is missing rank prefix: {}'.format(
                leaf.taxon.label, ';'.join(taxa)))
            sys.exit(1)

        for i in range(last_rank + 1, len(VIRAL_RANK_PREFIXES)):
            taxa.append(VIRAL_RANK_PREFIXES[i])

        taxonomy[leaf.taxon.label] = taxa

    return taxonomy
def _clade_pd(self, tree, ingroup_count, outgroup_count):
    """Calculate PD for named clades.

    A clade is any node whose label names a taxon, either as '<taxon>' or
    as '<support>:<taxon>'. For each clade the edge lengths of all of its
    descendant nodes (the clade's own edge is excluded) are summed.

    Parameters
    ----------
    tree : tree object
        Tree providing preorder_node_iter(); nodes must provide label,
        postorder_iter(), leaf_iter(), is_leaf(), edge.length and, for
        leaves, taxon.label (presumably a dendropy tree; confirm with caller).
    ingroup_count : dict
        d[genome_id] -> count for ingroup genomes.
    outgroup_count : dict
        d[genome_id] -> count for outgroup genomes.

    Returns
    -------
    dict : d[taxon] -> [taxon_pd, in_taxon_pd, in_taxon_count, in_taxon_derep,
                        out_taxon_pd, out_taxon_count, out_taxon_derep]
        Total, ingroup and outgroup branch length sums for each named clade,
        with the summed counts and the number of leaves found in each group.
    """

    pd = {}
    for node in tree.preorder_node_iter():
        if not node.label:
            continue

        taxon = None
        if ':' in node.label:
            # only the first ':' separates support from taxon, so a label
            # holding further colons no longer breaks the unpacking
            _support, taxon = node.label.split(':', 1)
        elif not is_float(node.label):
            taxon = node.label

        if taxon:
            taxon_pd = 0
            taxon_count = 0  # never updated; kept only for the diagnostic line below
            in_taxon_pd = 0
            in_taxon_count = 0
            in_taxon_derep = 0
            out_taxon_pd = 0
            out_taxon_count = 0
            out_taxon_derep = 0

            for nn in node.postorder_iter():
                if nn == node:
                    continue

                # check if group contains taxa from
                # the ingroup and/or outgroup
                genome_ids = [leaf.taxon.label for leaf in nn.leaf_iter()]
                ingroup_leaves = any(gid in ingroup_count for gid in genome_ids)
                outgroup_leaves = any(gid in outgroup_count for gid in genome_ids)

                if ingroup_leaves:
                    in_taxon_pd += nn.edge.length

                if outgroup_leaves:
                    out_taxon_pd += nn.edge.length

                if nn.is_leaf():
                    genome_id = nn.taxon.label

                    in_taxon_count += ingroup_count.get(genome_id, 0)
                    if genome_id in ingroup_count:
                        in_taxon_derep += 1

                    out_taxon_count += outgroup_count.get(genome_id, 0)
                    if genome_id in outgroup_count:
                        out_taxon_derep += 1

                taxon_pd += nn.edge.length

            if taxon == 'd__Archaea':
                # Diagnostic output kept from the original. Formatting the
                # values into one string prints the same space-separated
                # line on both Python 2 and Python 3 (the bare print
                # statement is a SyntaxError on Python 3).
                diagnostics = (taxon_count, in_taxon_count, out_taxon_count,
                               in_taxon_pd, out_taxon_pd)
                print(' '.join(str(value) for value in diagnostics))

            pd[taxon] = [
                taxon_pd, in_taxon_pd, in_taxon_count, in_taxon_derep,
                out_taxon_pd, out_taxon_count, out_taxon_derep
            ]

    return pd