Exemple #1
0
def parse_label(label):
    """Parse a Newick label which may contain a support value, taxon, and/or auxiliary information.

    The expected layout is ``[support:]taxon[|auxiliary_info]`` or a bare
    support value. Only the first '|' and the first ':' act as separators,
    so any further occurrences are kept as part of the auxiliary
    information or taxon, respectively.

    Parameters
    ----------
    label : str
        Internal label in a Newick tree.

    Returns
    -------
    float
        Support value specified by label, or None
    str
        Taxon specified by label, or None
    str
        Auxiliary information, or None

    Raises
    ------
    ValueError
        If the label contains a ':' and the text before it cannot be
        converted to a float.
    """

    support = None
    taxon = None
    auxiliary_info = None

    if label:
        label = label.strip()
        if '|' in label:
            # split on the first '|' only; a plain split() would fail to
            # unpack when the auxiliary information itself contains '|'
            label, auxiliary_info = label.split('|', 1)

        if ':' in label:
            # split on the first ':' only for the same reason
            support, taxon = label.split(':', 1)
            support = float(support)
        elif is_float(label):
            # label consists solely of a support value
            support = float(label)
        elif label != '':
            taxon = label

    return support, taxon, auxiliary_info
Exemple #2
0
    def read_from_tree(self, tree, warnings=True):
        """Obtain the taxonomy for each extant taxon as specified by internal tree labels.

        For every leaf, the ancestors are visited from the leaf's parent up
        to the root. Each ancestor label that is not purely a support value
        contributes one or more taxa, which are prepended so the resulting
        list runs from the root-most label to the leaf-most label.

        Parameters
        ----------
        tree : str or dendropy.Tree
            Filename of newick tree or dendropy tree object.
        warnings : bool
            If True, log a warning for any leaf for which more than 7 taxa
            were collected.

        Returns
        -------
        dict : d[unique_id] -> [d__<taxon>, ..., s__<taxon>]
            Taxa indexed by unique ids.
        """

        if isinstance(tree, str):
            tree = dendropy.Tree.get_from_path(tree,
                                               schema='newick',
                                               rooting="force-rooted",
                                               preserve_underscores=True)

        taxonomy = {}
        for leaf in tree.leaf_node_iter():
            taxa = []

            node = leaf.parent_node
            while node:
                if node.label:
                    taxa_str = node.label
                    if ':' in taxa_str:
                        # labels of the form <support>:<taxa>; keep the taxa
                        taxa_str = taxa_str.split(':')[1]

                    # An empty string (e.g. from the label '0.95:') holds no
                    # taxon; skipping it also avoids indexing into an empty
                    # string when checking for a trailing ';'.
                    if taxa_str and not is_float(taxa_str):
                        if taxa_str.endswith(';'):
                            taxa_str = taxa_str[:-1]

                        # check for concatenated ranks of the form:
                        # p__Crenarchaeota__c__Thermoprotei
                        for prefix in Taxonomy.rank_prefixes:
                            split_str = '__' + prefix
                            if split_str in taxa_str:
                                taxa_str = taxa_str.replace(
                                    split_str, ';' + prefix)

                        # appears to be an internal label and not simply a
                        # support value; ancestors are visited leaf-to-root,
                        # so prepend to keep the root-most taxa first
                        taxa = [x.strip() for x in taxa_str.split(';')] + taxa
                node = node.parent_node

            if warnings and len(taxa) > 7:
                self.logger.warning(
                    'Invalid taxonomy string read from tree for taxon %s: %s' %
                    (leaf.taxon.label, taxa))

            # check if genus name should be appended to species label
            if len(taxa) == 7:
                # drop the 3-character rank prefix (e.g. 'g__', 's__')
                genus = taxa[5][3:]
                species = taxa[6][3:]
                if genus not in species:
                    taxa[6] = 's__' + genus + ' ' + species

            # NOTE(review): presumably pads missing lower ranks; confirm
            # against the definition of fill_trailing_ranks
            taxa = self.fill_trailing_ranks(taxa)
            taxonomy[leaf.taxon.label] = taxa

        return taxonomy