예제 #1
0
    def read_from_tree(self, tree, warnings=True):
        """Obtain the taxonomy for each extant taxon as specified by internal tree labels.

        The lineage of a leaf is assembled by walking from the leaf towards
        the root and prepending the taxa named by each labelled ancestor, so
        taxa from ancestors closer to the root come first in the list.

        Parameters
        ----------
        tree : str or dendropy.Tree
            Filename of newick tree or dendropy tree object.
        warnings : bool
            If True, log a warning for every leaf whose assembled lineage
            contains more than 7 taxa.

        Returns
        -------
        dict : d[unique_id] -> [d__<taxon>, ..., s__<taxon>]
            Taxa indexed by unique ids.
        """

        if isinstance(tree, str):
            tree = dendropy.Tree.get_from_path(tree,
                                               schema='newick',
                                               rooting="force-rooted",
                                               preserve_underscores=True)

        taxonomy = {}
        for leaf in tree.leaf_node_iter():
            taxa = []

            node = leaf.parent_node
            while node:
                taxa_str = node.label
                if taxa_str:
                    if ':' in taxa_str:
                        # label of the form <support>:<taxa>; keep the taxa part
                        taxa_str = taxa_str.split(':')[1]

                    # an empty remainder (e.g. the label '100:') names no taxa;
                    # skipping it also avoids indexing into an empty string
                    if taxa_str and not is_float(taxa_str):
                        if taxa_str.endswith(';'):
                            taxa_str = taxa_str[:-1]

                        # check for concatenated ranks of the form: p__Crenarchaeota__c__Thermoprotei
                        for prefix in Taxonomy.rank_prefixes:
                            taxa_str = taxa_str.replace('__' + prefix, ';' + prefix)

                        # appears to be an internal label and not simply a support value
                        taxa = [x.strip() for x in taxa_str.split(';')] + taxa
                node = node.parent_node

            # over-long lineages are only reported; they are still recorded below
            if warnings and len(taxa) > 7:
                self.logger.warning('Invalid taxonomy string read from tree for taxon %s: %s' % (leaf.taxon.label, taxa))

            # check if genus name should be appended to species label
            if len(taxa) == 7:
                # drop the 3-character rank prefix before comparing the names
                genus = taxa[5][3:]
                species = taxa[6][3:]
                if genus not in species:
                    taxa[6] = 's__' + genus + ' ' + species

            taxa = self.fill_trailing_ranks(taxa)
            taxonomy[leaf.taxon.label] = taxa

        return taxonomy
예제 #2
0
    def read_from_tree(self, tree):
        """Obtain the taxonomy for each extant taxon as specified by internal tree labels.

        The lineage of a leaf is assembled by walking from the leaf towards
        the root and prepending the taxa named by each labelled ancestor, so
        taxa from ancestors closer to the root come first in the list.

        Parameters
        ----------
        tree : str or dendropy.Tree
            Filename of newick tree or dendropy tree object.

        Returns
        -------
        dict : d[unique_id] -> [d__<taxon>, ..., s__<taxon>]
            Taxa indexed by unique ids.

        Raises
        ------
        SystemExit
            Raised through sys.exit(-1), after logging an error, when the
            lineage assembled for a leaf contains more than 7 taxa.
        """

        # `basestring` exists only on Python 2; fall back to `str` so the
        # filename check does not raise NameError on Python 3
        try:
            string_types = basestring
        except NameError:
            string_types = str

        if isinstance(tree, string_types):
            tree = dendropy.Tree.get_from_path(tree, schema='newick', rooting="force-rooted", preserve_underscores=True)

        taxonomy = {}
        for leaf in tree.leaf_node_iter():
            taxa = []

            node = leaf.parent_node
            while node:
                if node.label:
                    taxa_str = node.label
                    if ':' in taxa_str:
                        # label of the form <support>:<taxa>; keep the taxa part
                        taxa_str = taxa_str.split(':')[1]

                    if not is_float(taxa_str):
                        # appears to be an internal label and not simply a support value
                        taxa = [x.strip() for x in taxa_str.split(';')] + taxa
                node = node.parent_node

            if len(taxa) > 7:
                self.logger.error('Invalid taxonomy string read from tree for taxon %s: %s' % (leaf.taxon.label, taxa))
                sys.exit(-1)

            # check if genus name should be appended to species label
            if len(taxa) == 7:
                # drop the 3-character rank prefix before comparing the names
                genus = taxa[5][3:]
                species = taxa[6][3:]
                if genus not in species:
                    taxa[6] = 's__' + genus + ' ' + species

            taxa = self.fill_missing_ranks(taxa)
            taxonomy[leaf.taxon.label] = taxa

        return taxonomy
예제 #3
0
def parse_label(label):
    """Parse a Newick label which may contain a support value, taxon, and/or auxiliary information.

    The recognised layout is ``[<support>:]<taxon>[|<auxiliary info>]`` or a
    bare support value. Only the first '|' and the first ':' act as
    delimiters, so any further occurrences stay part of the auxiliary
    information or taxon instead of causing an unpacking error.

    Parameters
    ----------
    label : str
        Internal label in a Newick tree.

    Returns
    -------
    float
        Support value specified by label, or None
    str
        Taxon specified by label, or None
    str
        Auxiliary information, or None

    Raises
    ------
    ValueError
        If the label contains ':' and the text before it is not a valid float.
    """

    support = None
    taxon = None
    auxiliary_info = None

    if label:
        label = label.strip()
        if '|' in label:
            # split once: auxiliary information may itself contain '|'
            label, auxiliary_info = label.split('|', 1)

        if ':' in label:
            # split once: everything after the first ':' is the taxon
            support, taxon = label.split(':', 1)
            support = float(support)
        elif is_float(label):
            support = float(label)
        elif label != '':
            taxon = label

    return support, taxon, auxiliary_info
예제 #4
0
def parse_label(label):
    """Parse a Newick label which may contain a support value, taxon, and/or auxiliary information.

    The recognised layout is ``[<support>:]<taxon>[|<auxiliary info>]`` or a
    bare support value. Only the first "|" and the first ":" act as
    delimiters, so any further occurrences stay part of the auxiliary
    information or taxon instead of causing an unpacking error.

    Parameters
    ----------
    label : str
        Internal label in a Newick tree.

    Returns
    -------
    float
        Support value specified by label, or None
    str
        Taxon specified by label, or None
    str
        Auxiliary information, or None

    Raises
    ------
    ValueError
        If the label contains ":" and the text before it is not a valid float.
    """

    support = None
    taxon = None
    auxiliary_info = None

    if label:
        # partition() splits on the first delimiter only, so repeated
        # delimiters cannot break the parse
        label, has_aux, trailing = label.strip().partition("|")
        if has_aux:
            auxiliary_info = trailing

        leading, has_support, remainder = label.partition(":")
        if has_support:
            support = float(leading)
            taxon = remainder
        elif is_float(label):
            support = float(label)
        elif label != "":
            taxon = label

    return support, taxon, auxiliary_info
    def _clade_pd(self, tree, ingroup_count, outgroup_count):
        """Calculate PD for named clades."""
        
        pd = {}
        for node in tree.preorder_node_iter():
            if not node.label:
                continue

            taxon = None
            if ':' in node.label:
                _support, taxon = node.label.split(':')
            else:
                if not is_float(node.label):
                    taxon = node.label

            if taxon:
                taxon_pd = 0
                taxon_count = 0
                in_taxon_pd = 0
                in_taxon_count = 0
                in_taxon_derep = 0
                out_taxon_pd = 0
                out_taxon_count = 0
                out_taxon_derep = 0
                for nn in node.postorder_iter():
                    if nn == node:
                        continue
                        
                    # check if group contains taxa from
                    # the ingroup and/or outgroup
                    ingroup_leaves = False
                    outgroup_leaves = False
                    for leaf in nn.leaf_iter():
                        genome_id = leaf.taxon.label
                        if genome_id in ingroup_count:
                            ingroup_leaves = True
                        
                        if genome_id in outgroup_count:
                            outgroup_leaves = True
                            
                    if ingroup_leaves:
                        in_taxon_pd += nn.edge.length

                    if outgroup_leaves:
                        out_taxon_pd += nn.edge.length
                        
                    if nn.is_leaf():
                        genome_id = nn.taxon.label
                        
                        in_taxon_count += ingroup_count.get(genome_id, 0)
                        if genome_id in ingroup_count:
                            in_taxon_derep += 1
                            
                        out_taxon_count += outgroup_count.get(genome_id, 0)
                        if genome_id in outgroup_count:
                            out_taxon_derep += 1
                            
                    taxon_pd += nn.edge.length

                if taxon == 'd__Archaea':
                    print taxon_count, in_taxon_count, out_taxon_count, in_taxon_pd, out_taxon_pd
                pd[taxon] = [taxon_pd, in_taxon_pd, in_taxon_count, in_taxon_derep, out_taxon_pd, out_taxon_count, out_taxon_derep]
                
        return pd
예제 #6
0
def read_viral_taxonomy_from_tree(tree):
    """Obtain the taxonomy for each extant taxon as specified by internal tree labels.

    The lineage of a leaf is assembled by walking from the leaf towards the
    root and prepending the taxa named by each labelled ancestor. Every taxon
    must start with a 3-character prefix found in VIRAL_PREFIX_TRANSLATION.
    The bare prefixes of all entries in VIRAL_RANK_PREFIXES that follow the
    prefix of the last taxon are appended to the lineage.

    Parameters
    ----------
    tree : str or dendropy.Tree
        Filename of newick tree or dendropy tree object.

    Returns
    -------
    dict : d[unique_id] -> [d__<taxon>, ..., s__<taxon>]
        Taxa indexed by unique ids.

    Raises
    ------
    SystemExit
        Raised through sys.exit(1) when a lineage has more than 7 taxa, when
        a taxon has an unrecognized prefix, or when the last taxon of a
        lineage (or the lineage itself) is missing.
    """

    if isinstance(tree, str):
        tree = dendropy.Tree.get_from_path(tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

    taxonomy = {}
    for leaf in tree.leaf_node_iter():
        taxa = []

        node = leaf.parent_node
        while node:
            taxa_str = node.label
            if taxa_str:
                if ':' in taxa_str:
                    # label of the form <support>:<taxa>; keep the taxa part
                    taxa_str = taxa_str.split(':')[1]

                # an empty remainder (e.g. the label '100:') names no taxa;
                # skipping it also avoids indexing into an empty string
                if taxa_str and not is_float(taxa_str):
                    if taxa_str.endswith(';'):
                        taxa_str = taxa_str[:-1]

                    # appears to be an internal label and not simply a support value
                    taxa = [x.strip() for x in taxa_str.split(';')] + taxa
            node = node.parent_node

        if len(taxa) > 7:
            logger = logging.getLogger()
            logger.error(
                'Invalid taxonomy string read from tree for taxon %s: %s' %
                (leaf.taxon.label, ';'.join(taxa)))
            sys.exit(1)

        for taxon in taxa:
            prefix = taxon[0:3]
            if prefix not in VIRAL_PREFIX_TRANSLATION:
                print('Unrecognized viral prefix for {}: {}'.format(
                    taxon, prefix))
                sys.exit(1)

        # fill missing ranks
        try:
            last_rank = VIRAL_RANK_PREFIXES.index(taxa[-1][0:3])
        except (IndexError, ValueError):
            # IndexError: no taxa were found for this leaf;
            # ValueError: the prefix is not one of VIRAL_RANK_PREFIXES
            logger = logging.getLogger()
            logger.error('Taxon {} is missing rank prefix: {}'.format(
                leaf.taxon.label, ';'.join(taxa)))
            sys.exit(1)

        for i in range(last_rank + 1, len(VIRAL_RANK_PREFIXES)):
            taxa.append(VIRAL_RANK_PREFIXES[i])

        taxonomy[leaf.taxon.label] = taxa

    return taxonomy
예제 #7
0
    def _clade_pd(self, tree, ingroup_count, outgroup_count):
        """Calculate PD for named clades."""

        pd = {}
        for node in tree.preorder_node_iter():
            if not node.label:
                continue

            taxon = None
            if ':' in node.label:
                _support, taxon = node.label.split(':')
            else:
                if not is_float(node.label):
                    taxon = node.label

            if taxon:
                taxon_pd = 0
                taxon_count = 0
                in_taxon_pd = 0
                in_taxon_count = 0
                in_taxon_derep = 0
                out_taxon_pd = 0
                out_taxon_count = 0
                out_taxon_derep = 0
                for nn in node.postorder_iter():
                    if nn == node:
                        continue

                    # check if group contains taxa from
                    # the ingroup and/or outgroup
                    ingroup_leaves = False
                    outgroup_leaves = False
                    for leaf in nn.leaf_iter():
                        genome_id = leaf.taxon.label
                        if genome_id in ingroup_count:
                            ingroup_leaves = True

                        if genome_id in outgroup_count:
                            outgroup_leaves = True

                    if ingroup_leaves:
                        in_taxon_pd += nn.edge.length

                    if outgroup_leaves:
                        out_taxon_pd += nn.edge.length

                    if nn.is_leaf():
                        genome_id = nn.taxon.label

                        in_taxon_count += ingroup_count.get(genome_id, 0)
                        if genome_id in ingroup_count:
                            in_taxon_derep += 1

                        out_taxon_count += outgroup_count.get(genome_id, 0)
                        if genome_id in outgroup_count:
                            out_taxon_derep += 1

                    taxon_pd += nn.edge.length

                if taxon == 'd__Archaea':
                    print taxon_count, in_taxon_count, out_taxon_count, in_taxon_pd, out_taxon_pd
                pd[taxon] = [
                    taxon_pd, in_taxon_pd, in_taxon_count, in_taxon_derep,
                    out_taxon_pd, out_taxon_count, out_taxon_derep
                ]

        return pd