Example #1
0
def generate_skbio_tree(classification, existing_tree=None):
    """Build, or extend, a scikit-bio taxonomy tree from a classification.

    Every row of ``classification.results()['table']`` becomes a ``TreeNode``
    named after the row's ``'tax_id'`` and is attached beneath the node named
    by the row's ``'parent_tax_id'``. Rows whose parent is not in the tree yet
    are parked until the parent shows up.

    Parameters
    ----------
    classification : object
        Must provide ``results()`` returning a mapping whose ``'table'`` entry
        is an iterable of dict-like rows with ``'tax_id'`` and
        ``'parent_tax_id'`` keys and optional ``'name'`` / ``'rank'`` keys.
    existing_tree : skbio.tree.TreeNode, optional
        Tree to extend in place. When omitted a new root named ``'1'`` is
        created.

    Returns
    -------
    skbio.tree.TreeNode
        The root of the (possibly pre-existing) tree.

    Raises
    ------
    AssertionError
        If some rows could never be linked to a parent in the tree.
    """
    from skbio.tree import MissingNodeError, TreeNode

    otus = classification.results()['table']
    if existing_tree is None:
        tree = TreeNode(name='1', length=1)
        tree.tax_name = 'Root'
        tree.rank = 'no rank'
    else:
        tree = existing_tree

    # nodes whose parent has not been added yet, keyed by that parent's id
    unlinked = defaultdict(list)

    for otu in otus:
        tax_id = otu['tax_id']

        # skip nodes already in the tree
        try:
            tree.find(tax_id)
        except MissingNodeError:
            pass
        else:
            continue

        # try to find a parent (if it exists)
        parent_id = otu['parent_tax_id']
        try:
            parent = tree.find(parent_id)
            # the children are merged out here (only if we have a parent) to
            # make sure we're not creating trees inside unlinked itself
            # NOTE(review): presumably _merge_unlinked removes the collected
            # children from `unlinked` -- confirm against its definition
            children = _merge_unlinked(tax_id, unlinked)
        except MissingNodeError:
            parent = None
            children = None

        # create the node
        node = TreeNode(name=tax_id, length=1, children=children)
        node.tax_name = otu.get('name', '')
        node.rank = otu.get('rank', 'no rank')

        # either add the node to its parent or keep track of it until its
        # parent is "in tree" too
        if parent is not None:
            parent.append(node)
        else:
            unlinked[parent_id].append(node)

    # Raised explicitly rather than via `assert` so the check still runs when
    # Python is started with -O; exception type and message are unchanged.
    if unlinked:
        raise AssertionError(
            'some unlinked nodes were not included in the tree')

    return tree
Example #2
0
def unifrac(classifications, weighted=True,
            field='readcount_w_children', rank='species', strict=False):
    """
    Compute the UniFrac beta diversity between a set of classifications.

    UniFrac takes the relative relatedness of community members into account.
    The weighted variant looks at abundances, the unweighted one at presence.

    Parameters
    ----------
    classifications : sequence
        Classifications to compare; each is merged into one shared tree.
    weighted : bool
        Select 'weighted_unifrac' (True) or 'unweighted_unifrac' (False).
    field : str
        Count field forwarded to ``beta_counts``; must be in
        ``ACCEPTABLE_FIELDS``.
    rank : str
        Rank forwarded to ``beta_counts`` and ``prune_to_rank``.
    strict : bool
        When True, every classification must share the job id of the first.

    Returns
    -------
    The result of ``skbio.diversity.beta_diversity`` for the chosen metric.

    Raises
    ------
    OneCodexException
        If ``strict`` is set and the job ids differ.
    """
    assert field in ACCEPTABLE_FIELDS
    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    # fold every classification into a single taxonomy tree
    merged_tree = None
    for classification in classifications:
        if strict:
            if classification.job.id != classifications[0].job.id:
                raise OneCodexException('All Classifications must have the same Job for Unifrac')
        merged_tree = generate_skbio_tree(classification, existing_tree=merged_tree)

    # skbio appears to expect the root to have only one child (possibly a
    # bug), so the real root is hung beneath a placeholder parent
    rooted = TreeNode(name='fake root')
    rooted.rank = 'no rank'
    rooted.append(merged_tree)

    # trim everything below the requested rank so the tips are what we compare
    prune_to_rank(rooted, rank=rank)

    metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric, counts, ids,
                                          tree=rooted, otu_ids=tax_ids)
Example #3
0
    def unifrac(self, weighted=True, rank="auto"):
        """Compute the UniFrac beta diversity distance between samples.

        UniFrac accounts for how closely related community members are. The
        weighted form uses abundances; the unweighted form uses presence only.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If the data looks normalized; read counts are required.
        """
        import skbio.diversity
        from skbio.tree import TreeNode

        # relative abundances are rejected: the metric needs raw read counts
        if self._guess_normalized():
            raise OneCodexException("UniFrac requires unnormalized read counts.")

        df = self.to_df(rank=rank, normalize=False)

        # one list of counts per row of the frame, in index order
        counts = [df.loc[c_id].tolist() for c_id in df.index]
        tax_ids = df.keys().tolist()

        pruned = self.tree_prune_rank(self.tree_build(), rank=df.ocx_rank)

        # skbio appears to expect the root to have only one child (possibly a
        # bug), so the taxonomy root is attached beneath a placeholder node
        new_tree = TreeNode(name="fake root")
        new_tree.rank = "no rank"
        new_tree.append(pruned)

        # pick the metric, then run the calculation once
        metric = "weighted_unifrac" if weighted else "unweighted_unifrac"
        return skbio.diversity.beta_diversity(
            metric, counts, df.index.tolist(), tree=new_tree, otu_ids=tax_ids
        )
Example #4
0
    def unifrac(self, weighted=True, rank="auto"):
        """Calculate UniFrac, a beta diversity metric that takes into account the relative
        relatedness of community members.

        Weighted UniFrac looks at abundances, unweighted UniFrac looks at presence.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If the data looks normalized; read counts are required.
        """
        # read counts are required, relative abundances are refused
        if self._guess_normalized():
            raise OneCodexException("UniFrac requires unnormalized read counts.")

        df = self.to_df(rank=rank, normalize=False)

        # rows of the frame become plain lists, in index order
        counts = [df.loc[sample_id].tolist() for sample_id in df.index]
        tax_ids = df.keys().tolist()

        full_tree = self.tree_build()
        pruned_tree = self.tree_prune_rank(full_tree, rank=df.ocx_rank)

        # skbio appears to expect the root to have only one child (possibly a
        # bug), so a placeholder parent is put above the real root
        from skbio.tree import TreeNode

        new_tree = TreeNode(name="fake root")
        new_tree.rank = "no rank"
        new_tree.append(pruned_tree)

        # choose the metric name, then make a single call
        if weighted:
            metric = "weighted_unifrac"
        else:
            metric = "unweighted_unifrac"

        return skbio.diversity.beta_diversity(
            metric, counts, df.index.tolist(), tree=new_tree, otu_ids=tax_ids
        )
Example #5
0
    def tree_build(self):
        """Construct a taxonomy tree from ``self.taxonomy``.

        This is designed for use with `ClassificationsDataFrame` or `SampleCollection`.

        Returns
        -------
        `skbio.tree.TreeNode`, the node for tax ID "1", i.e. the root of a tree that contains all
        the taxa in the current analysis and their parents leading back to the root node.

        Raises
        ------
        KeyError
            If the taxonomy has no entry for tax ID "1".
        """
        from skbio.tree import TreeNode

        # node attribute -> taxonomy column it is copied from
        attr_columns = (
            ("tax_name", "name"),
            ("rank", "rank"),
            ("parent_tax_id", "parent_tax_id"),
        )

        # first pass: one detached node per taxon
        nodes = {}
        for tax_id in self.taxonomy.index:
            node = TreeNode(name=tax_id, length=1)
            for attr, column in attr_columns:
                setattr(node, attr, self.taxonomy[column][tax_id])
            nodes[tax_id] = node

        # second pass: hang each node under its parent when the parent is known
        for tax_id in self.taxonomy.index:
            child = nodes[tax_id]
            parent = nodes.get(child.parent_tax_id)

            if parent is not None:
                parent.append(child)
            elif tax_id != "1":
                # the root ("1") is allowed to have no parent; anything else is reported
                warnings.warn(
                    "tax_id={} has parent_tax_id={} which is not in tree".format(
                        tax_id, child.parent_tax_id
                    )
                )

        return nodes["1"]
Example #6
0
    def tree_build(self):
        """Assemble a tree from the taxonomy data held by this `ClassificationsDataFrame` or
        `SampleCollection`.

        Returns
        -------
        `skbio.tree.TreeNode`, the node for tax ID "1", i.e. the root of a tree that contains all
        the taxa in the current analysis and their parents leading back to the root node.

        Raises
        ------
        KeyError
            If the taxonomy has no entry for tax ID "1".
        """
        from skbio.tree import TreeNode

        taxonomy = self.taxonomy

        # create every node up front, unattached
        nodes = {}
        for tax_id in taxonomy.index:
            current = TreeNode(name=tax_id, length=1)
            current.tax_name = taxonomy["name"][tax_id]
            current.rank = taxonomy["rank"][tax_id]
            current.parent_tax_id = taxonomy["parent_tax_id"][tax_id]
            nodes[tax_id] = current

        # then connect each node to its parent
        for tax_id in taxonomy.index:
            child = nodes[tax_id]
            try:
                parent = nodes[child.parent_tax_id]
            except KeyError:
                # a missing parent is expected only for the root ("1")
                if tax_id != "1":
                    message = "tax_id={} has parent_tax_id={} which is not in tree"
                    warnings.warn(message.format(tax_id, child.parent_tax_id))
            else:
                parent.append(child)

        return nodes["1"]
Example #7
0
    def unifrac(self, weighted=True, rank=Rank.Auto):
        """Compute the UniFrac beta diversity distance between samples.

        UniFrac accounts for how closely related community members are. The
        weighted form uses abundances; the unweighted form uses presence only.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.
        """
        import skbio.diversity
        from skbio.tree import TreeNode

        df = self.to_df(rank=rank, normalize=self._guess_normalized())

        # read the resolved rank off the frame before it is rescaled below
        ocx_rank = df.ocx_rank

        # The scikit-bio implementations of phylogenetic metrics require integer counts
        # NOTE(review): 10e9 equals 1e10 and the scaled frame is not cast to an
        # integer dtype here -- confirm this is what scikit-bio needs
        if self._guess_normalized():
            df = df * 10e9

        tax_ids = df.keys().tolist()

        pruned = self.tree_prune_rank(self.tree_build(), rank=ocx_rank)

        # `scikit-bio` treats a tree whose root has more than 2 children as
        # "unrooted":
        #
        # https://github.com/biocore/scikit-bio/blob/f3ae1dcfe8ea88e52e19f6693d79e529d05bda04/skbio/diversity/_util.py#L89
        #
        # Our taxonomy root regularly has more than 2 children, so the real
        # `root` is attached beneath a fake parent node.
        new_tree = TreeNode(name="fake root")
        new_tree.rank = "no rank"
        new_tree.append(pruned)

        # select the metric and any metric-specific keyword arguments
        if weighted:
            metric = BetaDiversityMetric.WeightedUnifrac
            extra_kwargs = {"normalized": True}
        else:
            metric = BetaDiversityMetric.UnweightedUnifrac
            extra_kwargs = {}

        # then finally run the calculation and return
        return skbio.diversity.beta_diversity(
            metric,
            df,
            df.index,
            tree=new_tree,
            otu_ids=tax_ids,
            **extra_kwargs
        )