def generate_skbio_tree(classification, existing_tree=None):
    """Build (or extend) a scikit-bio tree from one classification's results.

    The rows of ``classification.results()['table']`` are read in order. Each
    row must provide ``'tax_id'`` and ``'parent_tax_id'``; ``'name'`` and
    ``'rank'`` are optional and default to ``''`` and ``'no rank'``.

    Parameters
    ----------
    classification
        Object whose ``results()`` returns a mapping with a ``'table'`` entry.
    existing_tree : skbio.tree.TreeNode, optional
        Tree to add nodes to. It is modified in place. When omitted, a new
        root node named ``'1'`` (tax_name ``'Root'``) is created.

    Returns
    -------
    skbio.tree.TreeNode, the root of the (new or extended) tree.

    Raises
    ------
    AssertionError
        If, after all rows are processed, some nodes are still waiting for a
        parent that never appeared in the tree.
    """
    from skbio.tree import MissingNodeError, TreeNode

    rows = classification.results()['table']

    if existing_tree is not None:
        root = existing_tree
    else:
        root = TreeNode(name='1', length=1)
        root.tax_name = 'Root'
        root.rank = 'no rank'

    # nodes whose parent is not in the tree yet, grouped by that parent's id
    orphans = defaultdict(list)

    for row in rows:
        node_id = row['tax_id']

        # nothing to do for taxa the tree already contains
        try:
            root.find(node_id)
        except MissingNodeError:
            pass
        else:
            continue

        # Look for the parent. Waiting children are only collected when the
        # parent is already in the tree, so that no subtrees get assembled
        # inside `orphans` itself.
        parent_id = row['parent_tax_id']
        parent_node = None
        subtrees = None
        try:
            found = root.find(parent_id)
            merged = _merge_unlinked(node_id, orphans)
        except MissingNodeError:
            pass
        else:
            parent_node, subtrees = found, merged

        fresh = TreeNode(name=node_id, length=1, children=subtrees)
        fresh.tax_name = row.get('name', '')
        fresh.rank = row.get('rank', 'no rank')

        # attach now if possible, otherwise park it until its parent shows up
        if parent_node is None:
            orphans[parent_id].append(fresh)
        else:
            parent_node.append(fresh)

    assert not orphans, 'some unlinked nodes were not included in the tree'
    return root
def unifrac(classifications, weighted=True, field='readcount_w_children', rank='species', strict=False):
    """Compute UniFrac beta diversity across a list of classifications.

    UniFrac takes the relatedness of community members into account. The
    weighted variant looks at abundances, the unweighted variant at presence.

    Parameters
    ----------
    classifications
        Indexable sequence of classifications; each is passed to
        `generate_skbio_tree`, and with `strict` each one's ``job.id`` is read.
    weighted : bool
        Select ``'weighted_unifrac'`` (truthy) or ``'unweighted_unifrac'``.
    field : str
        Must be a member of ``ACCEPTABLE_FIELDS``; forwarded to `beta_counts`.
    rank : str
        Forwarded to `beta_counts` and used when pruning the tree.
    strict : bool
        When true, every classification must share the first one's ``job.id``.

    Returns
    -------
    Whatever ``skbio.diversity.beta_diversity`` returns for the chosen metric.

    Raises
    ------
    AssertionError
        If `field` is not in ``ACCEPTABLE_FIELDS``.
    OneCodexException
        If `strict` is set and a classification has a different job id.
    """
    assert field in ACCEPTABLE_FIELDS

    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    # grow a single tree by feeding every classification into it in turn
    combined = None
    for classification in classifications:
        if strict and classification.job.id != classifications[0].job.id:
            raise OneCodexException('All Classifications must have the same Job for Unifrac')
        combined = generate_skbio_tree(classification, existing_tree=combined)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so the real root is hung below a placeholder node
    wrapper = TreeNode(name='fake root')
    wrapper.rank = 'no rank'
    wrapper.append(combined)

    # prune low-level nodes so the tips are the taxa being compared
    # (return value is unused; presumably prunes in place -- confirm)
    prune_to_rank(wrapper, rank=rank)

    metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric, counts, ids, tree=wrapper, otu_ids=tax_ids)
def unifrac(self, weighted=True, rank="auto"):
    """Compute the UniFrac beta diversity metric for this object's samples.

    UniFrac accounts for how closely related community members are. The
    weighted form considers abundances; the unweighted form considers only
    presence.

    Parameters
    ----------
    weighted : `bool`
        Calculate the weighted (True) or unweighted (False) distance metric.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.

    Raises
    ------
    OneCodexException
        If `self._guess_normalized()` is true; read counts are required.
    """
    import skbio.diversity

    # the calculation needs read counts, not relative abundances
    if self._guess_normalized():
        raise OneCodexException("UniFrac requires unnormalized read counts.")

    df = self.to_df(rank=rank, normalize=False)

    # one list of values per row of the table, in index order
    counts = [df.loc[c_id].tolist() for c_id in df.index]
    tax_ids = df.keys().tolist()

    # full taxonomy tree, pruned to the rank recorded on the table
    pruned = self.tree_prune_rank(self.tree_build(), rank=df.ocx_rank)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so the real root is hung below a placeholder node
    from skbio.tree import TreeNode

    wrapper = TreeNode(name="fake root")
    wrapper.rank = "no rank"
    wrapper.append(pruned)

    metric = "weighted_unifrac" if weighted else "unweighted_unifrac"
    return skbio.diversity.beta_diversity(
        metric, counts, df.index.tolist(), tree=wrapper, otu_ids=tax_ids
    )
def unifrac(self, weighted=True, rank="auto"):
    """Return a UniFrac distance matrix for the samples in this object.

    UniFrac is a beta diversity metric that takes the relative relatedness of
    community members into account. Weighted UniFrac looks at abundances,
    unweighted UniFrac looks at presence.

    Parameters
    ----------
    weighted : `bool`
        Calculate the weighted (True) or unweighted (False) distance metric.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.

    Raises
    ------
    OneCodexException
        If `self._guess_normalized()` is true; read counts are required.
    """
    # read counts are required; relative abundances are rejected up front
    if self._guess_normalized():
        raise OneCodexException("UniFrac requires unnormalized read counts.")

    table = self.to_df(rank=rank, normalize=False)

    # per-row value lists, in the same order as the table's index
    row_counts = [table.loc[row_id].tolist() for row_id in table.index]
    otu_names = table.keys().tolist()

    taxonomy_tree = self.tree_build()
    taxonomy_tree = self.tree_prune_rank(taxonomy_tree, rank=table.ocx_rank)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so a placeholder parent is added above the real root
    from skbio.tree import TreeNode

    placeholder = TreeNode(name="fake root")
    placeholder.rank = "no rank"
    placeholder.append(taxonomy_tree)

    # pick the metric once, then run the calculation
    if weighted:
        metric_name = "weighted_unifrac"
    else:
        metric_name = "unweighted_unifrac"

    return skbio.diversity.beta_diversity(
        metric_name,
        row_counts,
        table.index.tolist(),
        tree=placeholder,
        otu_ids=otu_names,
    )
def tree_build(self):
    """Construct a tree from the rows of `self.taxonomy`.

    This is designed for use with `ClassificationsDataFrame` or
    `SampleCollection`. One node is created per entry of
    `self.taxonomy.index`, carrying the `name`, `rank` and `parent_tax_id`
    values of that row, and every node is then attached to its parent.

    A warning is issued for any node other than ``"1"`` whose parent id is
    not among the taxonomy's ids; such a node is left unattached.

    Returns
    -------
    `skbio.tree.TreeNode`, the root node of a tree that contains all the taxa
    in the current analysis and their parents leading back to the root node.
    """
    from skbio.tree import TreeNode

    taxonomy = self.taxonomy
    names = taxonomy["name"]
    ranks = taxonomy["rank"]
    parent_ids = taxonomy["parent_tax_id"]

    # first pass: one unattached node per taxon
    nodes = {}
    for tax_id in taxonomy.index:
        current = TreeNode(name=tax_id, length=1)
        current.tax_name = names[tax_id]
        current.rank = ranks[tax_id]
        current.parent_tax_id = parent_ids[tax_id]
        nodes[tax_id] = current

    # second pass: hook every node onto its parent
    for tax_id in taxonomy.index:
        child = nodes[tax_id]
        parent = nodes.get(child.parent_tax_id)
        if parent is None:
            # the node with id "1" is the one returned as root, so it is
            # expected to have no parent here; anything else is reported
            if tax_id != "1":
                warnings.warn(
                    "tax_id={} has parent_tax_id={} which is not in tree".format(
                        tax_id, child.parent_tax_id
                    )
                )
            continue
        parent.append(child)

    return nodes["1"]
def tree_build(self):
    """Build a tree from the taxonomy data held by this
    `ClassificationsDataFrame` or `SampleCollection`.

    Every id in `self.taxonomy.index` becomes a node of length 1 annotated
    with `tax_name`, `rank` and `parent_tax_id`. Nodes are then appended to
    the node named by their `parent_tax_id`. If that parent is missing, the
    node stays unattached and, unless its id is ``"1"``, a warning is issued.

    Returns
    -------
    `skbio.tree.TreeNode`, the root node of a tree that contains all the taxa
    in the current analysis and their parents leading back to the root node.
    """
    from skbio.tree import TreeNode

    def _make_node(tax_id):
        # create a single annotated, not-yet-linked node for `tax_id`
        made = TreeNode(name=tax_id, length=1)
        made.tax_name = self.taxonomy["name"][tax_id]
        made.rank = self.taxonomy["rank"][tax_id]
        made.parent_tax_id = self.taxonomy["parent_tax_id"][tax_id]
        return made

    # build all the nodes, keyed by tax_id
    by_id = {}
    for tax_id in self.taxonomy.index:
        by_id[tax_id] = _make_node(tax_id)

    # generate all the links
    for tax_id in self.taxonomy.index:
        member = by_id[tax_id]
        try:
            owner = by_id[member.parent_tax_id]
        except KeyError:
            # only the node with id "1" is expected to lack a parent
            if tax_id != "1":
                message = "tax_id={} has parent_tax_id={} which is not in tree".format(
                    tax_id, member.parent_tax_id
                )
                warnings.warn(message)
        else:
            owner.append(member)

    return by_id["1"]
def unifrac(self, weighted=True, rank=Rank.Auto):
    """Compute the UniFrac beta diversity metric.

    UniFrac takes the relatedness of community members into account. Weighted
    UniFrac considers abundances, unweighted UniFrac considers presence.

    Parameters
    ----------
    weighted : `bool`
        Calculate the weighted (True) or unweighted (False) distance metric.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.
    """
    import skbio.diversity

    table = self.to_df(rank=rank, normalize=self._guess_normalized())

    # read the rank off the table before it is rescaled below
    # (presumably the multiplication may not carry `ocx_rank` over -- confirm)
    ocx_rank = table.ocx_rank

    # The scikit-bio implementations of phylogenetic metrics require integer
    # counts, so normalized data are scaled up by a large constant factor.
    if self._guess_normalized():
        table = table * 10e9

    tax_ids = table.keys().tolist()

    pruned = self.tree_prune_rank(self.tree_build(), rank=ocx_rank)

    # `scikit-bio` requires that the tree root has no more than 2
    # children, otherwise it considers it "unrooted".
    #
    # https://github.com/biocore/scikit-bio/blob/f3ae1dcfe8ea88e52e19f6693d79e529d05bda04/skbio/diversity/_util.py#L89
    #
    # Our taxonomy root regularly has more than 2 children, so a fake
    # parent of `root` is added to the tree here.
    from skbio.tree import TreeNode

    wrapper = TreeNode(name="fake root")
    wrapper.rank = "no rank"
    wrapper.append(pruned)

    # only the weighted metric is given the extra `normalized` argument
    if weighted:
        metric = BetaDiversityMetric.WeightedUnifrac
        extra = {"normalized": True}
    else:
        metric = BetaDiversityMetric.UnweightedUnifrac
        extra = {}

    return skbio.diversity.beta_diversity(
        metric,
        table,
        table.index,
        tree=wrapper,
        otu_ids=tax_ids,
        **extra
    )