def unifrac(classifications, weighted=True, field='readcount_w_children', rank='species', strict=False):
    """Compute the UniFrac beta diversity between a set of classifications.

    UniFrac takes the relatedness of community members into account.
    Weighted UniFrac looks at abundances, unweighted UniFrac looks at
    presence only.

    Parameters
    ----------
    classifications : sequence
        Classification objects to compare. Each one is handed to
        ``generate_skbio_tree``; with ``strict=True`` each must also expose
        ``.job.id``.
    weighted : bool
        Run ``'weighted_unifrac'`` when true, ``'unweighted_unifrac'``
        otherwise.
    field : str
        Count field forwarded to ``beta_counts``; must be a member of
        ``ACCEPTABLE_FIELDS``.
    rank : str
        Taxonomic rank forwarded to ``beta_counts`` and used to prune the
        tree so that its tips are the taxa being compared.
    strict : bool
        When true, raise if any classification's job id differs from the
        first classification's job id.

    Returns
    -------
    The result of ``skbio.diversity.beta_diversity`` for the chosen metric.

    Raises
    ------
    AssertionError
        If ``field`` is not in ``ACCEPTABLE_FIELDS``.
    OneCodexException
        If ``strict`` is true and the classifications come from different
        jobs.
    """
    assert field in ACCEPTABLE_FIELDS

    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    # Fold every classification into one shared tree, one at a time.
    merged_tree = None
    for classification in classifications:
        if strict and classification.job.id != classifications[0].job.id:
            raise OneCodexException('All Classifications must have the same Job for Unifrac')
        merged_tree = generate_skbio_tree(classification, existing_tree=merged_tree)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so the merged tree is hung beneath a synthetic root
    rooted_tree = TreeNode(name='fake root')
    rooted_tree.rank = 'no rank'
    rooted_tree.append(merged_tree)

    # prune low-level nodes off the tree so the tips are what we're comparing
    prune_to_rank(rooted_tree, rank=rank)

    metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric, counts, ids, tree=rooted_tree, otu_ids=tax_ids)
def unifrac(self, weighted=True, rank="auto"):
    """Calculate the UniFrac beta diversity metric.

    UniFrac takes into account the relatedness of community members.
    Weighted UniFrac considers abundances, unweighted UniFrac considers
    presence.

    Parameters
    ----------
    weighted : `bool`
        Calculate the weighted (True) or unweighted (False) distance metric.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.

    Raises
    ------
    OneCodexException
        If ``self._guess_normalized()`` reports that the data is normalized.
    """
    import skbio.diversity

    # UniFrac needs read counts, not relative abundances
    if self._guess_normalized():
        raise OneCodexException("UniFrac requires unnormalized read counts.")

    table = self.to_df(rank=rank, normalize=False)

    # one list of per-taxon counts for each row (sample) of the table
    counts = [table.loc[sample_id].tolist() for sample_id in table.index]
    tax_ids = table.keys().tolist()

    pruned_tree = self.tree_prune_rank(self.tree_build(), rank=table.ocx_rank)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so the pruned tree is hung beneath a synthetic root
    from skbio.tree import TreeNode

    rooted_tree = TreeNode(name="fake root")
    rooted_tree.rank = "no rank"
    rooted_tree.append(pruned_tree)

    # then finally run the calculation and return
    metric = "weighted_unifrac" if weighted else "unweighted_unifrac"
    return skbio.diversity.beta_diversity(
        metric, counts, table.index.tolist(), tree=rooted_tree, otu_ids=tax_ids
    )
def unifrac(self, weighted=True, rank="auto"):
    """Calculate the UniFrac beta diversity metric.

    A beta diversity metric that takes into account the relative relatedness
    of community members. Weighted UniFrac looks at abundances, unweighted
    UniFrac looks at presence.

    Parameters
    ----------
    weighted : `bool`
        Calculate the weighted (True) or unweighted (False) distance metric.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.

    Raises
    ------
    OneCodexException
        If ``self._guess_normalized()`` reports that the data is normalized.
    """
    # Import locally so that `skbio.diversity` is guaranteed to be bound and
    # loaded here; `from skbio.tree import TreeNode` below does not bind the
    # `skbio` name or import the `diversity` submodule.
    import skbio.diversity
    from skbio.tree import TreeNode

    # needs read counts, not relative abundances
    if self._guess_normalized():
        raise OneCodexException("UniFrac requires unnormalized read counts.")

    df = self.to_df(rank=rank, normalize=False)

    # one list of per-taxon counts for each row of the table
    counts = [df.loc[c_id].tolist() for c_id in df.index]
    tax_ids = df.keys().tolist()

    tree = self.tree_build()
    tree = self.tree_prune_rank(tree, rank=df.ocx_rank)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so we do a little faking here
    new_tree = TreeNode(name="fake root")
    new_tree.rank = "no rank"
    new_tree.append(tree)

    # then finally run the calculation and return
    metric = "weighted_unifrac" if weighted else "unweighted_unifrac"
    return skbio.diversity.beta_diversity(
        metric, counts, df.index.tolist(), tree=new_tree, otu_ids=tax_ids
    )
def unifrac(self, weighted=True, rank=Rank.Auto):
    """Calculate the UniFrac beta diversity metric.

    UniFrac takes into account the relatedness of community members.
    Weighted UniFrac considers abundances, unweighted UniFrac considers
    presence.

    Parameters
    ----------
    weighted : `bool`
        Calculate the weighted (True) or unweighted (False) distance metric.
    rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
        Analysis will be restricted to abundances of taxa at the specified level.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix, a distance matrix.
    """
    import skbio.diversity

    abundances = self.to_df(rank=rank, normalize=self._guess_normalized())

    # Capture the resolved rank before the table is rescaled below.
    # NOTE(review): presumably the arithmetic result no longer carries
    # `ocx_rank` -- confirm.
    resolved_rank = abundances.ocx_rank

    # The scikit-bio implementations of phylogenetic metrics require integer counts
    # NOTE(review): `10e9` is 1e10, and scaling by a float presumably still
    # yields floating-point values rather than integers -- confirm intent.
    if self._guess_normalized():
        abundances = abundances * 10e9

    taxon_ids = abundances.keys().tolist()

    pruned_tree = self.tree_prune_rank(self.tree_build(), rank=resolved_rank)

    # `scikit-bio` requires that the tree root has no more than 2
    # children, otherwise it considers it "unrooted".
    #
    # https://github.com/biocore/scikit-bio/blob/f3ae1dcfe8ea88e52e19f6693d79e529d05bda04/skbio/diversity/_util.py#L89
    #
    # Our taxonomy root regularly has more than 2 children, so the pruned
    # tree is attached beneath a fake parent of `root` here.
    from skbio.tree import TreeNode

    rooted_tree = TreeNode(name="fake root")
    rooted_tree.rank = "no rank"
    rooted_tree.append(pruned_tree)

    # then finally run the calculation and return
    if not weighted:
        return skbio.diversity.beta_diversity(
            BetaDiversityMetric.UnweightedUnifrac,
            abundances,
            abundances.index,
            tree=rooted_tree,
            otu_ids=taxon_ids,
        )

    # Only the weighted metric is asked for its normalized form.
    return skbio.diversity.beta_diversity(
        BetaDiversityMetric.WeightedUnifrac,
        abundances,
        abundances.index,
        tree=rooted_tree,
        otu_ids=taxon_ids,
        normalized=True,
    )