def test_unweighted_unifrac_symmetry(self):
    # Swapping the two count vectors must not change the distance; every
    # ordered pair of rows in table1 is checked, including a row with itself.
    n_rows = len(self.table1)
    for first in range(n_rows):
        for second in range(n_rows):
            forward = unweighted_unifrac(
                self.table1[first], self.table1[second],
                self.oids1, self.t1)
            backward = unweighted_unifrac(
                self.table1[second], self.table1[first],
                self.oids1, self.t1)
            self.assertAlmostEqual(forward, backward)
def test_unweighted_unifrac_symmetry(self):
    # d(u, v) must equal d(v, u) for every ordered pair of rows in b1.
    size = len(self.b1)
    index_pairs = [(a, b) for a in range(size) for b in range(size)]
    for a, b in index_pairs:
        u_then_v = unweighted_unifrac(self.b1[a], self.b1[b],
                                      self.oids1, self.t1)
        v_then_u = unweighted_unifrac(self.b1[b], self.b1[a],
                                      self.oids1, self.t1)
        self.assertAlmostEqual(u_then_v, v_then_u)
def test_unweighted_extra_tips(self):
    # Unobserved tips in the tree must not affect the result: the distance
    # computed on t1_w_extra_tips has to match the one computed on t1 for
    # every ordered pair of rows in b1.
    n_samples = len(self.b1)
    for row in range(n_samples):
        for col in range(n_samples):
            with_extra_tips = unweighted_unifrac(
                self.b1[row], self.b1[col],
                self.oids1, self.t1_w_extra_tips)
            without_extra_tips = unweighted_unifrac(
                self.b1[row], self.b1[col],
                self.oids1, self.t1)
            self.assertAlmostEqual(with_extra_tips, without_extra_tips)
def test_unweighted_extra_tips(self):
    # UniFrac values are expected to be identical whether or not the tree
    # carries additional tips that no sample observes.
    count = len(self.b1)
    pairs = [(p, q) for p in range(count) for q in range(count)]
    for p, q in pairs:
        padded_tree_value = unweighted_unifrac(
            self.b1[p], self.b1[q], self.oids1, self.t1_w_extra_tips)
        plain_tree_value = unweighted_unifrac(
            self.b1[p], self.b1[q], self.oids1, self.t1)
        self.assertAlmostEqual(padded_tree_value, plain_tree_value)
def test_unweighted_unifrac_non_overlapping(self):
    # These communities only share the root node, so the distance between
    # them is expected to be the maximum, 1.0.
    community_pairs = [
        (self.table1[4], self.table1[5]),
        ([1, 1, 1, 0, 0], [0, 0, 0, 1, 1]),
    ]
    for u_counts, v_counts in community_pairs:
        observed = unweighted_unifrac(
            u_counts, v_counts, self.oids1, self.t1)
        self.assertAlmostEqual(observed, 1.0)
def test_unweighted_unifrac_non_overlapping(self):
    # Communities that only share the root node: expected distance is 1.0.

    # Rows 4 and 5 of the fixture table.
    from_fixture = unweighted_unifrac(
        self.b1[4], self.b1[5], self.oids1, self.t1)
    self.assertAlmostEqual(from_fixture, 1.0)

    # Explicit presence/absence vectors with disjoint support.
    from_literals = unweighted_unifrac(
        [1, 1, 1, 0, 0], [0, 0, 0, 1, 1], self.oids1, self.t1)
    self.assertAlmostEqual(from_literals, 1.0)
def test_unweighted_unifrac_zero_counts(self):
    # Each scenario is (u_counts, v_counts, otu_ids, expected distance):
    #   * one populated sample against an all-zero sample -> 1.0
    #   * two all-zero samples                            -> 0.0
    #   * completely empty inputs                         -> 0.0
    scenarios = [
        ([1, 1, 1, 0, 0], [0, 0, 0, 0, 0], self.oids1, 1.0),
        ([0, 0, 0, 0, 0], [0, 0, 0, 0, 0], self.oids1, 0.0),
        ([], [], [], 0.0),
    ]
    for u_counts, v_counts, ids, wanted in scenarios:
        observed = unweighted_unifrac(u_counts, v_counts, ids, self.t1)
        self.assertAlmostEqual(observed, wanted)
def test_unweighted_minimal_trees(self):
    # Expected values were computed by hand.

    # Zero tips: a tree consisting of the root only, with empty inputs.
    root_only = TreeNode.read(StringIO(u'root;'))
    self.assertEqual(unweighted_unifrac([], [], [], root_only), 0.0)

    # Two tips: only OTU1 is observed, and only in the first sample.
    two_tip_tree = TreeNode.read(
        StringIO(u'(OTU1:0.25, OTU2:0.25)root;'))
    distance = unweighted_unifrac(
        [1, 0], [0, 0], ['OTU1', 'OTU2'], two_tip_tree)
    self.assertEqual(distance, 1.0)
def test_unweighted_unifrac_zero_counts(self):
    # A populated sample against an all-zero sample.
    populated_vs_empty = unweighted_unifrac(
        [1, 1, 1, 0, 0], [0, 0, 0, 0, 0], self.oids1, self.t1)
    self.assertAlmostEqual(populated_vs_empty, 1.0)

    # Two all-zero samples.
    empty_vs_empty = unweighted_unifrac(
        [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], self.oids1, self.t1)
    self.assertAlmostEqual(empty_vs_empty, 0.0)

    # No observations and no OTU ids at all.
    nothing_at_all = unweighted_unifrac([], [], [], self.t1)
    self.assertAlmostEqual(nothing_at_all, 0.0)
def compute_beta_unifraq(df, m):
    """Compute a pairwise UniFrac beta-diversity matrix between samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Count table whose index holds the OTU ids and whose columns are the
        samples (the values are transposed so that each row is one sample).
    m : str
        Metric selector: ``"unifraq"`` for unweighted UniFrac or
        ``"wunifraq"`` for weighted UniFrac.

    Returns
    -------
    numpy.ndarray
        Square ``(n_samples, n_samples)`` symmetric matrix with a zero
        diagonal; sample order follows ``df.columns``.

    Raises
    ------
    ValueError
        If ``m`` is not one of the supported selectors. Previously an unknown
        selector surfaced as an ``UnboundLocalError`` in the middle of the
        pairwise loop, after the tree had already been loaded.
    """
    from skbio.diversity.beta import unweighted_unifrac, weighted_unifrac

    # Resolve the metric once, before any expensive work is done.
    metrics = {"unifraq": unweighted_unifrac, "wunifraq": weighted_unifrac}
    if m not in metrics:
        raise ValueError(
            "Unknown UniFrac metric %r; expected 'unifraq' or 'wunifraq'"
            % (m,))
    metric = metrics[m]

    # Get the phylogenetic tree and drop the OTUs reported as not found
    # (per the original note: OTUs that are not NCBI annotated).
    otu_ids = list(df.index.values)
    tree, notfound = load_taxonomy_tree(otu_ids)
    df = df.drop(list(notfound))

    # The numpy matrix of counts, in which the rows are sample counts.
    mt = df.values.T.astype(np.dtype('int64'))
    # OTU list, recomputed because rows may have been dropped above.
    otu_ids = list(df.index.values)

    # The beta diversity matrix.
    nsamples = len(df.columns)
    bm = np.zeros((nsamples, nsamples))

    # Only the lower triangle is computed; each value is mirrored.
    for i in range(nsamples):
        for j in range(i):
            distance = metric(mt[i], mt[j], otu_ids, tree)
            bm[i, j] = distance
            bm[j, i] = distance
    return bm
def test_unweighted_otus_out_of_order(self):
    # The UniFrac API does not require observations to be in the tip order
    # of the input tree, so reordering ids and counts together must give
    # the same distances.

    # Swap the first and last OTU id in a copy of the id list ...
    reordered_ids = self.oids1[:]
    reordered_ids[0], reordered_ids[-1] = \
        reordered_ids[-1], reordered_ids[0]
    # ... and swap the matching first and last columns of the counts.
    reordered_counts = self.b1.copy()
    reordered_counts[:, [0, -1]] = reordered_counts[:, [-1, 0]]

    n_samples = len(self.b1)
    for row in range(n_samples):
        for col in range(n_samples):
            in_order = unweighted_unifrac(
                self.b1[row], self.b1[col], self.oids1, self.t1)
            out_of_order = unweighted_unifrac(
                reordered_counts[row], reordered_counts[col],
                reordered_ids, self.t1)
            self.assertAlmostEqual(in_order, out_of_order)
def test_unweighted_otus_out_of_order(self):
    # UniFrac does not assert that observations follow the tip order of the
    # input tree; a consistent permutation of ids and count columns must
    # leave every pairwise distance unchanged.
    permuted_ids = self.oids1[:]
    permuted_table = self.b1.copy()
    # Exchange the first and last entries of the ids and the same columns
    # of the count table.
    permuted_ids[0], permuted_ids[-1] = permuted_ids[-1], permuted_ids[0]
    permuted_table[:, [0, -1]] = permuted_table[:, [-1, 0]]

    total = len(self.b1)
    index_pairs = [(a, b) for a in range(total) for b in range(total)]
    for a, b in index_pairs:
        original_order = unweighted_unifrac(
            self.b1[a], self.b1[b], self.oids1, self.t1)
        permuted_order = unweighted_unifrac(
            permuted_table[a], permuted_table[b], permuted_ids, self.t1)
        self.assertAlmostEqual(original_order, permuted_order)
def test_unweighted_minimal_trees(self):
    # Two tips: OTU1 is present in the first sample only, OTU2 in neither.
    two_tip_tree = TreeNode.read(StringIO('(OTU1:0.25, OTU2:0.25)root;'))
    tip_names = ['OTU1', 'OTU2']
    distance = unweighted_unifrac([1, 0], [0, 0], tip_names, two_tip_tree)
    self.assertEqual(distance, 1.0)
def test_unweighted_unifrac_qiime_tiny_test(self):
    # Reference distances are read from the stored distance matrix file
    # and compared against every ordered pair of columns in q_table.
    reference_path = get_data_path(
        os.path.join('qiime-191-tt', 'unweighted_unifrac_dm.txt'), 'data')
    reference = DistanceMatrix.read(reference_path)

    for first_id in self.q_table.columns:
        for second_id in self.q_table.columns:
            computed = unweighted_unifrac(
                self.q_table[first_id],
                self.q_table[second_id],
                otu_ids=self.q_table.index,
                tree=self.q_tree)
            self.assertAlmostEqual(computed,
                                   reference[first_id, second_id])
def test_unweighted_root_not_observed(self):
    # Expected values were computed with QIIME 1.9.1 and by hand.
    #
    # The expected values are deliberately spelled out as branch-length
    # arithmetic: which branches count as shared was a point of confusion,
    # so the individual terms are kept for future reference.

    # Root node not observed, but the branch between (OTU1, OTU2) and the
    # root is considered shared.
    observed = unweighted_unifrac(
        [1, 1, 0, 0], [1, 0, 0, 0], self.oids2, self.t2)
    unique_length = 0.2
    observed_length = 0.1 + 0.2 + 0.3
    # 0.3333333333
    self.assertAlmostEqual(observed, unique_length / observed_length)

    # Root node not observed, but the branch between (OTU3, OTU4) and the
    # root is considered shared.
    observed = unweighted_unifrac(
        [0, 0, 1, 1], [0, 0, 1, 0], self.oids2, self.t2)
    unique_length = 0.7
    observed_length = 1.1 + 0.5 + 0.7
    # 0.3043478261
    self.assertAlmostEqual(observed, unique_length / observed_length)
def get_unifrac(
    otu_file_1: pathlib.Path,
    otu_file_2: pathlib.Path,
    tree_file: pathlib.Path,
    weighted: bool,
    threshold: int,
):
    """Compute per-column UniFrac distances between two OTU tables.

    Both tables are loaded with ``load_table`` and converted to dense
    dataframes, the tree is read with ``TreeNode.read``, and one distance is
    computed for every ``(u, v, otu_ids, col)`` tuple yielded by
    ``get_vectors(otu_1, otu_2, threshold)``.

    Parameters
    ----------
    otu_file_1, otu_file_2 : pathlib.Path
        Paths of the two OTU tables.
    tree_file : pathlib.Path
        Path of the phylogenetic tree.
    weighted : bool
        If true, use normalized weighted UniFrac; otherwise unweighted.
    threshold : int
        Forwarded unchanged to ``get_vectors``.

    Returns
    -------
    tuple
        ``(distances, n_rows_1, n_rows_2)`` where ``distances`` is a
        ``pd.Series`` keyed by the ``col`` values and the two integers are
        the row counts of the first and second table.
    """
    first_table = load_table(str(otu_file_1)).to_dataframe(dense=True)
    second_table = load_table(str(otu_file_2)).to_dataframe(dense=True)
    phylogeny = TreeNode.read(str(tree_file))

    def _distance(u, v, otu_ids):
        # Input validation is requested for both flavours of the metric.
        if weighted:
            return weighted_unifrac(
                u, v, otu_ids, phylogeny, normalized=True, validate=True
            )
        return unweighted_unifrac(u, v, otu_ids, phylogeny, validate=True)

    distances = {
        col: _distance(u, v, otu_ids)
        for u, v, otu_ids, col in get_vectors(
            first_table, second_table, threshold
        )
    }
    return pd.Series(distances), first_table.shape[0], second_table.shape[0]
from io import StringIO
from skbio import TreeNode
from skbio.diversity.beta import unweighted_unifrac

# --- Part 1: unweighted UniFrac between two samples on an eight-tip tree ---

# The tree is given in Newick form, with branch lengths.
tree = TreeNode.read(
    StringIO(
        '(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):0.0,'
        '(OTU4:0.75,(OTU5:0.5,((OTU6:0.33,OTU7:0.62):0.5'
        ',OTU8:0.5):0.5):0.5):1.25):0.0)root;'
    )
)

# Tip names, in the same order as the entries of the count vectors below.
otu_ids = ['OTU1', 'OTU2', 'OTU3', 'OTU4', 'OTU5', 'OTU6', 'OTU7', 'OTU8']
u_counts = [1, 0, 0, 4, 1, 2, 3, 0]
v_counts = [0, 1, 1, 6, 0, 1, 0, 0]

uu = unweighted_unifrac(u_counts, v_counts, otu_ids, tree)
print(round(uu, 2))

# --- Part 2: walking from a leaf up to the root of an ete3 tree ---

from ete3 import PhyloTree

t = PhyloTree('((H,I), A, (B,(C,D)))root;', format=1)
print(t)

# Look up leaf D and collect every node on the way up from D; the loop
# stops at the node that has no parent, so that node is not appended.
D = t & "D"
node = D
path = []
while node.up:
    path.append(node)
    node = node.up

# D itself is in the collected path, hence the subtraction of one.
print("There are", len(path) - 1, "nodes between D and the root")

# Look up leaf A as the next starting point.
A = t & "A"
node = A
def test_unweighted_unifrac(self):
    # Expected results were derived from QIIME 1.9.1, which is a completely
    # different implementation from skbio's initial unweighted UniFrac
    # implementation.
    #
    # Each entry is (row index of u, row index of v, expected distance).
    reference = [
        # sample A versus all
        (0, 1, 0.238095238095),
        (0, 2, 0.52),
        (0, 3, 0.52),
        (0, 4, 0.545454545455),
        (0, 5, 0.619047619048),
        # sample B versus remaining
        (1, 2, 0.347826086957),
        (1, 3, 0.347826086957),
        (1, 4, 0.68),
        (1, 5, 0.421052631579),
        # sample C versus remaining
        (2, 3, 0.0),
        (2, 4, 0.68),
        (2, 5, 0.421052631579),
        # sample D versus remaining
        (3, 4, 0.68),
        (3, 5, 0.421052631579),
        # sample E versus remaining
        (4, 5, 1.0),
    ]
    for u_row, v_row, wanted in reference:
        observed = unweighted_unifrac(
            self.b1[u_row], self.b1[v_row], self.oids1, self.t1)
        self.assertAlmostEqual(observed, wanted)
def test_unweighted_unifrac_identity(self):
    # The distance from any sample to itself must be zero.
    n_samples = len(self.b1)
    for idx in range(n_samples):
        sample = self.b1[idx]
        self_distance = unweighted_unifrac(
            sample, sample, self.oids1, self.t1)
        self.assertAlmostEqual(self_distance, 0.0)
def test_unweighted_unifrac(self):
    # Expected results were derived from QIIME 1.9.1, which is a completely
    # different implementation from skbio's initial unweighted UniFrac
    # implementation.
    #
    # Keys are (row index of u, row index of v) in table1; the list keeps
    # the comparisons in a fixed order.
    expected_by_pair = [
        # sample A versus all
        ((0, 1), 0.238095238095),
        ((0, 2), 0.52),
        ((0, 3), 0.52),
        ((0, 4), 0.545454545455),
        ((0, 5), 0.619047619048),
        # sample B versus remaining
        ((1, 2), 0.347826086957),
        ((1, 3), 0.347826086957),
        ((1, 4), 0.68),
        ((1, 5), 0.421052631579),
        # sample C versus remaining
        ((2, 3), 0.0),
        ((2, 4), 0.68),
        ((2, 5), 0.421052631579),
        # sample D versus remaining
        ((3, 4), 0.68),
        ((3, 5), 0.421052631579),
        # sample E versus remaining
        ((4, 5), 1.0),
    ]
    for (u_row, v_row), wanted in expected_by_pair:
        observed = unweighted_unifrac(
            self.table1[u_row], self.table1[v_row], self.oids1, self.t1)
        self.assertAlmostEqual(observed, wanted)
def test_unweighted_unifrac_kwargs(self):
    # Confirm that arbitrary **kwargs can be passed: an unrecognized keyword
    # is supplied and the self-distance of row 0 must still come out as 0.
    first_row = self.table1[0]
    result = unweighted_unifrac(
        first_row, first_row, self.oids1, self.t1,
        not_a_known_parameter=42)
    self.assertAlmostEqual(result, 0.0)
def test_unweighted_unifrac_identity(self):
    # Comparing a sample with itself must yield a distance of zero.
    for position in range(len(self.table1)):
        same_sample_distance = unweighted_unifrac(
            self.table1[position], self.table1[position],
            self.oids1, self.t1)
        self.assertAlmostEqual(same_sample_distance, 0.0)