def test_match_inputs_ignore_missing_samples_error(self):
    """A table sample missing from the sample metadata raises an error.

    No override flag is passed to match_inputs() here, so the mismatch is
    expected to surface as a tools.DataMatchingError.
    """
    tree = Tree.from_tree(self.tree)
    # Replace one of the sample IDs in the table with a junk name
    mismatched_table = self.table.copy()
    mismatched_table.columns = ["Sample1", "Sample2", "Whatever", "Sample4"]
    expected_msg = (
        "The feature table contains samples that aren't present in the "
        "sample metadata."
    )
    with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
        tools.match_inputs(tree, mismatched_table, self.sample_metadata)
def test_match_inputs_only_1_feature_in_table(self):
    """A feature table containing just one feature is accepted.

    This is technically allowed (so long as this 1 feature is a tree tip).
    No feature metadata is passed, so both feature metadata outputs are
    expected to be None.
    """
    tree = Tree.from_tree(self.tree)
    one_feature_table = self.table.loc[["a"]]
    matched = tools.match_inputs(
        tree, one_feature_table, self.sample_metadata
    )
    out_table, out_sample_md, tip_md, int_md = matched
    # Neither the table nor the sample metadata should have been altered
    assert_frame_equal(out_table, one_feature_table)
    assert_frame_equal(out_sample_md, self.sample_metadata)
    self.assertIsNone(tip_md)
    self.assertIsNone(int_md)
def test_nonroot_missing_branchlengths(self):
    """Non-root nodes without branch lengths make Tree.from_tree() fail.

    Each newick string in ``bad_newicks`` is expected to raise a ValueError
    whose message matches "must have lengths"; each string in
    ``ok_newicks`` is expected to load without raising.

    Every input is run inside its own subTest so that a failure names the
    offending newick string and the remaining inputs are still exercised.
    """
    # Note about the fourth test tree here: the reason this triggers a
    # missing-branch-length error before a negative-branch-length error is
    # because the tree is checked in postorder. This sort of "precedence"
    # can be changed in the future if desired.
    bad_newicks = [
        '((b)a:1)root:1;',
        '((b:1)a)root:0;',
        '(b,c)a;',
        '((b)a:-1)root:3;',
        '((b:0,c)a:0)root:0;',
    ]
    for nwk in bad_newicks:
        with self.subTest(newick=nwk):
            st = TreeNode.read([nwk])
            with self.assertRaisesRegex(ValueError, "must have lengths"):
                Tree.from_tree(st)

    # Check that roots *with* missing branch lengths don't trigger an error
    # on tree creation
    ok_newicks = ['((b:0,c:1)a:0)root;']
    for nwk in ok_newicks:
        with self.subTest(newick=nwk):
            st = TreeNode.read([nwk])
            Tree.from_tree(st)
def test_match_inputs_filter_missing_features_error(self):
    """A table feature that is not a tip in the tree raises an error.

    No override flag is passed to match_inputs() here, so the mismatch is
    expected to surface as a tools.DataMatchingError.
    """
    tree = Tree.from_tree(self.tree)
    # Relabel the table's features so that one of them carries an internal
    # node ID instead of a tip ID. This isn't ok.
    mismatched_table = self.table.copy()
    mismatched_table.index = ["a", "b", "e", "g"]
    expected_msg = (
        "The feature table contains features that aren't present as tips "
        "in the tree."
    )
    with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
        tools.match_inputs(tree, mismatched_table, self.sample_metadata)
def test_missing_root_length_tree_rect_layout(self):
    """Rectangular layout of a straight-line tree with a length-less root.

    Checks that the layout still works ok when the root node has no
    assigned branch length.
    """
    tree = Tree.from_tree(TreeNode.read(['((b:2)a:1)root;']))
    tree.coords(100, 100)

    # Every expected position has y = 0
    self.check_coords(
        tree, "xr", "yr", [(100, 0.0), (100 / 3.0, 0.0), (0.0, 0.0)]
    )
    for internal in tree.non_tips():
        self.assertEqual(internal.lowest_child_yr, 0)
        self.assertEqual(internal.highest_child_yr, 0)
    self.check_basic_tree_rect_layout(tree)
def test_unrooted_layout(self):
    """The unrooted layout's (x2, y2) coordinates match expected values."""
    tree = Tree.from_tree(self.tree)
    tree.coords(500, 500)

    # One (x2, y2) pair per node, in the order check_coords() visits them
    expected_xy = [
        (-10.222747306219219, 195.06163867407446),
        (118.00044943013512, 262.22444928198297),
        (36.73032180166217, 137.07942714215795),
        (184.76890317443747, 23.95196521134946),
        (40.6350638142365, 62.57251106991248),
        (-77.36538561589865, -199.6519382120705),
        (-290.23109682556253, -205.35762294073118),
        (-81.27012762847295, -125.14502213982503),
        (0.0, 0.0),
    ]
    self.check_coords(tree, "x2", "y2", expected_xy)
def test_straightline_tree_rect_layout(self):
    """Rectangular layout of a tree that has no real "branches".

    Checks that all nodes are drawn as expected even when the tree is just
    a straight line of nodes.
    """
    # The root length is set to 100 to demonstrate/verify that root length
    # is not taken into account (if this behavior changes we'll need to
    # modify this test, rightfully so)
    tree = Tree.from_tree(TreeNode.read(['((b:2)a:1)root:100;']))
    tree.coords(100, 100)

    # Every expected position has y = 0
    self.check_coords(
        tree, "xr", "yr", [(100, 0.0), (100 / 3.0, 0.0), (0.0, 0.0)]
    )
    for internal in tree.non_tips():
        self.assertEqual(internal.lowest_child_yr, 0)
        self.assertEqual(internal.highest_child_yr, 0)
    self.check_basic_tree_rect_layout(tree)
def test_match_inputs_ignore_missing_samples_override(self):
    """Checks that --p-ignore-missing-samples works as expected.

    With ignore_missing_samples=True, a table sample that is missing from
    the sample metadata should produce a warning and placeholder metadata
    instead of an error.
    """
    all_ids = ["Sample1", "Sample2", "Whatever", "Sample4"]
    known_ids = ["Sample1", "Sample2", "Sample4"]

    tree = Tree.from_tree(self.tree)
    # Replace one of the sample IDs in the table with a junk name
    mismatched_table = self.table.copy()
    mismatched_table.columns = ["Sample1", "Sample2", "Whatever", "Sample4"]

    expected_warning = (
        r"1 sample\(s\) in the table were not present in the sample "
        r"metadata. These sample\(s\) have been assigned placeholder "
        "metadata."
    )
    with self.assertWarnsRegex(tools.DataMatchingWarning, expected_warning):
        out_table, out_sm, _, _ = tools.match_inputs(
            tree, mismatched_table, self.sample_metadata,
            ignore_missing_samples=True
        )

    self.assertCountEqual(out_table.columns, all_ids)
    self.assertCountEqual(out_sm.index, all_ids)

    # Make sure the table stays consistent
    assert_frame_equal(out_table, mismatched_table)

    # ...And that the placeholder metadata was added in for the "Whatever"
    # sample correctly
    placeholder_row = out_sm.loc["Whatever"]
    self.assertTrue((placeholder_row == "This sample has no metadata").all())

    # ... And that, with the exception of the newly added placeholder
    # metadata, the sample metadata is also consistent. (The dtypes of
    # individual columns can change if placeholder metadata was added,
    # since the "This sample has no metadata" thing is just a string.)
    # (...And *that* shouldn't impact Empress since Empress stores all
    # sample metadata as strings. At least as of writing this.)
    assert_frame_equal(
        out_sm.loc[known_ids],
        self.sample_metadata.loc[known_ids],
        check_dtype=False
    )
def test_match_inputs_feature_metadata_no_features_in_tree(self):
    """Feature metadata sharing no feature names with the tree is an error.

    If every feature described in the feature metadata is absent from the
    tree (both as a tip and as an internal node), match_inputs() is
    expected to raise a tools.DataMatchingError.
    """
    tree = Tree.from_tree(self.tree)
    # Replace the feature metadata's index with plain integers
    unmatched_fm = self.feature_metadata.copy()
    unmatched_fm.index = range(len(self.feature_metadata.index))
    expected_msg = (
        "No features in the feature metadata are present in the tree, "
        "either as tips or as internal nodes."
    )
    with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
        tools.match_inputs(
            tree, self.table, self.sample_metadata, unmatched_fm
        )
def test_circular_layout(self):
    """Test to make sure the circular layout computes what we expect it to.

    For each node, the circular layout computes the following things:
        (xc0, yc0) - the starting location for each node
        (xc1, yc1) - the ending location for each node

    Then, all non-root internal nodes have an arc that connects the
    "starting points" of the children with the minimum and maximum angle:
        (arcx0, arcy0) - the starting location for the arc
        highest_child_clangle - the starting angle for the arc
        lowest_child_clangle - the ending angle for the arc
    """
    tree = Tree.from_tree(TreeNode.read(["((d:4,c:3)b:2,a:1)root:1;"]))
    tree.coords(100, 100)

    # Starting location for each node.
    # Note: nodes 'a' and 'b' should have the same starting coordinates
    # since they both start at the root.
    start_points = [
        (38.490018, 0.0),
        (-19.245009, 33.333333),
        (0.0, 0.0),
        (0.0, 0.0),
        (0.0, 0.0),
    ]
    self.check_coords(tree, "xc0", "yc0", start_points)

    # Ending location for each node
    end_points = [
        (115.470054, 0.0),
        (-48.112522, 83.333333),
        (19.245009, 33.333333),
        (-9.622504, -16.666667),
        (0.0, 0.0),
    ]
    self.check_coords(tree, "xc1", "yc1", end_points)

    node_b = tree.find("b")

    # Starting location for b's arc
    arc_x, arc_y = -19.245009, 33.333333
    self.assertAlmostEqual(node_b.arcx0, arc_x, places=5)
    self.assertAlmostEqual(node_b.arcy0, arc_y, places=5)

    # b's arc angles (starting angle, then ending angle)
    start_angle, end_angle = 2.0943951, 0.0
    self.assertAlmostEqual(node_b.highest_child_clangle, start_angle)
    self.assertAlmostEqual(node_b.lowest_child_clangle, end_angle)
def test_match_inputs_no_tips_in_table(self):
    """A table sharing no features with the tree's tips always errors.

    The same error is expected both by default and when
    filter_missing_features=True is passed.
    """
    tree = Tree.from_tree(self.tree)
    # Replace the table's feature IDs with plain integers
    unmatched_table = self.table.copy()
    unmatched_table.index = range(len(self.table.index))
    expected_msg = (
        "No features in the feature table are present as tips in the tree."
    )
    # First pass: default behavior. Second pass: check that
    # --p-filter-missing-features still doesn't work to override this,
    # since there are NO matching features at all.
    for extra_kwargs in ({}, {"filter_missing_features": True}):
        with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
            tools.match_inputs(
                tree, unmatched_table, self.sample_metadata, **extra_kwargs
            )
def test_match_inputs_no_shared_samples(self):
    """Sample metadata sharing no samples with the table always errors.

    The same error is expected both by default and when
    ignore_missing_samples=True is passed.
    """
    tree = Tree.from_tree(self.tree)
    # Give every sample in the metadata a name that the table doesn't use
    unmatched_sample_md = self.sample_metadata.copy()
    unmatched_sample_md.index = ["lol", "nothing", "here", "matches"]
    expected_msg = (
        "No samples in the feature table are present in the sample "
        "metadata."
    )
    # First pass: default behavior. Second pass: check that
    # --p-ignore-missing-samples still doesn't work to override this,
    # since there are NO matching samples at all.
    for extra_kwargs in ({}, {"ignore_missing_samples": True}):
        with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
            tools.match_inputs(
                tree, self.table, unmatched_sample_md, **extra_kwargs
            )
def test_match_inputs_feature_metadata_root_metadata_allowed(self):
    """Tests that feature metadata for the root node is preserved."""
    # Slightly modified version of self.tree where root has a name (i)
    named_root_newick = '(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2)i:1;'
    tree = Tree.from_tree(TreeNode.read([named_root_newick]))

    fm = self.feature_metadata.copy()
    fm.index = ["a", "g", "i"]
    matched = tools.match_inputs(
        tree, self.table, self.sample_metadata, fm
    )
    out_table, out_sample_md, tip_fm, int_fm = matched

    # (check that we didn't mess up the table / sample metadata matching by
    # accident)
    assert_frame_equal(out_table, self.table)
    assert_frame_equal(out_sample_md, self.sample_metadata)

    # Main point of this test: all of the feature metadata should have been
    # kept, since a, g, and i are all included in the tree (i in particular
    # is important to verify, since it's the root)
    expected_fm = split_taxonomy(fm)
    assert_frame_equal(tip_fm, expected_fm.loc[["a"]])
    assert_frame_equal(
        int_fm, expected_fm.loc[["g", "i"]], check_like=True
    )
def test_match_inputs_feature_metadata_only_internal_node_metadata(self):
    """Tests that feature metadata only for internal nodes is allowed."""
    # Slightly modified version of self.tree where root has a name (i)
    named_root_newick = '(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2)i:1;'
    tree = Tree.from_tree(TreeNode.read([named_root_newick]))

    fm = self.feature_metadata.copy()
    fm.index = ["h", "g", "i"]
    matched = tools.match_inputs(
        tree, self.table, self.sample_metadata, fm
    )
    out_table, out_sample_md, tip_fm, int_fm = matched

    assert_frame_equal(out_table, self.table)
    assert_frame_equal(out_sample_md, self.sample_metadata)

    expected_fm = split_taxonomy(fm)
    # 1) Check that tip metadata is empty
    self.assertEqual(len(tip_fm.index), 0)
    # 2) Check that internal node metadata was preserved
    assert_frame_equal(int_fm, expected_fm.loc[fm.index], check_like=True)
    # 3) Check that columns on both DFs are identical
    for md in (tip_fm, int_fm):
        self.assertListEqual(list(md.columns), self.exp_split_fm_cols)
def test_match_inputs_feature_metadata_nothing_dropped(self):
    """Tests that tip/internal node names are allowed as feat. md. entries.

    (self.feature_metadata describes three features, "e", "h", and "a". h
    is an internal node in self.tree, and e and a are tips.)
    """
    tree = Tree.from_tree(self.tree)
    matched = tools.match_inputs(
        tree, self.table, self.sample_metadata, self.feature_metadata
    )
    out_table, out_sample_md, tip_md, int_md = matched

    assert_frame_equal(out_table, self.table)
    assert_frame_equal(out_sample_md, self.sample_metadata)

    # Check that no filtering had to be done -- only differences in output
    # and input feature metadata should be that 1) the output is split into
    # two DataFrames, one for tip and one for internal node metadata, and
    # 2) the taxonomy column was split up.
    expected_tip_md = self.split_tax_fm.loc[["e", "a"]]
    expected_int_md = self.split_tax_fm.loc[["h"]]
    assert_frame_equal(tip_md, expected_tip_md, check_like=True)
    assert_frame_equal(int_md, expected_int_md)

    # Check that the tip + internal node metadata have identical columns
    for md in (tip_md, int_md):
        self.assertListEqual(list(md.columns), self.exp_split_fm_cols)
def test_circular_layout_scaling_factor(self):
    """Checks that the circular layout's scaling preserves branch lengths.

    The scaling factor applied at the end of the circular layout
    calculation should keep each node's length in the circular layout
    space proportional to its branch length.
    """
    tree = Tree.from_tree(TreeNode.read(["((d:4,c:3)b:2,a:1)root:1;"]))
    tree.coords(100, 100)

    # All nodes' lengths (besides the root, which is represented by a
    # point) in the circular layout space should have roughly the same
    # proportion to their branch length.
    #
    # For example, in the above tree, if d's length in the circular layout
    # space is 1.5x larger than its branch length, then all nodes should
    # be roughly 1.5x larger than their branch lengths.
    reference_ratio = None
    for node in tree.preorder(include_self=False):
        dx = node.xc1 - node.xc0
        dy = node.yc1 - node.yc0
        ratio = sqrt(dx**2 + dy**2) / node.length
        if reference_ratio is None:
            # The first node visited sets the ratio the others must match
            reference_ratio = ratio
            continue
        self.assertAlmostEqual(reference_ratio, ratio, places=5)
def test_from_tree_node_starts_with_EmpressNode(self):
    """A node name beginning with "EmpressNode" makes from_tree() fail."""
    source_tree = TreeNode.read(['((a:1,b:3)c:2,EmpressNode1:5)e:2;'])
    expected_msg = 'Node names can\'t start with "EmpressNode"'
    with self.assertRaisesRegex(ValueError, expected_msg):
        Tree.from_tree(source_tree)
def test_from_tree_duplicate_tip_names(self):
    """Two tips sharing the same name make from_tree() fail."""
    # The tip name "i" appears twice in this tree
    source_tree = TreeNode.read(['((i:1,a:3)b:2,i:5)r:2;'])
    expected_msg = "Tip names in the tree must be unique"
    with self.assertRaisesRegex(ValueError, expected_msg):
        Tree.from_tree(source_tree)
def test_from_tree_singlenode(self):
    """A tree made up of a single node makes from_tree() fail."""
    source_tree = TreeNode.read(['i:1;'])
    expected_msg = "must contain at least 2 nodes"
    with self.assertRaisesRegex(ValueError, expected_msg):
        Tree.from_tree(source_tree)
def test_rectangular_layout(self):
    """Rectangular layout coordinates and child y-extents match expectations.

    Checks each node's (xr, yr) position, then the lowest_child_yr /
    highest_child_yr attributes of the nodes yielded by non_tips(), and
    finally that tips do not carry those two attributes at all.
    """
    tree = Tree.from_tree(self.tree)
    tree.coords(500, 500)

    # Why do these coordinates look like this for such a simple tree?
    # There are a few steps.
    #
    # 1. Compute initial y-coordinates of layout: tips are assigned to
    #    y=0, y=1, y=2, ... up to y=|tips|-1, and internal nodes are
    #    positioned at the average of their childrens' y-positions.
    #
    # 2. Compute initial x-coordinates of layout: root starts at x=0, and
    #    each child C with parent P is assigned x = P.x + C.branch_length.
    #    (...those aren't real attribute names, this is just pseudocode)
    #
    # 3. Positions are scaled relative to the maximum width and height.
    #    With this example tree, there are 5 tips so the maximum height is
    #    4 (since heights are 0-indexed), and the "farthest right" node is
    #    d (at x=5). So we scale y-positions by 500 / 4 = 125, and we
    #    scale x-positions by 500 / 5 = 100. (The "500"s are used here just
    #    because these are the dimensions passed to coords().)
    #
    # 4. At this point we're done with Tree.layout_rectangular(), but
    #    coords() still needs to alter coordinates to be relative to the
    #    root node's coordinates. So every node's x-coordinate is
    #    subtracted by the root's x=0 (this does nothing), and every node's
    #    y-coordinate is subtracted by the root's y=(2.375*125)=296.875.
    #
    # So TLDR this is why a's coordinates go from (3, 0) on the first pass
    # to ((3 * 100) - 0, (0 * 125) - 296.875) = (300, -296.875) in the end.
    expected_xy = [
        (300, -296.875),  # a
        (400, -171.875),  # e
        (200, -234.375),  # f
        (300, -46.875),   # b
        (100, -140.625),  # g
        (300, 78.125),    # c
        (500, 203.125),   # d
        (200, 140.625),   # h
        (0.0, 0.0),       # i (root)
    ]
    self.check_coords(tree, "xr", "yr", expected_xy)

    # Check that lowest_child_yr and highest_child_yr attributes were set
    # properly. We do this by iterating over tree.non_tips(), which (like
    # check_coords()) also uses a post-order traversal.
    # (Note that the "coordinates" in this list of 2-tuples are ordered as
    # (lowest child y-coordinate, highest child y-coordinate). Computing
    # these from the list above should be pretty simple.)
    # NOTE(review): confirm that non_tips() yields the root here; if it
    # doesn't, the final (i) entry below is never compared.
    expected_child_extents = [
        (-296.875, -171.875),  # f
        (-234.375, -46.875),   # g
        (78.125, 203.125),     # h
        (-140.625, 140.625),   # i
    ]
    for position, internal in enumerate(tree.non_tips()):
        lowest, highest = expected_child_extents[position]
        self.assertTrue(hasattr(internal, "lowest_child_yr"))
        self.assertTrue(hasattr(internal, "highest_child_yr"))
        self.assertAlmostEqual(internal.lowest_child_yr, lowest, places=5)
        self.assertAlmostEqual(internal.highest_child_yr, highest, places=5)

    # ... And also check that tip nodes *don't* have these attributes,
    # since tips don't have children.
    for tip in tree.tips():
        self.assertFalse(hasattr(tip, "lowest_child_yr"))
        self.assertFalse(hasattr(tip, "highest_child_yr"))