Esempio n. 1
0
 def test_match_inputs_ignore_missing_samples_error(self):
     """Check that a table sample missing from the metadata is an error.

     A copy of the table gets one junk column label ("Whatever"), and
     match_inputs() is expected to raise a DataMatchingError about it.
     """
     expected_msg = (
         "The feature table contains samples that aren't present in the "
         "sample metadata."
     )
     mismatched_table = self.table.copy()
     # One of the sample IDs below is junk on purpose
     mismatched_table.columns = ["Sample1", "Sample2", "Whatever", "Sample4"]
     with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
         tools.match_inputs(
             self.bp_tree, mismatched_table, self.sample_metadata
         )
Esempio n. 2
0
 def test_match_inputs_filter_missing_features_error(self):
     """Check that a table feature that isn't a tree tip is an error.

     The table is rebuilt with the observation IDs a, b, e and g, so that
     one tip ID is replaced by an internal node ID.
     """
     expected_msg = (
         "The feature table contains features that aren't present as tips "
         "in the tree."
     )
     # Using an internal node ID as a feature ID is not acceptable.
     feature_ids = list('abeg')
     sample_ids = self.table.ids()
     bad_table = biom.Table(self.table.matrix_data, feature_ids, sample_ids)
     with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
         tools.match_inputs(self.bp_tree, bad_table, self.sample_metadata)
Esempio n. 3
0
 def test_match_inputs_ignore_missing_samples_error(self):
     """Check that a table sample missing from the metadata is an error.

     The table is rebuilt with one junk sample ID ("Whatever"), and
     match_inputs() is expected to raise a DataMatchingError about it.
     """
     expected_msg = (
         "The feature table contains samples that aren't present in the "
         "sample metadata."
     )
     feature_ids = self.table.ids(axis='observation')
     # One of the sample IDs below is junk on purpose
     sample_ids = ['Sample1', 'Sample2', 'Whatever', 'Sample4']
     bad_table = biom.Table(self.table.matrix_data, feature_ids, sample_ids)
     with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
         tools.match_inputs(self.bp_tree, bad_table, self.sample_metadata)
Esempio n. 4
0
    def test_disjoint_table_and_ordination(self):
        """Check that an ordination sharing no samples with the table fails.

        The ordination's sample index is overwritten with "Zample*" IDs
        before matching, and a DataMatchingError is expected.
        """
        renamed_samples = ['Zample1', 'Zample2', 'Zample3', 'Zample4']
        self.ordination.samples.index = pd.Index(renamed_samples)

        expected_msg = (
            "No samples in the feature table are present in the ordination"
        )
        with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
            tools.match_inputs(
                self.bp_tree,
                self.table,
                self.sample_metadata,
                ordination=self.ordination,
            )
Esempio n. 5
0
 def test_match_inputs_filter_missing_features_error(self):
     """Check that a table feature that isn't a tree tip is an error.

     A copy of the table is re-indexed as a, b, e, g, so that one tip ID
     is replaced by an internal node ID.
     """
     expected_msg = (
         "The feature table contains features that aren't present as tips "
         "in the tree."
     )
     mismatched_table = self.table.copy()
     # An internal node ID stands in for one of the tip IDs here, which is
     # not acceptable.
     mismatched_table.index = ["a", "b", "e", "g"]
     with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
         tools.match_inputs(
             self.bp_tree, mismatched_table, self.sample_metadata
         )
Esempio n. 6
0
    def test_ordination_is_superset(self):
        """Check that an ordination with more samples than the table fails.

        A table with only Sample1-Sample3 is matched against
        self.ordination, and a DataMatchingError is expected.
        """
        counts = np.array([[1, 2, 0, 4], [8, 7, 0, 5], [1, 0, 0, 0]]).T
        sample_ids = ['Sample1', 'Sample2', 'Sample3']
        table = biom.Table(counts, list('abed'), sample_ids)

        expected_msg = "The ordination has more samples than the feature table"
        with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
            tools.match_inputs(
                self.bp_tree,
                table,
                self.sample_metadata,
                ordination=self.ordination,
            )
Esempio n. 7
0
 def test_match_inputs_feature_metadata_no_features_in_tree(self):
     """Check the error raised when no feature metadata entry is in the tree.

        Feature names that match neither a tip nor an internal node of the
        tree get filtered out of the feature metadata; if that filtering
        removes every feature, an error should be raised.
     """
     expected_msg = (
         "No features in the feature metadata are present in the tree, "
         "either as tips or as internal nodes."
     )
     n_features = len(self.feature_metadata.index)
     bad_fm = self.feature_metadata.copy()
     # Integer positions replace all of the feature names
     bad_fm.index = range(n_features)
     with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
         tools.match_inputs(
             self.bp_tree, self.table, self.sample_metadata, bad_fm
         )
Esempio n. 8
0
    def test_table_is_superset_raises(self):
        """Check that a table with more samples than the ordination fails.

        The table has a Sample5 in addition to Sample1-Sample4, and the
        expected error message names Sample5 as the problematic sample.
        """
        counts = np.array([
            [1, 2, 0, 4],
            [8, 7, 0, 5],
            [1, 0, 0, 0],
            [1, 0, 0, 0],
            [1, 0, 4, 0],
        ]).T
        sample_ids = ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5']
        table = biom.Table(counts, list('abed'), sample_ids)

        expected_msg = (
            "The feature table has more samples than the ordination. These are"
            " the problematic sample identifiers: Sample5. You can override "
            "this error by using the --p-filter-extra-samples flag"
        )
        with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
            tools.match_inputs(
                self.bp_tree,
                table,
                self.sample_metadata,
                ordination=self.ordination,
            )
Esempio n. 9
0
    def test_match_inputs_feature_metadata_nothing_dropped(self):
        """Check that feature metadata may name both tips and internal nodes.

           (The three features in self.feature_metadata are "e", "h", and
            "a": e and a are tips of self.tree, while h is an internal node.)
        """
        matched = tools.match_inputs(
            self.bp_tree, self.table, self.sample_metadata,
            self.feature_metadata
        )
        out_table, out_sample_md, tip_fm, internal_fm, tax_columns = matched
        self.assertEqual(out_table, self.table)
        assert_frame_equal(out_sample_md, self.sample_metadata)
        # Nothing should have been filtered. The output feature metadata
        # should only differ from the input in that 1) it comes back as two
        # DataFrames (one for tips, one for internal nodes), and 2) its
        # taxonomy column has been split up.
        expected_tip_fm = self.split_tax_fm.loc[["e", "a"]]
        assert_frame_equal(tip_fm, expected_tip_fm, check_like=True)
        expected_internal_fm = self.split_tax_fm.loc[["h"]]
        assert_frame_equal(internal_fm, expected_internal_fm)
        # Tip and internal node metadata should share the same columns
        self.assertListEqual(list(tip_fm.columns), self.exp_split_fm_cols)
        self.assertListEqual(list(internal_fm.columns), self.exp_split_fm_cols)
        # Finally, validate the split-up taxonomy columns
        assert_taxcols_ok(tax_columns)
Esempio n. 10
0
 def test_match_inputs_feature_metadata_some_features_dropped(self):
     """Check filtering when only part of the feature metadata is dropped.

        The feature metadata names one feature that is in the tree ("e")
        and two that are not ("asdf" and "hjkl").
     """
     # Only "e" should survive the filtering, since "asdf" and "hjkl" do
     # not occur in the tree
     bad_fm = self.feature_metadata.copy()
     bad_fm.index = ["e", "asdf", "hjkl"]
     matched = tools.match_inputs(
         self.bp_tree, self.table, self.sample_metadata, bad_fm
     )
     out_table, out_sample_md, tip_fm, internal_fm, tax_columns = matched
     self.assertEqual(out_table, self.table)
     assert_frame_equal(out_sample_md, self.sample_metadata)
     # The tip metadata should describe "e" and nothing else...
     expected_tip_fm = self.split_tax_fm.loc[["e"]]
     assert_frame_equal(tip_fm, expected_tip_fm)
     # ... while the internal node metadata should have no rows at all.
     self.assertEqual(len(internal_fm.index), 0)
     # Both DataFrames should still have the same columns. (This probably
     # doesn't matter much, because empty internal node metadata ends up
     # as an empty dict/JSON object ({}) in the generated HTML -- but it
     # is cheap to check.)
     self.assertListEqual(list(tip_fm.columns), self.exp_split_fm_cols)
     self.assertListEqual(list(internal_fm.columns), self.exp_split_fm_cols)
     # Finally, validate the split-up taxonomy columns
     assert_taxcols_ok(tax_columns)
Esempio n. 11
0
    def test_table_is_superset_override_raises(self):
        """Check that filter_extra_samples=True drops the extra table sample.

        The table has a Sample5 in addition to Sample1-Sample4; the table
        returned by match_inputs() should equal the input table with
        Sample5 filtered out.
        """
        counts = np.array([
            [1, 2, 0, 4],
            [8, 7, 0, 5],
            [1, 0, 0, 0],
            [1, 0, 0, 0],
            [1, 0, 4, 0],
        ]).T
        sample_ids = ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5']
        table = biom.Table(counts, list('abed'), sample_ids)

        matched = tools.match_inputs(
            self.bp_tree,
            table,
            self.sample_metadata,
            ordination=self.ordination,
            filter_extra_samples=True,
        )
        out_table, out_sample_md, tip_fm, internal_fm, tax_columns = matched

        # NOTE: 'e' is empty at this point, but it is not removed here; that
        # happens later on, in remove_empty_samples_and_features().
        kept_samples = set(table.ids()) - {'Sample5'}
        expected_table = table.filter(kept_samples, inplace=False)

        self.assertEqual(out_table, expected_table)
        # index by the expected table's IDs to guarantee the same
        # sample-wise order
        assert_frame_equal(
            out_sample_md.loc[expected_table.ids()], self.sample_metadata
        )

        # No feature metadata went in, so none should come out
        self.assertIsNone(tip_fm)
        self.assertIsNone(internal_fm)
        self.assertEqual(tax_columns, [])
Esempio n. 12
0
 def test_match_inputs_filter_missing_features_override(self):
     """Checks that --p-filter-missing-features works as expected."""
     # The table's feature IDs are a, b, e and g; the expected warning
     # reports one feature as missing from the tree's tips.
     bad_table = biom.Table(
         self.table.matrix_data, list('abeg'), self.table.ids()
     )
     # Raw strings are needed for the pieces containing parentheses, since
     # unescaped parentheses would mess up the regex.
     expected_warning = (
         r"1 feature\(s\) in the table were not present as tips in "
         r"the tree. These feature\(s\) have been removed from the "
         "visualization."
     )
     with self.assertWarnsRegex(tools.DataMatchingWarning, expected_warning):
         matched = tools.match_inputs(
             self.bp_tree,
             bad_table,
             self.sample_metadata,
             filter_missing_features=True,
         )
         out_table, out_sm, tip_fm, internal_fm, tax_columns = matched
     kept_features = ["a", "b", "e"]
     self.assertCountEqual(out_table.ids(axis='observation'), kept_features)
     # Double-check that the remainder of the table is intact
     expected_table = self.table.filter(
         {'a', 'b', 'e'}, axis='observation', inplace=False
     )
     self.assertEqual(out_table, expected_table)
     # ... and that the sample metadata is unchanged
     assert_frame_equal(out_sm, self.sample_metadata)
     # No feature metadata went in, so none should come out
     self.assertIsNone(tip_fm)
     self.assertIsNone(internal_fm)
     self.assertEqual(tax_columns, [])
Esempio n. 13
0
    def test_match_inputs_feature_metadata_duplicate_name_internal_node(self):
        """Check that metadata for duplicate-named internal nodes is kept.

           The JS interface offers two ways of coloring nodes by feature
           metadata: 1) coloring only tips (propagating clades with uniform
           feature metadata upwards), or 2) coloring every node that has
           feature metadata, internal nodes included. With 2), internal
           nodes sharing a name get the same feature metadata color.
        """
        # A variant of self.tree in which the internal node names i and g
        # each occur twice
        dup_tree = parse_newick('(((a:1,e:2)i:1,b:2)g:1,(:1,d:3)g:2)i:1;')
        fm = self.feature_metadata.copy()
        fm.index = ["a", "g", "i"]
        matched = tools.match_inputs(
            dup_tree, self.table, self.sample_metadata, fm
        )
        out_table, out_sample_md, tip_fm, internal_fm = matched
        assert_frame_equal(out_table, self.table)
        assert_frame_equal(out_sample_md, self.sample_metadata)

        expected_fm = split_taxonomy(fm)
        # The point of this test: every feature metadata row should survive,
        # despite g and i both being duplicated node names.
        assert_frame_equal(tip_fm, expected_fm.loc[["a"]])
        assert_frame_equal(
            internal_fm, expected_fm.loc[["g", "i"]], check_like=True
        )
Esempio n. 14
0
 def test_match_inputs_filter_missing_features_override(self):
     """Checks that --p-filter-missing-features works as expected."""
     # The table's feature IDs are a, b, e and g; the expected warning
     # reports one feature as missing from the tree's tips.
     mismatched_table = self.table.copy()
     mismatched_table.index = ["a", "b", "e", "g"]
     # Raw strings are needed for the pieces containing parentheses, since
     # unescaped parentheses would mess up the regex.
     expected_warning = (
         r"1 feature\(s\) in the table were not present as tips in "
         r"the tree. These feature\(s\) have been removed from the "
         "visualization."
     )
     with self.assertWarnsRegex(tools.DataMatchingWarning, expected_warning):
         matched = tools.match_inputs(
             self.bp_tree,
             mismatched_table,
             self.sample_metadata,
             filter_missing_features=True,
         )
         out_table, out_sm, tip_fm, internal_fm = matched
     kept_features = ["a", "b", "e"]
     self.assertCountEqual(out_table.index, kept_features)
     # Double-check that the remainder of the table is intact
     expected_table = self.table.loc[["a", "b", "e"]]
     assert_frame_equal(out_table, expected_table, check_like=True)
     # ... and that the sample metadata is unchanged
     assert_frame_equal(out_sm, self.sample_metadata)
Esempio n. 15
0
 def test_match_inputs_nothing_dropped(self):
     """Check that the unmodified fixtures come back from match_inputs()."""
     matched = tools.match_inputs(
         self.bp_tree, self.table, self.sample_metadata
     )
     out_table, out_sample_md, tip_fm, internal_fm = matched
     self.assertEqual(out_table, self.table)
     assert_frame_equal(out_sample_md, self.sample_metadata)
     # No feature metadata went in, so none should come out
     self.assertIsNone(tip_fm)
     self.assertIsNone(internal_fm)
Esempio n. 16
0
 def test_match_inputs_no_shared_samples(self):
     """Check the error raised when table and metadata share no samples.

     The same DataMatchingError is expected both without and with
     ignore_missing_samples=True.
     """
     expected_msg = (
         "No samples in the feature table are present in the sample "
         "metadata."
     )
     bad_sample_metadata = self.sample_metadata.copy()
     bad_sample_metadata.index = ["lol", "nothing", "here", "matches"]
     # The second set of keyword arguments checks that
     # --p-ignore-missing-samples can't override this error, because there
     # are NO matching samples at all
     for extra_kwargs in ({}, {"ignore_missing_samples": True}):
         with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
             tools.match_inputs(
                 self.bp_tree,
                 self.table,
                 bad_sample_metadata,
                 **extra_kwargs
             )
Esempio n. 17
0
 def test_match_inputs_no_tips_in_table(self):
     """Check the error raised when no table feature is a tip of the tree.

     The same DataMatchingError is expected both without and with
     filter_missing_features=True.
     """
     expected_msg = (
         "No features in the feature table are present as tips in the tree."
     )
     n_features = len(self.table.index)
     mismatched_table = self.table.copy()
     # Integer positions replace all of the feature IDs
     mismatched_table.index = range(n_features)
     # The second set of keyword arguments checks that
     # --p-filter-missing-features can't override this error, because
     # there are NO matching features at all
     for extra_kwargs in ({}, {"filter_missing_features": True}):
         with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
             tools.match_inputs(
                 self.bp_tree,
                 mismatched_table,
                 self.sample_metadata,
                 **extra_kwargs
             )
Esempio n. 18
0
 def test_match_inputs_only_1_feature_in_table(self):
     """Check that a table with a single feature ("a") passes matching."""
     # A lone feature is technically fine, provided it is a tip of the tree
     single_feature_table = self.table.loc[["a"]]
     matched = tools.match_inputs(
         self.bp_tree, single_feature_table, self.sample_metadata
     )
     out_table, out_sample_md, tip_fm, internal_fm = matched
     assert_frame_equal(out_table, single_feature_table)
     assert_frame_equal(out_sample_md, self.sample_metadata)
     self.assertIsNone(tip_fm)
     self.assertIsNone(internal_fm)
Esempio n. 19
0
 def test_match_inputs_only_1_feature_in_table(self):
     """Check that a table with a single feature ("a") passes matching."""
     # A lone feature is technically fine, provided it is a tip of the tree
     single_feature_table = self.table.filter(
         {"a"}, axis='observation', inplace=False
     )
     matched = tools.match_inputs(
         self.bp_tree, single_feature_table, self.sample_metadata
     )
     out_table, out_sample_md, tip_fm, internal_fm = matched
     self.assertEqual(out_table, single_feature_table)
     assert_frame_equal(out_sample_md, self.sample_metadata)
     self.assertIsNone(tip_fm)
     self.assertIsNone(internal_fm)
Esempio n. 20
0
 def test_match_inputs_no_tips_in_table(self):
     """Check the error raised when no table feature is a tip of the tree.

     The same DataMatchingError is expected both without and with
     filter_missing_features=True.
     """
     expected_msg = (
         "No features in the feature table are present as tips in the tree."
     )
     bad_table = self.table.copy()
     # Map every observation ID to its integer position
     current_ids = bad_table.ids(axis='observation')
     id_map = {old_id: position for position, old_id in enumerate(current_ids)}
     bad_table.update_ids(id_map, axis='observation', inplace=True)
     # The second set of keyword arguments checks that
     # --p-filter-missing-features can't override this error, because
     # there are NO matching features at all
     for extra_kwargs in ({}, {"filter_missing_features": True}):
         with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
             tools.match_inputs(
                 self.bp_tree,
                 bad_table,
                 self.sample_metadata,
                 **extra_kwargs
             )
Esempio n. 21
0
    def _validate_and_match_data(self, ignore_missing_samples,
                                 filter_extra_samples,
                                 filter_missing_features,
                                 shear_to_table,
                                 shear_to_feature_metadata):
        """Match the input data to the tree, optionally shear, and validate.

        Parameters
        ----------
        ignore_missing_samples, filter_extra_samples, filter_missing_features
            Forwarded to match_inputs(); only used for community plots.
        shear_to_table
            For community plots: if truthy, shear the tree to the features
            left in the table and filter the feature metadata to the
            sheared tree.
        shear_to_feature_metadata
            For non-community plots: if truthy, shear the tree to the
            features named in the feature metadata index.

        Raises
        ------
        ValueError
            If shearing to the feature metadata is requested but no tip of
            the tree is present in the feature metadata.
        """
        if not self.is_community_plot:
            if shear_to_feature_metadata:
                md_features = set(self.features.index)
                tree_tips = set(bp_tree_tips(self.tree))
                # the feature metadata must name at least 1 tip
                if md_features.isdisjoint(tree_tips):
                    raise ValueError(
                        "Cannot shear tree to feature metadata: no tips in "
                        "the tree are present in the feature metadata."
                    )
                self.tree = self.tree.shear(md_features)
            matched_md = match_tree_and_feature_metadata(
                self.tree, self.features
            )
            self.tip_md, self.int_md = matched_md
        else:
            matched = match_inputs(
                self.tree, self.table, self.samples, self.features,
                self.ordination, ignore_missing_samples, filter_extra_samples,
                filter_missing_features
            )
            self.table, self.samples, self.tip_md, self.int_md = matched
            # Drop empty samples and features from the table, and drop the
            # removed samples from the sample metadata too. The ordination
            # (if present) is passed along so that an error can be thrown if
            # it actually contains these empty samples/features.
            #
            # This removal is deliberately done *after* matching (so the
            # data inputs are known to match up) and *before* shearing (so
            # empty features in the table are no longer included as tips in
            # the tree).
            nonempty = remove_empty_samples_and_features(
                self.table, self.samples, self.ordination
            )
            self.table, self.samples = nonempty
            # shear the tree: remove unobserved features from the phylogeny
            if shear_to_table:
                observed = set(self.table.ids(axis='observation'))
                self.tree = self.tree.shear(observed)
                # Feature metadata entries that were shorn off of the tree
                # need to be removed as well. (Technically tip_md and int_md
                # should always both be None or both be DataFrames --
                # there's no in-between.)
                if not (self.tip_md is None and self.int_md is None):
                    filtered_md = filter_feature_metadata_to_tree(
                        self.tip_md, self.int_md, self.tree
                    )
                    self.tip_md, self.int_md = filtered_md

        validate_tree(self.tree)
Esempio n. 22
0
    def test_match_inputs_nothing_dropped_with_ordination(self):
        """Check that passing a matching ordination leaves inputs unchanged."""
        # the ordination matches the feature table 1:1, so nothing should
        # change
        matched = tools.match_inputs(
            self.bp_tree,
            self.table,
            self.sample_metadata,
            ordination=self.ordination,
        )
        out_table, out_sample_md, tip_fm, internal_fm = matched

        self.assertEqual(out_table, self.table)
        assert_frame_equal(out_sample_md, self.sample_metadata)
        # No feature metadata went in, so none should come out
        self.assertIsNone(tip_fm)
        self.assertIsNone(internal_fm)
Esempio n. 23
0
    def test_match_inputs_ignore_missing_samples_override(self):
        """Checks that --p-ignore-missing-samples works as expected."""
        all_samples = ["Sample1", "Sample2", "Whatever", "Sample4"]
        known_samples = ["Sample1", "Sample2", "Sample4"]
        bad_table = self.table.copy()
        # "Whatever" is a junk sample ID replacing one of the table's IDs
        bad_table.columns = ["Sample1", "Sample2", "Whatever", "Sample4"]
        expected_warning = (
            r"1 sample\(s\) in the table were not present in the sample "
            r"metadata. These sample\(s\) have been assigned placeholder "
            "metadata."
        )
        with self.assertWarnsRegex(
            tools.DataMatchingWarning, expected_warning
        ):
            matched = tools.match_inputs(
                self.bp_tree,
                bad_table,
                self.sample_metadata,
                ignore_missing_samples=True,
            )
            out_table, out_sm, tip_fm, internal_fm = matched

        self.assertCountEqual(out_table.columns, all_samples)
        self.assertCountEqual(out_sm.index, all_samples)
        # The table itself should be unchanged
        assert_frame_equal(out_table, bad_table)
        # ...and the "Whatever" sample should have received the placeholder
        # metadata in every column
        placeholder_row = out_sm.loc["Whatever"]
        self.assertTrue(
            (placeholder_row == "This sample has no metadata").all()
        )
        # ...and, apart from the new placeholder row, the sample metadata
        # should be unchanged as well. (Adding placeholder metadata can
        # change the dtypes of individual columns, because the "This sample
        # has no metadata" value is just a string -- hence check_dtype=False.)
        # (...That in turn shouldn't affect Empress, which stores all sample
        # metadata as strings. At least as of writing this.)
        assert_frame_equal(
            out_sm.loc[known_samples],
            self.sample_metadata.loc[known_samples],
            check_dtype=False,
        )
Esempio n. 24
0
    def test_match_inputs_feature_metadata_root_metadata_allowed(self):
        """Tests that feature metadata for the root node is preserved."""
        # A variant of self.tree in which the root is named (i)
        named_root_tree = parse_newick(
            '(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2)i:1;'
        )
        fm = self.feature_metadata.copy()
        fm.index = ["a", "g", "i"]
        matched = tools.match_inputs(
            named_root_tree, self.table, self.sample_metadata, fm
        )
        out_table, out_sample_md, tip_fm, internal_fm = matched
        # (sanity check: the table / sample metadata matching should not
        # have been affected)
        self.assertEqual(out_table, self.table)
        assert_frame_equal(out_sample_md, self.sample_metadata)

        expected_fm = split_taxonomy(fm)
        # The point of this test: every feature metadata row should survive,
        # because a, g, and i are all in the tree (i matters most here, since
        # it is the root)
        assert_frame_equal(tip_fm, expected_fm.loc[["a"]])
        assert_frame_equal(
            internal_fm, expected_fm.loc[["g", "i"]], check_like=True
        )
Esempio n. 25
0
    def test_match_inputs_feature_metadata_only_internal_node_metadata(self):
        """Tests that feature metadata only for internal nodes is allowed."""
        # A variant of self.tree in which the root is named (i)
        named_root_tree = parse_newick(
            '(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2)i:1;'
        )
        fm = self.feature_metadata.copy()
        fm.index = ["h", "g", "i"]
        matched = tools.match_inputs(
            named_root_tree, self.table, self.sample_metadata, fm
        )
        out_table, out_sample_md, tip_fm, internal_fm = matched
        self.assertEqual(out_table, self.table)
        assert_frame_equal(out_sample_md, self.sample_metadata)

        expected_fm = split_taxonomy(fm)
        # 1) The tip metadata should have no rows
        self.assertEqual(len(tip_fm.index), 0)
        # 2) The internal node metadata should be fully preserved
        assert_frame_equal(
            internal_fm, expected_fm.loc[fm.index], check_like=True
        )
        # 3) Both DataFrames should have identical columns
        self.assertListEqual(list(tip_fm.columns), self.exp_split_fm_cols)
        self.assertListEqual(list(internal_fm.columns), self.exp_split_fm_cols)
Esempio n. 26
0
    def _validate_and_match_data(self, ignore_missing_samples,
                                 filter_missing_features,
                                 filter_unobserved_features_from_phylogeny):
        """Prepare the tree and match it against the table and metadata.

        Parameters
        ----------
        ignore_missing_samples, filter_missing_features
            Forwarded to match_inputs().
        filter_unobserved_features_from_phylogeny
            If truthy, the tree is first sheared to the set of the table's
            column labels.
        """
        # optionally remove unobserved features from the phylogeny
        if filter_unobserved_features_from_phylogeny:
            observed_features = set(self.table.columns)
            self.tree = self.tree.shear(observed_features)

        # extract the balanced parentheses before the tree is converted
        self._bp_tree = list(self.tree.B)

        skbio_tree = to_skbio_treenode(self.tree)
        self.tree = Tree.from_tree(skbio_tree)
        fill_missing_node_names(self.tree)

        # The feature table that QIIME 2 hands us has sample IDs as its index
        # and feature IDs as its columns. It is transposed before being given
        # to tools.match_inputs(), and the transposed table is what the rest
        # of this visualizer keeps using.
        matched = match_inputs(
            self.tree, self.table.T, self.samples, self.features,
            ignore_missing_samples, filter_missing_features
        )
        self.table, self.samples, self.tip_md, self.int_md = matched