Ejemplo n.º 1
0
 def test_split_taxonomy_invalid_level_column(self):
     """split_taxonomy() should refuse to run when a "Level" column is
        already present alongside a taxonomy column.
     """
     broken_md = self.feature_metadata.copy()
     broken_md.columns = ["Taxonomy", "Level 20"]
     # Raw strings keep the literal parentheses from being treated as
     # regex groups by assertRaisesRegex
     expected_msg = (
         "The feature metadata contains a taxonomy column, but also "
         r"already contains column\(s\) starting with the text 'Level' "
         r"\(case insensitive\)."
     )
     with self.assertRaisesRegex(tax_utils.TaxonomyError, expected_msg):
         tax_utils.split_taxonomy(broken_md)
Ejemplo n.º 2
0
 def test_split_taxonomy_multiple_tax_columns(self):
     """split_taxonomy() should reject feature metadata containing more
        than one taxonomy-like column.
     """
     broken_md = self.feature_metadata.copy()
     broken_md.columns = ["Taxonomy", "taxon"]
     # As with above, parentheses mess up regexes -- raw strings fix that
     expected_msg = (
         "Multiple columns in the feature metadata have one of the "
         r"following names \(case insensitive\): "
         r"\('taxon', 'taxonomy'\). At most one feature metadata "
         "column can have a name from that list."
     )
     with self.assertRaisesRegex(tax_utils.TaxonomyError, expected_msg):
         tax_utils.split_taxonomy(broken_md)
Ejemplo n.º 3
0
 def test_split_taxonomy_SILVA_annotation(self):
     """Tests that a particular taxonomy annotation that has caused
        errors with QIIME 2 in the past is split properly.
     """
     # Note the trailing space on the last level -- it should be stripped
     silva_tax = (
         "D_0__Bacteria;D_1__Gemmatimonadetes;"
         "D_2__Gemmatimonadetes;D_3__Gemmatimonadales;"
         "D_4__Gemmatimonadaceae;D_5__Gemmatimonas;"
         "D_6__uncultured bacterium "
     )
     fm = pd.DataFrame({"Taxonomy": [silva_tax]}, index=["f0"])
     split_fm, taxcols = tax_utils.split_taxonomy(fm)
     expected_levels = [
         "D_0__Bacteria",
         "D_1__Gemmatimonadetes",
         "D_2__Gemmatimonadetes",
         "D_3__Gemmatimonadales",
         "D_4__Gemmatimonadaceae",
         "D_5__Gemmatimonas",
         "D_6__uncultured bacterium",
     ]
     exp_series = pd.Series(
         {
             "Level {}".format(i): lvl
             for i, lvl in enumerate(expected_levels, 1)
         },
         name="f0")
     assert_series_equal(split_fm.loc["f0"], exp_series)
     assert_taxcols_ok(taxcols)
Ejemplo n.º 4
0
 def test_split_taxonomy_leading_trailing_whitespace(self):
     """Tests that taxonomy strings with leading/trailing whitespace are
        handled as expected (i.e. this whitespace is stripped).
     """
     # Surround the taxonomy info with assorted whitespace (for f3, also
     # around each level). The underlying level information matches the
     # basic test case, so the split result should be unchanged.
     ws_fm = self.feature_metadata.copy()
     ws_fm.loc["f1", "Taxonomy"] = ws_fm.loc["f1", "Taxonomy"] + "  "
     ws_fm.loc["f2", "Taxonomy"] = " " + ws_fm.loc["f2", "Taxonomy"]
     f3_spaced = ws_fm.loc["f3", "Taxonomy"].replace(";", " ;")
     ws_fm.loc["f3", "Taxonomy"] = "     " + f3_spaced + " "
     # Tab characters really shouldn't show up in practice, since QIIME 2's
     # taxonomy format stores its data as a TSV file (and a tab inside a
     # taxonomy annotation sounds pretty sketchy anyway). Still, if for
     # example QIIME 2 ever switches to CSV, leading/trailing tabs should
     # count as strippable whitespace -- so we test that here.
     ws_fm.loc["f4", "Taxonomy"] = (
         " \t " + ws_fm.loc["f4", "Taxonomy"] + "\t"
     )
     split_fm = tax_utils.split_taxonomy(ws_fm)
     self.check_basic_case_worked(split_fm)
Ejemplo n.º 5
0
    def test_match_inputs_feature_metadata_duplicate_name_internal_node(self):
        """Tests that feature metadata for internal nodes with duplicate names
           is preserved.

           In the JS interface, there are two options for coloring nodes by
           feature metadata: 1) just coloring tips (and propagating
           clades with uniform feature metadata upwards), or 2) coloring all
           nodes with feature metadata, which can include internal nodes. In
           2), internal nodes with the same name will have the same feature
           metadata color.
        """
        # Like self.tree, but with "i" and "g" each reused as a node name
        dup_tree = parse_newick('(((a:1,e:2)i:1,b:2)g:1,(:1,d:3)g:2)i:1;')
        dup_fm = self.feature_metadata.copy()
        dup_fm.index = ["a", "g", "i"]
        out_table, out_sm, tip_fm, int_fm = tools.match_inputs(
            dup_tree, self.table, self.sample_metadata, dup_fm
        )
        assert_frame_equal(out_table, self.table)
        assert_frame_equal(out_sm, self.sample_metadata)

        exp_fm = split_taxonomy(dup_fm)
        # The key check: every feature metadata row was kept, even though g
        # and i are both duplicated node names in the tree.
        assert_frame_equal(tip_fm, exp_fm.loc[["a"]])
        assert_frame_equal(int_fm, exp_fm.loc[["g", "i"]], check_like=True)
Ejemplo n.º 6
0
    def setUp(self):
        self.tree = self.mock_tree_from_nwk()
        self.bp_tree = from_skbio_treenode(self.tree)
        # Counts oriented feature-by-sample (rows: a, b, e, d), so no
        # transpose is needed before handing the matrix to biom.Table
        counts = np.array([
            [1, 8, 1, 0],
            [2, 7, 0, 0],
            [0, 0, 0, 0],
            [4, 5, 0, 0],
        ])
        sample_ids = ['Sample1', 'Sample2', 'Sample3', 'Sample4']
        self.table = biom.Table(counts, list('abed'), sample_ids)
        self.sample_metadata = pd.DataFrame(
            {
                "Metadata1": [0, 0, 0, 1],
                "Metadata2": [0, 0, 0, 0],
                "Metadata3": [1, 2, 3, 4],
                "Metadata4": ["abc", "def", "ghi", "jkl"]
            },
            index=list(self.table.ids()))

        # (These are some Greengenes taxonomy annotations I took from the
        # moving pictures taxonomy.qza file. I made up the confidences.)
        gg_taxa = [
            ("k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
             "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__"),
            ("k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; "
             "o__Pasteurellales; f__Pasteurellaceae; g__; s__"),
            ("k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
             "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; "
             "s__uniformis"),
        ]
        self.feature_metadata = pd.DataFrame(
            {"Taxonomy": gg_taxa, "Confidence": [0.95, 0.8, 0]},
            index=["e", "h", "a"])
        self.split_tax_fm, self.taxcols = split_taxonomy(self.feature_metadata)
        self.tip_md = self.split_tax_fm.loc[["a", "e"]]
        self.int_md = self.split_tax_fm.loc[["h"]]
        # This is designed to match the shearing that's done in the core test
        # for --p-shear-to-table
        self.shorn_tree = parse_newick(
            "(((a:1)EmpressNode0:1,b:2)g:1,(d:3)h:2)EmpressNode1:1;")
        self.exp_split_fm_cols = (
            ["Level {}".format(i) for i in range(1, 8)] + ["Confidence"]
        )

        # Minimal PCoA result for Empire-plot-style tests
        pc_ids = ['PC1', 'PC2', 'PC3']
        eigvals = pd.Series([0.50, 0.25, 0.25], index=pc_ids)
        proportion_explained = pd.Series([15.5, 12.2, 8.8], index=pc_ids)
        samples_df = pd.DataFrame(
            [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5],
             [0.4, 0.5, 0.6]],
            index=sample_ids,
            columns=pc_ids)
        self.ordination = OrdinationResults(
            'PCoA',
            'Principal Coordinate Analysis',
            eigvals,
            samples_df,
            proportion_explained=proportion_explained)
Ejemplo n.º 7
0
    def test_split_taxonomy_all_rows_no_semicolons(self):
        """When no taxonomy value contains a semicolon, split_taxonomy()
           should warn and produce a single "Level 1" column.
        """
        nosemi_fm = self.feature_metadata.copy()
        domains = {"f1": "Bacteria", "f2": "Archaea", "f3": "Bacteria",
                   "f4": "Viruses"}
        for fid, domain in domains.items():
            nosemi_fm.loc[fid, "Taxonomy"] = domain
        # Raw strings keep the parentheses literal in the regex match
        exp_warning = (
            "None of the taxonomy values in the feature metadata "
            r"contain a semicolon \(;\). Please make sure your taxonomy "
            'is formatted so that "levels" are separated by semicolons.'
        )
        with self.assertWarnsRegex(tax_utils.TaxonomyWarning, exp_warning):
            split_fm = tax_utils.split_taxonomy(nosemi_fm)

        self.assertCountEqual(split_fm.columns, ["Level 1", "Confidence"])
        # Each row should keep its domain as the sole level, alongside its
        # original confidence value
        exp_confs = {"f1": 0.95, "f2": 0.8, "f3": 0, "f4": 1}
        for fid, domain in domains.items():
            assert_series_equal(
                split_fm.loc[fid],
                pd.Series(
                    {"Level 1": domain, "Confidence": exp_confs[fid]},
                    name=fid)
            )
Ejemplo n.º 8
0
def match_tree_and_feature_metadata(bp_tree, feature_metadata=None):
    """Splits feature metadata between the tips and internal nodes of a tree.

    NOTE: This function calls bp_tree_tips() on bp_tree. If this winds up
    being a bottleneck, an extra optional parameter could be added so that
    match_inputs() can pass in already-computed tip names here and avoid
    doing that work twice.

    Parameters
    ----------
    bp_tree: bp.BP
        The tree to be visualized.
    feature_metadata: pd.DataFrame or None
        Feature metadata. If this is passed, the index should describe node
        names in the tree and the columns should describe different feature
        metadata fields' names.

    Returns
    -------
    (tip_metadata, int_metadata): (pd.DataFrame or None, pd.DataFrame or None)
        Both values are None if feature metadata was not passed. Otherwise,
        tip_metadata contains the rows of the feature metadata whose names
        match tips in the tree, and int_metadata contains the rows whose
        names match internal node(s) in the tree. Note that
        taxonomy_utils.split_taxonomy() is applied to the feature metadata
        before it is divided between tip and internal node feature metadata.

    Raises
    ------
    DataMatchingError
        If feature_metadata is not None, but none of the names in its index
        correspond to any nodes in the tree.
    """
    # Guard clause: without feature metadata there is nothing to match
    if feature_metadata is None:
        return None, None

    # Split up taxonomy column, if present in the feature metadata
    split_fm = taxonomy_utils.split_taxonomy(feature_metadata)
    feature_ids = split_fm.index

    # Subset to rows whose names match tips in the tree
    tip_metadata = split_fm.loc[
        feature_ids.intersection(bp_tree_tips(bp_tree))
    ]

    # Subset to rows whose names match internal nodes in the tree
    int_metadata = split_fm.loc[
        feature_ids.intersection(set(bp_tree_non_tips(bp_tree)))
    ]

    if len(tip_metadata.index) == 0 and len(int_metadata.index) == 0:
        # Error condition 5 in match_inputs_community_plot()
        raise DataMatchingError(
            "No features in the feature metadata are present in the tree, "
            "either as tips or as internal nodes."
        )
    return tip_metadata, int_metadata
Ejemplo n.º 9
0
    def test_split_taxonomy_rows_with_no_semicolons(self):
        """When only SOME taxonomy values lack semicolons, those rows should
           be padded with "Unspecified" levels (and no warning raised).
        """
        mixed_fm = self.feature_metadata.copy()
        mixed_fm.loc["f1", "Taxonomy"] = "birds aren't real"
        mixed_fm.loc["f2", "Taxonomy"] = "theyve been drones"
        mixed_fm.loc["f3", "Taxonomy"] = "all along :O"
        split_fm, taxcols = tax_utils.split_taxonomy(mixed_fm)

        # Notice that f4's taxonomy is still there -- so each feature will
        # have 3 levels
        self.assertCountEqual(split_fm.columns,
                              ["Level 1", "Level 2", "Level 3", "Confidence"])

        assert_taxcols_ok(taxcols, exp_num_levels=3)

        # Check each row individually; semicolon-less rows get "Unspecified"
        # filler for levels 2 and 3
        exp_rows = {
            "f1": ["birds aren't real", "Unspecified", "Unspecified", 0.95],
            "f2": ["theyve been drones", "Unspecified", "Unspecified", 0.8],
            "f3": ["all along :O", "Unspecified", "Unspecified", 0],
            "f4": ["k__Bacteria", "p__Firmicutes", "c__Bacilli", 1],
        }
        for fid, (lv1, lv2, lv3, conf) in exp_rows.items():
            assert_series_equal(
                split_fm.loc[fid],
                pd.Series(
                    {
                        "Level 1": lv1,
                        "Level 2": lv2,
                        "Level 3": lv3,
                        "Confidence": conf
                    },
                    name=fid))
Ejemplo n.º 10
0
    def test_split_taxonomy_basic_case(self):
        """Verifies the straightforward success path of split_taxonomy()."""
        fm_before = self.feature_metadata.copy()
        split_fm, taxcols = tax_utils.split_taxonomy(fm_before)

        # The input DataFrame must be left completely untouched
        assert_frame_equal(self.feature_metadata, fm_before)

        # Delegate the detailed output checks to a shared utility method
        # (we'll reuse this code a bit later)
        self._check_basic_case_worked(split_fm, taxcols)
Ejemplo n.º 11
0
 def setUp(self):
     self.tree = self.mock_tree_from_nwk()
     self.bp_tree = from_skbio_treenode(self.tree)
     # Test table/metadata (mostly) adapted from Qurro:
     # https://github.com/biocore/qurro/blob/b9613534b2125c2e7ee22e79fdff311812f4fefe/qurro/tests/test_df_utils.py#L178
     sample_ids = ["Sample1", "Sample2", "Sample3", "Sample4"]
     # Built row-wise here (rows: features a, b, e, d) rather than as a
     # column dict -- same resulting DataFrame
     self.table = pd.DataFrame(
         [[1, 8, 1, 0],
          [2, 7, 0, 0],
          [0, 0, 0, 0],
          [4, 5, 0, 0]],
         index=["a", "b", "e", "d"],
         columns=sample_ids
     )
     self.sample_metadata = pd.DataFrame(
         {
             "Metadata1": [0, 0, 0, 1],
             "Metadata2": [0, 0, 0, 0],
             "Metadata3": [1, 2, 3, 4],
             "Metadata4": ["abc", "def", "ghi", "jkl"]
         },
         index=sample_ids
     )
     # (These are some Greengenes taxonomy annotations I took from the
     # moving pictures taxonomy.qza file. I made up the confidences.)
     gg_taxa = [
         (
             "k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
             "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__"
         ),
         (
             "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; "
             "o__Pasteurellales; f__Pasteurellaceae; g__; s__"
         ),
         (
             "k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
             "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; "
             "s__uniformis"
         )
     ]
     self.feature_metadata = pd.DataFrame(
         {"Taxonomy": gg_taxa, "Confidence": [0.95, 0.8, 0]},
         index=["e", "h", "a"]
     )
     self.split_tax_fm = split_taxonomy(self.feature_metadata)
     self.exp_split_fm_cols = (
         ["Level {}".format(i) for i in range(1, 8)] + ["Confidence"]
     )
Ejemplo n.º 12
0
    def test_match_inputs_feature_metadata_only_internal_node_metadata(self):
        """Tests that feature metadata only for internal nodes is allowed."""
        # Slightly modified version of self.tree where root has a name (i)
        t = parse_newick('(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2)i:1;')
        fm = self.feature_metadata.copy()
        fm.index = ["h", "g", "i"]
        f_table, f_sample_metadata, t_fm, i_fm = tools.match_inputs(
            t, self.table, self.sample_metadata, fm)
        # Use assert_frame_equal rather than assertEqual here: == on
        # DataFrames is elementwise, so assertEqual's truthiness check on
        # the comparison result raises ValueError instead of comparing.
        # (This also matches how the other match_inputs tests compare DFs.)
        assert_frame_equal(f_table, self.table)
        assert_frame_equal(f_sample_metadata, self.sample_metadata)

        split_fm = split_taxonomy(fm)
        # 1) Check that tip metadata is empty
        self.assertEqual(len(t_fm.index), 0)
        # 2) Check that internal node metadata was preserved
        assert_frame_equal(i_fm, split_fm.loc[fm.index], check_like=True)
        # 3) Check that columns on both DFs are identical
        self.assertListEqual(list(t_fm.columns), self.exp_split_fm_cols)
        self.assertListEqual(list(i_fm.columns), self.exp_split_fm_cols)
Ejemplo n.º 13
0
    def test_match_inputs_feature_metadata_root_metadata_allowed(self):
        """Tests that feature metadata for the root node is preserved."""
        # Slightly modified version of self.tree where root has a name (i)
        t = parse_newick('(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2)i:1;')
        fm = self.feature_metadata.copy()
        fm.index = ["a", "g", "i"]
        f_table, f_sample_metadata, t_fm, i_fm = tools.match_inputs(
            t, self.table, self.sample_metadata, fm)
        # (check that we didn't mess up the table / sample metadata matching by
        # accident)
        # Use assert_frame_equal rather than assertEqual here: == on
        # DataFrames is elementwise, so assertEqual's truthiness check on
        # the comparison result raises ValueError instead of comparing.
        assert_frame_equal(f_table, self.table)
        assert_frame_equal(f_sample_metadata, self.sample_metadata)

        split_fm = split_taxonomy(fm)
        # Main point of this test: all of the feature metadata should have been
        # kept, since a, g, and i are all included in the tree (i in particular
        # is important to verify, since it's the root)
        assert_frame_equal(t_fm, split_fm.loc[["a"]])
        assert_frame_equal(i_fm, split_fm.loc[["g", "i"]], check_like=True)
Ejemplo n.º 14
0
 def test_split_taxonomy_level_column_but_no_taxonomy_column(self):
     """A pre-existing "Level" column is fine when there's no taxonomy
        column to split in the first place.
     """
     ambiv_fm = self.feature_metadata.copy()
     ambiv_fm.columns = ["I'm ambivalent!", "Level 20"]
     result_fm, taxcols = tax_utils.split_taxonomy(ambiv_fm)
     # Nothing to split, so the metadata passes through unchanged...
     assert_frame_equal(ambiv_fm, result_fm)
     # ...and no taxonomy columns are reported
     self.assertEqual(taxcols, [])
Ejemplo n.º 15
0
 def test_split_taxonomy_no_tax_column(self):
     """Metadata without any taxonomy-like column passes through as-is."""
     no_tax_fm = self.feature_metadata.copy()
     no_tax_fm.columns = ["asdf", "ghjkl"]
     result_fm, taxcols = tax_utils.split_taxonomy(no_tax_fm)
     assert_frame_equal(no_tax_fm, result_fm)
     self.assertEqual(taxcols, [])
Ejemplo n.º 16
0
def match_inputs(tree,
                 table,
                 sample_metadata,
                 feature_metadata=None,
                 ignore_missing_samples=False,
                 filter_missing_features=False):
    """Matches various input sources.

    Also "splits up" the feature metadata, first by calling
    taxonomy_utils.split_taxonomy() on it and then by splitting the resulting
    DataFrame into two separate DataFrames (one for tips and one for internal
    nodes).

    Parameters
    ----------

    tree: empress.tree.Tree
        The tree to be visualized.
    table: pd.DataFrame
        Representation of the feature table. The index should describe feature
        IDs; the columns should describe sample IDs. (It's expected that
        feature IDs in the table only describe tips in the tree, not internal
        nodes.)
    sample_metadata: pd.DataFrame
        Sample metadata. The index should describe sample IDs; the columns
        should describe different sample metadata fields' names.
    feature_metadata: pd.DataFrame or None
        Feature metadata. If this is passed, the index should describe feature
        IDs and the columns should describe different feature metadata fields'
        names. (Feature IDs here can describe tips or internal nodes in the
        tree.)
    ignore_missing_samples: bool
        If True, pads missing samples (i.e. samples in the table but not the
        metadata) with placeholder metadata. If False, raises a
        DataMatchingError if any such samples exist. (Note that in either case,
        samples in the metadata but not in the table are filtered out; and if
        no samples are shared between the table and metadata, a
        DataMatchingError is raised regardless.) This is analogous to the
        ignore_missing_samples flag in Emperor.
    filter_missing_features: bool
        If True, filters features from the table that aren't present as tips in
        the tree. If False, raises a DataMatchingError if any such features
        exist. (Note that in either case, features in the tree but not in the
        table are preserved.)

    Returns
    -------
    (table, sample_metadata, tip_metadata, int_metadata):
        (pd.DataFrame, pd.DataFrame, pd.DataFrame / None, pd.DataFrame / None)
        Versions of the input table, sample metadata, and feature metadata
        filtered such that:
            -The table only contains features also present as tips in the tree.
            -The sample metadata only contains samples also present in the
             table.
            -Samples present in the table but not in the sample metadata will
             have all of their sample metadata values set to "This sample has
             no metadata". (This will only be done if ignore_missing_samples is
             True; otherwise, this situation will trigger an error. See below.)
            -If feature metadata was not passed, tip_metadata and int_metadata
             will both be None. Otherwise, tip_metadata will contain the
             entries of the feature metadata where the feature name was present
             as a tip in the tree, and int_metadata will contain the entries
             of the feature metadata where the feature name was present as
             internal node(s) in the tree.
                -Also, for sanity's sake, this will call
                 taxonomy_utils.split_taxonomy() on the feature metadata before
                 splitting it up into tip and internal node metadata.

    Raises
    ------
    DataMatchingError
        If any of the following conditions are met:
            1. No features are shared between the tree's tips and table.
            2. There are features present in the table but not as tips in the
               tree, AND filter_missing_features is False.
            3. No samples are shared between the sample metadata and table.
            4. There are samples present in the table but not in the sample
               metadata, AND ignore_missing_samples is False.
            5. The feature metadata was passed, but no features present in it
               are also present as tips or internal nodes in the tree.

    References
    ----------
    This function was based on match_table_and_data() in Qurro's code:
    https://github.com/biocore/qurro/blob/b9613534b2125c2e7ee22e79fdff311812f4fefe/qurro/_df_utils.py#L255
    """
    # Match table and tree.
    # (Ignore None-named tips in the tree, which will be replaced later on
    # with "default" names like "EmpressNode0".)
    tip_names = {n.name for n in tree.tips() if n.name is not None}
    tree_and_table_features = table.index.intersection(tip_names)

    if len(tree_and_table_features) == 0:
        # Error condition 1
        raise DataMatchingError(
            "No features in the feature table are present as tips in the tree."
        )

    ff_table = table.copy()
    if len(tree_and_table_features) < len(table.index):
        if filter_missing_features:
            # Filter table to just features that are also present in the tree.
            #
            # Note that we *don't* filter the tree analogously, because it's ok
            # for the tree's nodes to be a superset of the table's features
            # (and this is going to be the case in most datasets where the
            # features correspond to tips, since internal nodes aren't
            # explicitly described in the feature table).
            ff_table = table.loc[tree_and_table_features]

            # Report to user about any dropped features from table.
            dropped_feature_ct = table.shape[0] - ff_table.shape[0]
            warnings.warn(
                ("{} feature(s) in the table were not present as tips in "
                 "the tree. These feature(s) have been removed from the "
                 "visualization.").format(dropped_feature_ct),
                DataMatchingWarning)
        else:
            # Error condition 2
            raise DataMatchingError(
                "The feature table contains features that aren't present as "
                "tips in the tree. You can override this error by using the "
                "--p-filter-missing-features flag.")

    # Match table (post-feature-filtering, if done) and sample metadata.
    table_samples = set(ff_table.columns)
    sm_samples = set(sample_metadata.index)
    sm_and_table_samples = sm_samples & table_samples

    if len(sm_and_table_samples) == 0:
        # Error condition 3
        raise DataMatchingError(
            "No samples in the feature table are present in the sample "
            "metadata.")

    if len(sm_and_table_samples) < len(ff_table.columns):
        if ignore_missing_samples:
            # Works similarly to how Emperor does this: see
            # https://github.com/biocore/emperor/blob/659b62a9f02a6423b6258c814d0e83dbfd05220e/emperor/core.py#L350
            samples_without_metadata = table_samples - sm_samples
            padded_metadata = pd.DataFrame(index=samples_without_metadata,
                                           columns=sample_metadata.columns,
                                           dtype=str)
            padded_metadata.fillna("This sample has no metadata", inplace=True)
            sample_metadata = pd.concat([sample_metadata, padded_metadata])
            # Report to user about samples we needed to "pad."
            warnings.warn(
                ("{} sample(s) in the table were not present in the "
                 "sample metadata. These sample(s) have been assigned "
                 "placeholder metadata.").format(
                     len(samples_without_metadata)), DataMatchingWarning)
        else:
            # Error condition 4
            raise DataMatchingError(
                "The feature table contains samples that aren't present in "
                "the sample metadata. You can override this error by using "
                "the --p-ignore-missing-samples flag.")

    # If we've made it this far, then there must be at least *one* sample
    # present in both the sample metadata and the table: and by this point the
    # metadata's samples should be a superset of the table's samples (since we
    # padded the metadata above if there were any samples that *weren't* in the
    # table).
    #
    # All that's left to do is to filter the sample metadata to just the
    # samples that are also present in the table.
    sf_sample_metadata = sample_metadata.loc[ff_table.columns]

    # If desired, we could report here to the user about any dropped samples
    # from the metadata by looking at the difference between
    # sample_metadata.shape[0] and sf_sample_metadata.shape[0]. However, the
    # presence of such "dropped samples" is a common occurrence in 16S studies,
    # so we currently don't do that for the sake of avoiding alarm fatigue.

    # If the feature metadata was passed, filter it so that it only contains
    # features present as tips / internal nodes in the tree
    tip_metadata = None
    int_metadata = None
    if feature_metadata is not None:
        # Split up taxonomy column, if present in the feature metadata
        ts_feature_metadata = taxonomy_utils.split_taxonomy(feature_metadata)
        fm_ids = ts_feature_metadata.index

        # Subset tip metadata
        fm_and_tip_features = fm_ids.intersection(tip_names)
        tip_metadata = ts_feature_metadata.loc[fm_and_tip_features]

        # Subset internal node metadata
        internal_node_names = {
            n.name for n in tree.non_tips(include_self=True)}
        fm_and_int_features = fm_ids.intersection(internal_node_names)
        int_metadata = ts_feature_metadata.loc[fm_and_int_features]

        if len(tip_metadata.index) == 0 and len(int_metadata.index) == 0:
            # Error condition 5
            raise DataMatchingError(
                "No features in the feature metadata are present in the tree, "
                "either as tips or as internal nodes.")

    return ff_table, sf_sample_metadata, tip_metadata, int_metadata