def test_split_taxonomy_invalid_level_column(self):
    """split_taxonomy() should reject input that already has Level columns."""
    bad_fm = self.feature_metadata.copy()
    bad_fm.columns = ["Taxonomy", "Level 20"]
    # Raw strings keep the literal parentheses from being treated as
    # regex groups by assertRaisesRegex
    exp_msg = (
        "The feature metadata contains a taxonomy column, but also "
        r"already contains column\(s\) starting with the text 'Level' "
        r"\(case insensitive\)."
    )
    with self.assertRaisesRegex(tax_utils.TaxonomyError, exp_msg):
        tax_utils.split_taxonomy(bad_fm)
def test_split_taxonomy_multiple_tax_columns(self):
    """Having two recognized taxonomy column names at once is an error."""
    bad_fm = self.feature_metadata.copy()
    bad_fm.columns = ["Taxonomy", "taxon"]
    # As with above, parentheses mess up regexes -- raw strings fix that
    exp_msg = (
        "Multiple columns in the feature metadata have one of the "
        r"following names \(case insensitive\): "
        r"\('taxon', 'taxonomy'\). At most one feature metadata "
        "column can have a name from that list."
    )
    with self.assertRaisesRegex(tax_utils.TaxonomyError, exp_msg):
        tax_utils.split_taxonomy(bad_fm)
def test_split_taxonomy_SILVA_annotation(self):
    """Checks that a SILVA annotation which previously caused errors with
    QIIME 2 is now split properly.

    Note the trailing space after "uncultured bacterium" in the input;
    the split levels should come out whitespace-stripped.
    """
    silva_annotation = (
        "D_0__Bacteria;D_1__Gemmatimonadetes;"
        "D_2__Gemmatimonadetes;D_3__Gemmatimonadales;"
        "D_4__Gemmatimonadaceae;D_5__Gemmatimonas;"
        "D_6__uncultured bacterium "
    )
    fm = pd.DataFrame({"Taxonomy": [silva_annotation]}, index=["f0"])
    split_fm, taxcols = tax_utils.split_taxonomy(fm)
    exp_levels = {
        "Level 1": "D_0__Bacteria",
        "Level 2": "D_1__Gemmatimonadetes",
        "Level 3": "D_2__Gemmatimonadetes",
        "Level 4": "D_3__Gemmatimonadales",
        "Level 5": "D_4__Gemmatimonadaceae",
        "Level 6": "D_5__Gemmatimonas",
        "Level 7": "D_6__uncultured bacterium",
    }
    assert_series_equal(split_fm.loc["f0"],
                        pd.Series(exp_levels, name="f0"))
    assert_taxcols_ok(taxcols)
def test_split_taxonomy_leading_trailing_whitespace(self):
    """Tests that leading/trailing whitespace around taxonomy strings is
    stripped when splitting.

    We pad the taxonomy info with assorted whitespace (for f3, around
    each individual level as well) without changing the underlying
    information -- so the resulting levels should match the basic test
    case exactly.
    """
    funky_fm = self.feature_metadata.copy()
    # Tab characters in a taxonomy annotation (f4 below) really should
    # never happen in practice, since QIIME 2's taxonomy format stores
    # its data as a TSV file -- but if e.g. QIIME 2 ever switches to
    # CSV, leading/trailing tabs should still count as ignorable
    # whitespace, so we test them anyway.
    padders = {
        "f1": lambda t: t + " ",
        "f2": lambda t: " " + t,
        "f3": lambda t: " " + t.replace(";", " ;") + " ",
        "f4": lambda t: " \t " + t + "\t",
    }
    for fid, pad in padders.items():
        funky_fm.loc[fid, "Taxonomy"] = pad(funky_fm.loc[fid, "Taxonomy"])
    split_fm = tax_utils.split_taxonomy(funky_fm)
    self.check_basic_case_worked(split_fm)
def test_match_inputs_feature_metadata_duplicate_name_internal_node(self):
    """Tests that feature metadata for internal nodes with duplicate
    names is preserved.

    In the JS interface, there are two options for coloring nodes by
    feature metadata: 1) just coloring tips (and propagating clades with
    uniform feature metadata upwards), or 2) coloring all nodes with
    feature metadata, which can include internal nodes. In 2), internal
    nodes with the same name will have the same feature metadata color.
    """
    # self.tree, but with "i" and "g" each used as two node names
    dup_name_tree = parse_newick('(((a:1,e:2)i:1,b:2)g:1,(:1,d:3)g:2)i:1;')
    fm = self.feature_metadata.copy()
    fm.index = ["a", "g", "i"]
    f_table, f_sample_metadata, t_fm, i_fm = tools.match_inputs(
        dup_name_tree, self.table, self.sample_metadata, fm
    )
    assert_frame_equal(f_table, self.table)
    assert_frame_equal(f_sample_metadata, self.sample_metadata)
    split_fm = split_taxonomy(fm)
    # The crux of this test: nothing should have been dropped, even
    # though g and i each name multiple nodes.
    assert_frame_equal(t_fm, split_fm.loc[["a"]])
    assert_frame_equal(i_fm, split_fm.loc[["g", "i"]], check_like=True)
def setUp(self):
    """Builds the shared tree/table/metadata/ordination fixtures."""
    self.tree = self.mock_tree_from_nwk()
    self.bp_tree = from_skbio_treenode(self.tree)
    counts = np.array([[1, 2, 0, 4],
                       [8, 7, 0, 5],
                       [1, 0, 0, 0],
                       [0, 0, 0, 0]])
    sample_ids = ['Sample1', 'Sample2', 'Sample3', 'Sample4']
    self.table = biom.Table(counts.T, list('abed'), sample_ids)
    self.sample_metadata = pd.DataFrame(
        {
            "Metadata1": [0, 0, 0, 1],
            "Metadata2": [0, 0, 0, 0],
            "Metadata3": [1, 2, 3, 4],
            "Metadata4": ["abc", "def", "ghi", "jkl"],
        },
        index=list(self.table.ids()))
    # (These are some Greengenes taxonomy annotations I took from the
    # moving pictures taxonomy.qza file. I made up the confidences.)
    gg_annotations = [
        ("k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
         "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__"),
        ("k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; "
         "o__Pasteurellales; f__Pasteurellaceae; g__; s__"),
        ("k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
         "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; "
         "s__uniformis"),
    ]
    self.feature_metadata = pd.DataFrame(
        {"Taxonomy": gg_annotations, "Confidence": [0.95, 0.8, 0]},
        index=["e", "h", "a"])
    self.split_tax_fm, self.taxcols = split_taxonomy(self.feature_metadata)
    self.tip_md = self.split_tax_fm.loc[["a", "e"]]
    self.int_md = self.split_tax_fm.loc[["h"]]
    # This is designed to match the shearing that's done in the core
    # test for --p-shear-to-table
    self.shorn_tree = parse_newick(
        "(((a:1)EmpressNode0:1,b:2)g:1,(d:3)h:2)EmpressNode1:1;")
    self.exp_split_fm_cols = [
        "Level 1", "Level 2", "Level 3", "Level 4", "Level 5", "Level 6",
        "Level 7", "Confidence"
    ]
    pc_ids = ['PC1', 'PC2', 'PC3']
    eigvals = pd.Series([0.50, 0.25, 0.25], index=pc_ids)
    samples_df = pd.DataFrame(
        [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4],
         [0.3, 0.4, 0.5], [0.4, 0.5, 0.6]],
        index=sample_ids, columns=pc_ids)
    proportion_explained = pd.Series([15.5, 12.2, 8.8], index=pc_ids)
    self.ordination = OrdinationResults(
        'PCoA', 'Principal Coordinate Analysis', eigvals, samples_df,
        proportion_explained=proportion_explained)
def test_split_taxonomy_all_rows_no_semicolons(self):
    """If no taxonomy value contains a semicolon, we warn and produce a
    single "Level 1" column."""
    funky_fm = self.feature_metadata.copy()
    new_taxa = {"f1": "Bacteria", "f2": "Archaea",
                "f3": "Bacteria", "f4": "Viruses"}
    for fid, taxon in new_taxa.items():
        funky_fm.loc[fid, "Taxonomy"] = taxon
    exp_warning = (
        "None of the taxonomy values in the feature metadata "
        r"contain a semicolon \(;\). Please make sure your taxonomy "
        'is formatted so that "levels" are separated by semicolons.'
    )
    with self.assertWarnsRegex(tax_utils.TaxonomyWarning, exp_warning):
        split_fm = tax_utils.split_taxonomy(funky_fm)
    self.assertCountEqual(split_fm.columns, ["Level 1", "Confidence"])
    # Each row keeps its (unsplit) taxonomy string and its confidence
    exp_confidences = {"f1": 0.95, "f2": 0.8, "f3": 0, "f4": 1}
    for fid, conf in exp_confidences.items():
        assert_series_equal(
            split_fm.loc[fid],
            pd.Series({"Level 1": new_taxa[fid], "Confidence": conf},
                      name=fid))
def match_tree_and_feature_metadata(bp_tree, feature_metadata=None):
    """Processes feature metadata and subsets it to nodes in the tree.

    NOTE: This function calls bp_tree_tips() on bp_tree. If this winds up
    being a bottleneck, we could add an extra optional parameter to this
    function where match_inputs() could pass the already-computed tip
    names here to avoid calling this function twice.

    Parameters
    ----------
    bp_tree: bp.BP
        The tree to be visualized.
    feature_metadata: pd.DataFrame or None
        Feature metadata. If this is passed, the index should describe
        node names in the tree and the columns should describe different
        feature metadata fields' names.

    Returns
    -------
    (tip_metadata, int_metadata): (pd.DataFrame or None,
                                   pd.DataFrame or None)
        If feature metadata was not passed, both entries are None.
        Otherwise, tip_metadata contains the rows of the feature metadata
        whose feature name was present as a tip in the tree, and
        int_metadata contains the rows whose feature name was present as
        internal node(s) in the tree. The feature metadata is run through
        taxonomy_utils.split_taxonomy() before being divided up between
        tip and internal node metadata.

    Raises
    ------
    DataMatchingError
        If feature_metadata is not None, but none of the names in its
        index correspond to any nodes in the tree.
    """
    # No feature metadata -> nothing to match or split
    if feature_metadata is None:
        return None, None

    # Split up taxonomy column, if present in the feature metadata
    split_fm = taxonomy_utils.split_taxonomy(feature_metadata)
    fm_ids = split_fm.index

    # Rows whose names are tips in the tree...
    tip_metadata = split_fm.loc[fm_ids.intersection(bp_tree_tips(bp_tree))]

    # ...and, separately, rows whose names are internal nodes
    int_names = set(bp_tree_non_tips(bp_tree))
    int_metadata = split_fm.loc[fm_ids.intersection(int_names)]

    if len(tip_metadata.index) == 0 and len(int_metadata.index) == 0:
        # Error condition 5 in match_inputs_community_plot()
        raise DataMatchingError(
            "No features in the feature metadata are present in the tree, "
            "either as tips or as internal nodes."
        )
    return tip_metadata, int_metadata
def test_split_taxonomy_rows_with_no_semicolons(self):
    """Rows without semicolons get padded out with "Unspecified" levels
    when other rows do have semicolons."""
    funky_fm = self.feature_metadata.copy()
    funky_fm.loc["f1", "Taxonomy"] = "birds aren't real"
    funky_fm.loc["f2", "Taxonomy"] = "theyve been drones"
    funky_fm.loc["f3", "Taxonomy"] = "all along :O"
    split_fm, taxcols = tax_utils.split_taxonomy(funky_fm)
    # Notice that f4's taxonomy is still there -- so each feature will
    # have 3 levels
    self.assertCountEqual(
        split_fm.columns, ["Level 1", "Level 2", "Level 3", "Confidence"])
    assert_taxcols_ok(taxcols, exp_num_levels=3)
    # Check each row individually
    exp_rows = {
        "f1": ("birds aren't real", "Unspecified", "Unspecified", 0.95),
        "f2": ("theyve been drones", "Unspecified", "Unspecified", 0.8),
        "f3": ("all along :O", "Unspecified", "Unspecified", 0),
        "f4": ("k__Bacteria", "p__Firmicutes", "c__Bacilli", 1),
    }
    for fid, (lv1, lv2, lv3, conf) in exp_rows.items():
        assert_series_equal(
            split_fm.loc[fid],
            pd.Series(
                {
                    "Level 1": lv1,
                    "Level 2": lv2,
                    "Level 3": lv3,
                    "Confidence": conf,
                },
                name=fid))
def test_split_taxonomy_basic_case(self):
    """Sanity check: splitting a normal taxonomy column works."""
    initial_fm = self.feature_metadata.copy()
    split_fm, taxcols = tax_utils.split_taxonomy(initial_fm)
    # The input DataFrame must NOT have been modified in place
    assert_frame_equal(self.feature_metadata, initial_fm)
    # Delegate the in-depth checks of split_fm / taxcols to a utility
    # func (we'll reuse this code a bit later)
    self._check_basic_case_worked(split_fm, taxcols)
def setUp(self):
    """Builds the shared tree/table/metadata fixtures for these tests."""
    self.tree = self.mock_tree_from_nwk()
    self.bp_tree = from_skbio_treenode(self.tree)
    # Test table/metadata (mostly) adapted from Qurro:
    # https://github.com/biocore/qurro/blob/b9613534b2125c2e7ee22e79fdff311812f4fefe/qurro/tests/test_df_utils.py#L178
    sample_counts = {
        "Sample1": [1, 2, 0, 4],
        "Sample2": [8, 7, 0, 5],
        "Sample3": [1, 0, 0, 0],
        "Sample4": [0, 0, 0, 0],
    }
    self.table = pd.DataFrame(sample_counts, index=["a", "b", "e", "d"])
    self.sample_metadata = pd.DataFrame(
        {
            "Metadata1": [0, 0, 0, 1],
            "Metadata2": [0, 0, 0, 0],
            "Metadata3": [1, 2, 3, 4],
            "Metadata4": ["abc", "def", "ghi", "jkl"],
        },
        index=list(self.table.columns)[:]
    )
    # (These are some Greengenes taxonomy annotations I took from the
    # moving pictures taxonomy.qza file. I made up the confidences.)
    gg_annotations = [
        (
            "k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
            "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__"
        ),
        (
            "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; "
            "o__Pasteurellales; f__Pasteurellaceae; g__; s__"
        ),
        (
            "k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
            "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; "
            "s__uniformis"
        ),
    ]
    self.feature_metadata = pd.DataFrame(
        {"Taxonomy": gg_annotations, "Confidence": [0.95, 0.8, 0]},
        index=["e", "h", "a"]
    )
    self.split_tax_fm = split_taxonomy(self.feature_metadata)
    self.exp_split_fm_cols = [
        "Level 1", "Level 2", "Level 3", "Level 4", "Level 5", "Level 6",
        "Level 7", "Confidence"
    ]
def test_match_inputs_feature_metadata_only_internal_node_metadata(self):
    """Tests that feature metadata only for internal nodes is allowed."""
    # Slightly modified version of self.tree where root has a name (i)
    named_root_tree = parse_newick('(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2)i:1;')
    fm = self.feature_metadata.copy()
    fm.index = ["h", "g", "i"]
    f_table, f_sample_metadata, t_fm, i_fm = tools.match_inputs(
        named_root_tree, self.table, self.sample_metadata, fm)
    self.assertEqual(f_table, self.table)
    assert_frame_equal(f_sample_metadata, self.sample_metadata)
    split_fm = split_taxonomy(fm)
    # 1) The tip metadata should be empty...
    self.assertEqual(len(t_fm.index), 0)
    # 2) ...while all of the internal node metadata should be preserved
    assert_frame_equal(i_fm, split_fm.loc[fm.index], check_like=True)
    # 3) Both DataFrames should have identical (expected) columns
    self.assertListEqual(list(t_fm.columns), self.exp_split_fm_cols)
    self.assertListEqual(list(i_fm.columns), self.exp_split_fm_cols)
def test_match_inputs_feature_metadata_root_metadata_allowed(self):
    """Tests that feature metadata for the root node is preserved."""
    # Slightly modified version of self.tree where root has a name (i)
    named_root_tree = parse_newick('(((a:1,e:2):1,b:2)g:1,(:1,d:3)h:2)i:1;')
    fm = self.feature_metadata.copy()
    fm.index = ["a", "g", "i"]
    f_table, f_sample_metadata, t_fm, i_fm = tools.match_inputs(
        named_root_tree, self.table, self.sample_metadata, fm)
    # (check that we didn't mess up the table / sample metadata matching
    # by accident)
    self.assertEqual(f_table, self.table)
    assert_frame_equal(f_sample_metadata, self.sample_metadata)
    split_fm = split_taxonomy(fm)
    # Main point of this test: all of the feature metadata should have
    # been kept, since a, g, and i are all included in the tree (i in
    # particular is important to verify, since it's the root)
    assert_frame_equal(t_fm, split_fm.loc[["a"]])
    assert_frame_equal(i_fm, split_fm.loc[["g", "i"]], check_like=True)
def test_split_taxonomy_level_column_but_no_taxonomy_column(self):
    """A "Level" column is fine if there's no taxonomy column to split."""
    meh_fm = self.feature_metadata.copy()
    meh_fm.columns = ["I'm ambivalent!", "Level 20"]
    result_fm, taxcols = tax_utils.split_taxonomy(meh_fm)
    # With nothing to split, the data comes back unchanged...
    assert_frame_equal(meh_fm, result_fm)
    # ...and no taxonomy columns are reported
    self.assertEqual(taxcols, [])
def test_split_taxonomy_no_tax_column(self):
    """With no recognized taxonomy column, split_taxonomy() is a no-op."""
    no_tax_fm = self.feature_metadata.copy()
    no_tax_fm.columns = ["asdf", "ghjkl"]
    result_fm, taxcols = tax_utils.split_taxonomy(no_tax_fm)
    # The data should be returned unchanged, with no taxonomy columns
    assert_frame_equal(no_tax_fm, result_fm)
    self.assertEqual(taxcols, [])
def match_inputs(tree, table, sample_metadata, feature_metadata=None,
                 ignore_missing_samples=False, filter_missing_features=False):
    """Matches various input sources.

    Also "splits up" the feature metadata, first by calling
    taxonomy_utils.split_taxonomy() on it and then by splitting the
    resulting DataFrame into two separate DataFrames (one for tips and
    one for internal nodes).

    Parameters
    ----------
    tree: empress.tree.Tree
        The tree to be visualized.
    table: pd.DataFrame
        Representation of the feature table. The index should describe
        feature IDs; the columns should describe sample IDs. (It's
        expected that feature IDs in the table only describe tips in the
        tree, not internal nodes.)
    sample_metadata: pd.DataFrame
        Sample metadata. The index should describe sample IDs; the
        columns should describe different sample metadata fields' names.
    feature_metadata: pd.DataFrame or None
        Feature metadata. If this is passed, the index should describe
        feature IDs and the columns should describe different feature
        metadata fields' names. (Feature IDs here can describe tips or
        internal nodes in the tree.)
    ignore_missing_samples: bool
        If True, pads missing samples (i.e. samples in the table but not
        the metadata) with placeholder metadata. If False, raises a
        DataMatchingError if any such samples exist. (Note that in either
        case, samples in the metadata but not in the table are filtered
        out; and if no samples are shared between the table and metadata,
        a DataMatchingError is raised regardless.) This is analogous to
        the ignore_missing_samples flag in Emperor.
    filter_missing_features: bool
        If True, filters features from the table that aren't present as
        tips in the tree. If False, raises a DataMatchingError if any
        such features exist. (Note that in either case, features in the
        tree but not in the table are preserved.)

    Returns
    -------
    (table, sample_metadata, tip_metadata, int_metadata):
        (pd.DataFrame, pd.DataFrame, pd.DataFrame / None,
         pd.DataFrame / None)
        Versions of the input table, sample metadata, and feature
        metadata filtered such that:
        -The table only contains features also present as tips in the
         tree.
        -The sample metadata only contains samples also present in the
         table.
        -Samples present in the table but not in the sample metadata will
         have all of their sample metadata values set to "This sample has
         no metadata". (This will only be done if ignore_missing_samples
         is True; otherwise, this situation will trigger an error. See
         below.)
        -If feature metadata was not passed, tip_metadata and
         int_metadata will both be None. Otherwise, tip_metadata will
         contain the entries of the feature metadata where the feature
         name was present as a tip in the tree, and int_metadata will
         contain the entries of the feature metadata where the feature
         name was present as internal node(s) in the tree.
        -Also, for sanity's sake, this will call
         taxonomy_utils.split_taxonomy() on the feature metadata before
         splitting it up into tip and internal node metadata.

    Raises
    ------
    DataMatchingError
        If any of the following conditions are met:
        1. No features are shared between the tree's tips and table.
        2. There are features present in the table but not as tips in the
           tree, AND filter_missing_features is False.
        3. No samples are shared between the sample metadata and table.
        4. There are samples present in the table but not in the sample
           metadata, AND ignore_missing_samples is False.
        5. The feature metadata was passed, but no features present in it
           are also present as tips or internal nodes in the tree.

    References
    ----------
    This function was based on match_table_and_data() in Qurro's code:
    https://github.com/biocore/qurro/blob/b9613534b2125c2e7ee22e79fdff311812f4fefe/qurro/_df_utils.py#L255
    """
    # Match table and tree.
    # (Ignore None-named tips in the tree, which will be replaced later on
    # with "default" names like "EmpressNode0".)
    tip_names = set([n.name for n in tree.tips() if n.name is not None])
    tree_and_table_features = table.index.intersection(tip_names)

    if len(tree_and_table_features) == 0:
        # Error condition 1
        raise DataMatchingError(
            "No features in the feature table are present as tips in the tree."
        )

    ff_table = table.copy()
    if len(tree_and_table_features) < len(table.index):
        if filter_missing_features:
            # Filter table to just features that are also present in the tree.
            #
            # Note that we *don't* filter the tree analogously, because it's ok
            # for the tree's nodes to be a superset of the table's features
            # (and this is going to be the case in most datasets where the
            # features correspond to tips, since internal nodes aren't
            # explicitly described in the feature table).
            ff_table = table.loc[tree_and_table_features]

            # Report to user about any dropped features from table.
            dropped_feature_ct = table.shape[0] - ff_table.shape[0]
            warnings.warn(
                ("{} feature(s) in the table were not present as tips in "
                 "the tree. These feature(s) have been removed from the "
                 "visualization.").format(dropped_feature_ct),
                DataMatchingWarning)
        else:
            # Error condition 2
            raise DataMatchingError(
                "The feature table contains features that aren't present as "
                "tips in the tree. You can override this error by using the "
                "--p-filter-missing-features flag.")

    # Match table (post-feature-filtering, if done) and sample metadata.
    table_samples = set(ff_table.columns)
    sm_samples = set(sample_metadata.index)
    sm_and_table_samples = sm_samples & table_samples

    if len(sm_and_table_samples) == 0:
        # Error condition 3
        raise DataMatchingError(
            "No samples in the feature table are present in the sample "
            "metadata.")

    padded_metadata = sample_metadata.copy()
    if len(sm_and_table_samples) < len(ff_table.columns):
        if ignore_missing_samples:
            # Works similarly to how Emperor does this: see
            # https://github.com/biocore/emperor/blob/659b62a9f02a6423b6258c814d0e83dbfd05220e/emperor/core.py#L350
            samples_without_metadata = table_samples - sm_samples
            # Build a placeholder DataFrame (same columns, all values set
            # to the "no metadata" sentinel string) for the missing samples
            padded_metadata = pd.DataFrame(index=samples_without_metadata,
                                           columns=sample_metadata.columns,
                                           dtype=str)
            padded_metadata.fillna("This sample has no metadata",
                                   inplace=True)
            sample_metadata = pd.concat([sample_metadata, padded_metadata])
            # Report to user about samples we needed to "pad."
            warnings.warn(
                ("{} sample(s) in the table were not present in the "
                 "sample metadata. These sample(s) have been assigned "
                 "placeholder metadata.").format(
                    len(samples_without_metadata)),
                DataMatchingWarning)
        else:
            # Error condition 4
            raise DataMatchingError(
                "The feature table contains samples that aren't present in "
                "the sample metadata. You can override this error by using "
                "the --p-ignore-missing-samples flag.")

    # If we've made it this far, then there must be at least *one* sample
    # present in both the sample metadata and the table: and by this point the
    # metadata's samples should be a superset of the table's samples (since we
    # padded the metadata above if there were any samples that *weren't* in
    # the table).
    #
    # All that's left to do is to filter the sample metadata to just the
    # samples that are also present in the table.
    sf_sample_metadata = sample_metadata.loc[ff_table.columns]

    # If desired, we could report here to the user about any dropped samples
    # from the metadata by looking at the difference between
    # sample_metadata.shape[0] and sf_sample_metadata.shape[0]. However, the
    # presence of such "dropped samples" is a common occurrence in 16S
    # studies, so we currently don't do that for the sake of avoiding alarm
    # fatigue.

    # If the feature metadata was passed, filter it so that it only contains
    # features present as tips / internal nodes in the tree
    tip_metadata = None
    int_metadata = None
    if feature_metadata is not None:
        # Split up taxonomy column, if present in the feature metadata
        ts_feature_metadata = taxonomy_utils.split_taxonomy(feature_metadata)
        fm_ids = ts_feature_metadata.index

        # Subset tip metadata
        fm_and_tip_features = fm_ids.intersection(tip_names)
        tip_metadata = ts_feature_metadata.loc[fm_and_tip_features]

        # Subset internal node metadata
        # (include_self=True ensures the root's own name is considered)
        internal_node_names = set(
            [n.name for n in tree.non_tips(include_self=True)])
        fm_and_int_features = fm_ids.intersection(internal_node_names)
        int_metadata = ts_feature_metadata.loc[fm_and_int_features]

        if len(tip_metadata.index) == 0 and len(int_metadata.index) == 0:
            # Error condition 5
            raise DataMatchingError(
                "No features in the feature metadata are present in the tree, "
                "either as tips or as internal nodes.")

    return ff_table, sf_sample_metadata, tip_metadata, int_metadata