def test_ranks_specific(self):
        # Query ranks_specific() once per sample on a Taxonomy built with
        # rank_level=2 and compare each result with a hand-written frame.
        # Observed and expected frames are all passed through
        # self._clean_sort_df before being compared.
        frame_columns = ['Taxon', 'Sample ID', 'Rank']
        sample_ids = ['sample-1', 'sample-2', 'sample-3']
        expected_rows = [
            [['c', 'sample-1', 1.], ['g', 'sample-1', 2]],
            [['c', 'sample-2', 1.]],
            [['c', 'sample-3', 2.], ['g', 'sample-3', 1]],
        ]
        expected = [pd.DataFrame(rows, columns=frame_columns)
                    for rows in expected_rows]

        taxonomy = Taxonomy(self.table, self.taxonomy_df, rank_level=2)
        observed = [taxonomy.ranks_specific(sid) for sid in sample_ids]

        # Clean the observed frames first, then the expected ones.
        for frame in observed + expected:
            self._clean_sort_df(frame, ['Taxon', 'Sample ID'])

        # check_like=True: row/column order is not part of the comparison.
        for obs, exp in zip(observed, expected):
            pdt.assert_frame_equal(obs, exp, check_like=True)
# Example #2
# 0
 def test_init_disjoint(self):
     # Constructing a Taxonomy from (table, taxonomy2_df) and from
     # (table2, taxonomy_df) must each raise DisjointError with the
     # "disjoint" message.
     message = "Table and features are disjoint"
     mismatched_inputs = [
         (self.table, self.taxonomy2_df),
         (self.table2, self.taxonomy_df),
     ]
     for table, features in mismatched_inputs:
         with self.assertRaisesRegex(DisjointError, message):
             Taxonomy(table, features)
 def test_get_group(self):
     # get_group() for the single sample 'sample-2' must equal the
     # hand-built GroupTaxonomy. No variances are passed to Taxonomy and
     # the expected feature_variances are all 0.0.
     taxonomy = Taxonomy(self.table, self.taxonomy_df)
     expected_group = GroupTaxonomy(
         name='sample-2',
         taxonomy='((((feature-1,((feature-2)e)d)c)b)a);',
         features=['feature-1', 'feature-2'],
         feature_values=[1. / 5, 4. / 5],
         feature_variances=[0.0, 0.0],
     )
     observed_group = taxonomy.get_group(['sample-2'])
     self.assertEqual(observed_group, expected_group)
 def test_get_group_with_variances(self):
     # Same check as the plain get_group test, but with self.table_vars
     # passed as the third Taxonomy argument; the expected
     # feature_variances for 'sample-1' are then 2.0 and 3.0.
     taxonomy = Taxonomy(self.table, self.taxonomy_df, self.table_vars)
     expected_tree = '((((((feature-2)e)d)c)b,(((feature-3)h)g)f)a);'
     expected_group = GroupTaxonomy(
         name='sample-1',
         taxonomy=expected_tree,
         features=['feature-2', 'feature-3'],
         feature_values=[2. / 5, 3. / 5],
         feature_variances=[2.0, 3.0],
     )
     observed_group = taxonomy.get_group(['sample-1'])
     self.assertEqual(observed_group, expected_group)
 def test_qza_integration(self):
     # Smoke test: wrap the fixtures in Artifact objects, view them back
     # as biom.Table / pd.DataFrame, and make sure a Taxonomy built from
     # those views can serve a get_group() call. Nothing is asserted
     # about the result; the test only fails if something raises.
     table_artifact = Artifact.import_data(
         "FeatureTable[Frequency]", self.table)
     taxonomy_artifact = Artifact.import_data(
         "FeatureData[Taxonomy]", self.taxonomy_df)

     viewed_table = table_artifact.view(biom.Table)
     viewed_features = taxonomy_artifact.view(pd.DataFrame)

     taxonomy = Taxonomy(viewed_table, viewed_features)
     taxonomy.get_group(['sample-1', 'sample-2'], 'foo')
 def test_get_group_multiple(self):
     # get_group() over two samples with the explicit name 'foo'.
     # name, taxonomy, features and feature_variances are compared
     # exactly; feature_values go through npt.assert_almost_equal.
     taxonomy = Taxonomy(self.table, self.taxonomy_df)
     expected_tree = (
         '((((feature-1,((feature-2)e)d)c)b,(((feature-3)h)g)f)a);')
     exp = GroupTaxonomy(
         name='foo',
         taxonomy=expected_tree,
         features=['feature-1', 'feature-2', 'feature-3'],
         feature_values=[1. / 10, 6. / 10, 3. / 10],
         feature_variances=[0.0, 0.0, 0.0],
     )
     obs = taxonomy.get_group(['sample-1', 'sample-2'], 'foo')

     for attr in ('name', 'taxonomy', 'features'):
         self.assertEqual(getattr(obs, attr), getattr(exp, attr))
     npt.assert_almost_equal(obs.feature_values, exp.feature_values)
     self.assertEqual(obs.feature_variances, exp.feature_variances)
    def test_init_disjoint_variances(self):
        # Rename 'sample-1' in a copy of the variances table so its sample
        # IDs no longer line up with self.table; Taxonomy must then raise
        # DisjointError mentioning the variances.
        id_map = {'sample-1': 'sample-bad'}
        mismatched_vars = self.table_vars.copy()
        mismatched_vars.update_ids(id_map, inplace=True, strict=False)

        message = "Table and variances are disjoint"
        with self.assertRaisesRegex(DisjointError, message):
            Taxonomy(self.table, self.taxonomy_df, mismatched_vars)
 def test_bp_tree(self):
     # Check the bp_tree attribute of a Taxonomy: first the raw B array,
     # then the non-None names found by calling name(i) for every
     # position of B, in order.
     bp_tree = Taxonomy(self.table, self.taxonomy_df).bp_tree

     exp_parens = [
         1, 1, 1, 1, 1, 0,
         1, 1, 1, 0, 0, 0, 0, 0,
         1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
     ]
     self.assertListEqual(exp_parens, list(bp_tree.B))

     exp_names = [
         'a', 'b', 'c', 'feature-1',
         'd', 'e', 'feature-2',
         'f', 'g', 'h', 'feature-3',
     ]
     # Positions for which name() returns None are skipped.
     all_names = (bp_tree.name(i) for i in range(len(bp_tree.B)))
     obs_names = [name for name in all_names if name is not None]
     self.assertListEqual(exp_names, obs_names)
    def test_ranks_order(self):
        # ranks_order() is called with no argument, with both taxa in
        # reverse order, and with a single taxon; each case pairs the
        # positional arguments with the expected list.
        taxonomy = Taxonomy(self.table, self.taxonomy_df, rank_level=2)

        cases = [
            ((), ['c', 'g']),
            ((['g', 'c'],), ['c', 'g']),
            ((['c'],), ['c']),
        ]
        for args, exp in cases:
            obs = taxonomy.ranks_order(*args)
            self.assertEqual(obs, exp)
    def test_rare_unique(self):
        # feature 1 is "rare" for samples 2 and 3 at a threshold of <= 50%
        # feature 3 is "unique" to sample 1
        counts = np.array([[0, 1, 2, 0],
                           [2, 4, 6, 1],
                           [3, 0, 0, 0]])
        feature_ids = ['feature-1', 'feature-2', 'feature-3']
        sample_ids = ['sample-1', 'sample-2', 'sample-3', 'sample-4']
        table = biom.Table(counts, feature_ids, sample_ids)

        taxon_rows = [
            ['feature-1', 'a; b; c', 0.123],
            ['feature-2', 'a; b; c; d; e', 0.345],
            ['feature-3', 'a; f; g; h', 0.678],
        ]
        taxonomy_df = pd.DataFrame(
            taxon_rows, columns=['Feature ID', 'Taxon', 'Confidence'])
        taxonomy_df.set_index('Feature ID', inplace=True)
        tax = Taxonomy(table, taxonomy_df)

        # Expected rare_unique() result per sample at rare_threshold=0.51.
        exp = {
            'sample-1': {'rare': {'feature-3': 0.25},
                         'unique': ['feature-3']},
            'sample-2': {'rare': {'feature-1': 0.5}, 'unique': None},
            'sample-3': {'rare': {'feature-1': 0.5}, 'unique': None},
            'sample-4': {'rare': None, 'unique': None},
        }

        for sample_id, expected_entry in exp.items():
            obs = tax.rare_unique(sample_id, rare_threshold=0.51)
            self.assertEqual(obs, expected_entry)
 def test_init_variances(self):
     # With a variances table supplied, the Taxonomy must hold
     # self.table.copy().norm() as _table, the variances as _variances
     # and the features frame as _features, with observation IDs in the
     # same order across all three.
     taxonomy = Taxonomy(self.table, self.taxonomy_df, self.table_vars)

     normed_table = self.table.copy().norm()
     self.assertEqual(taxonomy._table, normed_table)
     self.assertEqual(taxonomy._variances, self.table_vars)
     pdt.assert_frame_equal(taxonomy._features, self.taxonomy_df)

     table_ids = list(taxonomy._table.ids(axis='observation'))
     feature_ids = list(taxonomy._features.index)
     self.assertEqual(table_ids, feature_ids)

     variance_ids = list(taxonomy._variances.ids(axis='observation'))
     self.assertEqual(table_ids, variance_ids)
    def test_ranks_sample(self):
        # ranks_sample(5) must give back exactly the five expected rows.
        # ranks_sample(4) is allowed to produce either of two taxon
        # multisets (presumably one row is left out at random -- confirm
        # against the implementation). ranks_sample(100) must produce the
        # taxa of all five rows.
        all_rows = [
            ['c', 'sample-1', 1.],
            ['c', 'sample-2', 1],
            ['c', 'sample-3', 2],
            ['g', 'sample-1', 2],
            ['g', 'sample-3', 1],
        ]
        exp = pd.DataFrame(all_rows, columns=['Taxon', 'Sample ID', 'Rank'])
        taxonomy = Taxonomy(self.table, self.taxonomy_df, rank_level=2)

        full = taxonomy.ranks_sample(5)
        self._clean_sort_df(full, ['Taxon', 'Sample ID'])
        self._clean_sort_df(exp, ['Taxon', 'Sample ID'])
        pdt.assert_frame_equal(full, exp, check_like=True)

        subset = taxonomy.ranks_sample(4)
        acceptable = [['c', 'c', 'c', 'g'], ['c', 'c', 'g', 'g']]
        self.assertIn(sorted(subset['Taxon'].values), acceptable)

        oversized = taxonomy.ranks_sample(100)
        self.assertEqual(sorted(oversized['Taxon'].values),
                         ['c', 'c', 'c', 'g', 'g'])
    def test_init_rankdata(self):
        # A Taxonomy built with rank_level=2 must expose the expected
        # (Taxon, Sample ID, Rank) rows in its _ranked attribute; both
        # frames go through self._clean_sort_df before the comparison.
        expected_rows = [
            ['c', 'sample-1', 1.],
            ['c', 'sample-2', 1],
            ['c', 'sample-3', 2],
            ['g', 'sample-1', 2],
            ['g', 'sample-3', 1],
        ]
        exp = pd.DataFrame(expected_rows,
                           columns=['Taxon', 'Sample ID', 'Rank'])

        ranked = Taxonomy(self.table, self.taxonomy_df,
                          rank_level=2)._ranked

        self._clean_sort_df(ranked, ['Taxon', 'Sample ID'])
        self._clean_sort_df(exp, ['Taxon', 'Sample ID'])
        pdt.assert_frame_equal(ranked, exp, check_like=True)
    def test_get_counts(self):
        # get_counts() is checked for the 'Kingdom' and 'Phylum' levels on
        # taxon strings carrying k__/p__/c__/o__/f__ prefixes: first over
        # all samples (implicitly and with an explicit list), then with a
        # single sample ID passed as `samples`.
        taxon_rows = [
            ['feature-1', 'k__a; p__b; c__c', 0.123],
            ['feature-2', 'k__a; p__b; c__c; o__d; f__e', 0.345],
            ['feature-3', 'k__a; p__f; c__g; o__h', 0.678],
        ]
        taxonomy_df = pd.DataFrame(
            taxon_rows, columns=['Feature ID', 'Taxon', 'Confidence'])
        taxonomy_df.set_index('Feature ID', inplace=True)
        taxonomy = Taxonomy(self.table, taxonomy_df)

        overall = [('Kingdom', {'a': 3}), ('Phylum', {'b': 2, 'f': 1})]
        for level, exp in overall:
            self.assertEqual(taxonomy.get_counts(level), exp)
            explicit = taxonomy.get_counts(
                level, samples=['sample-1', 'sample-2', 'sample-3'])
            self.assertEqual(explicit, exp)

        per_sample = [
            ('sample-1', [
                ('Kingdom', {'a': 2}),
                ('Phylum', {'b': 1, 'f': 1}),
            ]),
            ('sample-2', [
                ('Kingdom', {'a': 2}),
                ('Phylum', {'b': 2}),
            ]),
            ('sample-3', [
                ('Kingdom', {'a': 3}),
                ('Phylum', {'b': 2, 'f': 1}),
            ]),
        ]
        for sample, level_counts in per_sample:
            for level, exp in level_counts:
                # The sample is passed as a bare string here, not a list.
                obs = taxonomy.get_counts(level, samples=sample)
                self.assertEqual(obs, exp)
    def test_ranks_order_unknown(self):
        # ranks_order() must raise UnknownID naming "foobar" whether the
        # unknown taxon is requested alone or after a known one.
        taxonomy = Taxonomy(self.table, self.taxonomy_df, rank_level=2)

        requests = [
            ["foobar"],
            ["c", "foobar"],
        ]
        for requested in requests:
            with self.assertRaisesRegex(UnknownID, "foobar"):
                taxonomy.ranks_order(requested)
    def test_index_taxa_prevalence(self):
        # Build a 3-feature x 4-sample table and check the per-feature
        # feature_uniques and feature_prevalence series of the Taxonomy.
        counts = np.array([[0, 1, 2, 0],
                           [2, 4, 6, 1],
                           [3, 0, 0, 0]])
        table = biom.Table(
            counts,
            ['feature-1', 'feature-2', 'feature-3'],
            ['sample-1', 'sample-2', 'sample-3', 'sample-4'])

        taxon_rows = [
            ['feature-1', 'a; b; c', 0.123],
            ['feature-2', 'a; b; c; d; e', 0.345],
            ['feature-3', 'a; f; g; h', 0.678],
        ]
        taxonomy_df = pd.DataFrame(
            taxon_rows, columns=['Feature ID', 'Taxon', 'Confidence'])
        taxonomy_df.set_index('Feature ID', inplace=True)
        tax = Taxonomy(table, taxonomy_df)

        # Expected values: only feature-3 is flagged unique; prevalences
        # are 0.5, 1.0 and 0.25 for features 1, 2 and 3.
        exp_unique = pd.Series(
            [False, False, True],
            index=['feature-1', 'feature-2', 'feature-3'])
        exp_prev = pd.Series(
            [0.5, 1., 0.25],
            index=['feature-1', 'feature-2', 'feature-3'])

        pdt.assert_series_equal(exp_unique, tax.feature_uniques)
        pdt.assert_series_equal(exp_prev, tax.feature_prevalence)
 def test_init_no_variances(self):
     # Without a variances argument, _variances must equal the
     # self.no_variances fixture; _table and _features are checked as in
     # the with-variances case.
     taxonomy = Taxonomy(self.table, self.taxonomy_df)

     normed_table = self.table.copy().norm()
     self.assertEqual(taxonomy._table, normed_table)
     self.assertEqual(taxonomy._variances, self.no_variances)
     pdt.assert_frame_equal(taxonomy._features, self.taxonomy_df)
 def test_get_feature_ids(self):
     # _get_feature_ids() must return the three feature IDs in order.
     expected_ids = ['feature-1', 'feature-2', 'feature-3']
     observed_ids = Taxonomy(self.table,
                             self.taxonomy_df)._get_feature_ids()
     npt.assert_equal(observed_ids, expected_ids)
 def test_get_sample_ids(self):
     # _get_sample_ids() must return the three sample IDs in order.
     expected_ids = ['sample-1', 'sample-2', 'sample-3']
     observed_ids = Taxonomy(self.table,
                             self.taxonomy_df)._get_sample_ids()
     npt.assert_equal(observed_ids, expected_ids)
    def test_presence_data_table(self):
        # Build a Taxonomy from self.taxonomy_greengenes_df plus variances
        # and check presence_data_table() for two samples: the column
        # descriptors must match, and every observed row must be matched
        # by one of the expected rows.
        taxonomy = Taxonomy(self.table, self.taxonomy_greengenes_df,
                            self.table_vars)
        obs = taxonomy.presence_data_table(['sample-1', 'sample-2'])

        exp_columns = [
            'sampleId', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family',
            'Genus', 'Species', 'relativeAbundance'
        ]
        entry_cls = create_data_entry(exp_columns)

        # One tuple per expected row, values in exp_columns order.
        expected_rows = [
            ('sample-1', 'a', 'b', None, 'c', 'd', 'e', None, 2. / 5),
            ('sample-1', 'a', 'f', None, 'g', 'h', None, None, 3. / 5),
            ('sample-2', 'a', 'b', None, 'c', None, None, None, 1. / 5),
            ('sample-2', 'a', 'b', None, 'c', 'd', 'e', None, 4. / 5),
        ]
        exp = DataTable(
            data=[entry_cls(**dict(zip(exp_columns, row)))
                  for row in expected_rows],
            columns=exp_columns,
        )

        expected_descriptors = [{'data': col} for col in exp.columns]
        self.assertListEqual(expected_descriptors, obs.columns)

        # Quadratic row matching -- fine for four rows, not for a large
        # frame. For each expected row, drop the first observed row that
        # equals it in every column; the test passes when no observed
        # rows are left over.
        exp_df = pd.DataFrame(exp.data)
        obs_df = pd.DataFrame(obs.data)
        unmatched = obs_df.copy()
        for _, wanted in exp_df.iterrows():
            for obs_idx, candidate in obs_df.iterrows():
                if wanted.eq(candidate).all():
                    unmatched.drop(index=obs_idx, inplace=True)
                    break
        self.assertTrue(unmatched.empty)
 def test_ranks_specific_missing_id(self):
     # Asking ranks_specific() for an ID that is not in the fixtures must
     # raise UnknownID with the ID in the message.
     missing_id = 'foobar'
     taxonomy = Taxonomy(self.table, self.taxonomy_df, rank_level=2)
     with self.assertRaisesRegex(UnknownID, missing_id):
         taxonomy.ranks_specific(missing_id)
 def test_init_allow_taxonomy_superset(self):
     # Construction with self.taxonomy_superset_df must simply not raise;
     # nothing about the resulting object is asserted.
     table, features = self.table, self.taxonomy_superset_df
     Taxonomy(table, features)
 def test_init_rankdata_order(self):
     # The index of _ranked_order must list the taxa as 'c' then 'g'
     # for a Taxonomy built with rank_level=2.
     taxonomy = Taxonomy(self.table, self.taxonomy_df, rank_level=2)
     observed_order = list(taxonomy._ranked_order.index)
     self.assertEqual(observed_order, ['c', 'g'])
 def test_init_disjoint(self):
     # Constructing a Taxonomy from (table, taxonomy2_df) and from
     # (table2, taxonomy_df) must each raise SubsetError with a
     # "not a subset" message.
     message = "not a subset"
     mismatched_inputs = [
         (self.table, self.taxonomy2_df),
         (self.table2, self.taxonomy_df),
     ]
     for table, features in mismatched_inputs:
         with self.assertRaisesRegex(SubsetError, message):
             Taxonomy(table, features)
 def test_get_group_missing(self):
     # get_group() with a sample ID absent from the table must raise
     # UnknownID with a "does not exist" message for that ID.
     taxonomy = Taxonomy(self.table, self.taxonomy_df)
     message = "sample-X does not exist"
     with self.assertRaisesRegex(UnknownID, message):
         taxonomy.get_group(['sample-X'])