Exemple #1
0
    def test_subsample_neighbors_locale_w_seed(self):
        # With a fixed seed the sampling should be reproducible, so the
        # result of the first call serves as the reference for later calls.
        def run_seeded():
            # arguments are re-evaluated on every call, including the
            # locale column lookup
            return subsample_neighbors(
                self.focal_seqs1,
                self.context_seqs1,
                percent_id=0.98,
                samples_per_cluster=2,
                locale=self.context_md1.get_column('x'),
                seed=0)

        reference_md = self.context_md1
        reference = run_seeded()

        # sanity-check the reference selection before using it as a baseline
        self.assertTrue(reference.inclusion['c1'])
        self.assertEqual(reference.inclusion.sum(), 3)
        self.assertEqual(reference.metadata, reference_md)

        for _ in range(self._N_TEST_ITERATIONS):
            repeat = run_seeded()
            pdt.assert_series_equal(repeat.inclusion, reference.inclusion)
Exemple #2
0
    def test_subsample_neighbors_metadata_subset(self):
        # The locale column is taken from the
        # 'context-metadata-1-missing-id.tsv' fixture (presumably lacking an
        # ID that is present in context_seqs1 -- confirm against the fixture).
        md_path = self.get_data_path('context-metadata-1-missing-id.tsv')
        incomplete_md = qiime2.Metadata.load(md_path)
        expected_error = 'not present in the metadata'

        # the call must be rejected with a ValueError matching the message
        with self.assertRaisesRegex(ValueError, expected_error):
            subsample_neighbors(
                self.focal_seqs1,
                self.context_seqs1,
                percent_id=0.98,
                samples_per_cluster=1,
                locale=incomplete_md.get_column('x'))
Exemple #3
0
    def test_subsample_neighbors_no_locale_alt_samples_per_cluster(self):
        # No locale is passed and samples_per_cluster is set to 3.
        context_ids = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6']
        expected_flags = pd.Series(
            [True, True, True, False, True, False],
            index=context_ids,
            name='inclusion')
        # the expected metadata is a column-less table indexed by the IDs
        id_only_table = pd.DataFrame(index=pd.Index(context_ids, name='id'))
        expected_md = qiime2.Metadata(id_only_table)

        observed = subsample_neighbors(
            self.focal_seqs1,
            self.context_seqs1,
            percent_id=0.98,
            samples_per_cluster=3)

        pdt.assert_series_equal(observed.inclusion, expected_flags)
        self.assertEqual(observed.metadata, expected_md)
        self.assertEqual(observed.label, 'subsample_neighbors')
Exemple #4
0
    def test_subsample_neighbors_terminal_gaps_ignored(self):
        # Even at percent_id=1.0 the single context sequence 'c1' is expected
        # to be included (presumably focal_seqs2/context_seqs2 differ only in
        # terminal gaps -- confirm against the fixtures).
        only_id = ['c1']
        expected_flags = pd.Series([True], index=only_id, name='inclusion')
        # the expected metadata is a column-less table indexed by 'c1'
        id_only_table = pd.DataFrame(index=pd.Index(only_id, name='id'))
        expected_md = qiime2.Metadata(id_only_table)

        observed = subsample_neighbors(
            self.focal_seqs2,
            self.context_seqs2,
            percent_id=1.0,
            samples_per_cluster=2)

        pdt.assert_series_equal(observed.inclusion, expected_flags)
        self.assertEqual(observed.metadata, expected_md)
        self.assertEqual(observed.label, 'subsample_neighbors')
Exemple #5
0
    def test_subsample_neighbors_metadata_superset(self):
        # The locale column comes from the 'context-metadata-2-extra-ids.tsv'
        # fixture; the result is expected to cover only 'c1', with metadata
        # equal to the loaded metadata filtered down to that ID.
        md_path = self.get_data_path('context-metadata-2-extra-ids.tsv')
        superset_md = qiime2.Metadata.load(md_path)

        observed = subsample_neighbors(
            self.focal_seqs2,
            self.context_seqs2,
            percent_id=1.0,
            samples_per_cluster=2,
            locale=superset_md.get_column('x'))

        expected_flags = pd.Series([True], index=['c1'], name='inclusion')
        expected_md = superset_md.filter_ids(['c1'])

        pdt.assert_series_equal(observed.inclusion, expected_flags)
        self.assertEqual(observed.metadata, expected_md)
        self.assertEqual(observed.label, 'subsample_neighbors')
Exemple #6
0
    def test_subsample_neighbors_locale(self):
        # Unseeded sampling with a locale column: verify the invariants of
        # every draw and tally how often each candidate ID is selected.
        tallies = {'c2': 0, 'c3': 0, 'c4': 0, 'c5': 0}

        exp_metadata = self.context_md1.to_dataframe()
        exp_metadata.index.name = 'id'
        exp_metadata = qiime2.Metadata(exp_metadata)

        for _ in range(self._N_TEST_ITERATIONS):
            sel = subsample_neighbors(self.focal_seqs1,
                                      self.context_seqs1,
                                      percent_id=0.98,
                                      samples_per_cluster=2,
                                      locale=self.context_md1.get_column('x'))

            # build the set of selected IDs once per draw; only membership
            # checks are needed below
            selected_ids = set(sel.inclusion[sel.inclusion].keys())

            # assertIn reports the actual selection when the check fails
            self.assertIn('c1', selected_ids)
            self.assertEqual(sel.inclusion.sum(), 3)
            self.assertEqual(len(sel.inclusion), 6)

            self.assertEqual(sel.metadata, exp_metadata)
            self.assertEqual(sel.label, 'subsample_neighbors')

            for seq_id in tallies:
                if seq_id in selected_ids:
                    tallies[seq_id] += 1

        # c2, c3, and c5 all have locale "def" while c4 has locale "hijk",
        # so we expect to see c4 more frequently than any of the other three.
        # NOTE(review): these comparisons are probabilistic; presumably
        # _N_TEST_ITERATIONS is large enough to make a spurious failure
        # negligible -- confirm.
        # assertGreater reports both counts when the expectation is violated.
        self.assertGreater(tallies['c4'], tallies['c2'])
        self.assertGreater(tallies['c4'], tallies['c3'])
        self.assertGreater(tallies['c4'], tallies['c5'])
Exemple #7
0
 def test_subsample_neighbors_invalid_max_accepts(self):
     # samples_per_cluster=11 is expected to be rejected with a ValueError
     # whose message matches 'obtained per cluster'.
     expected_error = 'obtained per cluster'
     rejected_samples_per_cluster = 11

     with self.assertRaisesRegex(ValueError, expected_error):
         subsample_neighbors(
             self.focal_seqs1,
             self.context_seqs1,
             percent_id=0.98,
             samples_per_cluster=rejected_samples_per_cluster)