Ejemplo n.º 1
0
    def test_sample_cluster_select_all(self):
        # Requesting as many samples as there are rows (3), or more (4), is
        # expected to yield every context id in the cluster.
        rows = [['c4', 5, 'cd'], ['c2', 0, 'ab'], ['c99', 1, 'gh']]
        cluster = pd.DataFrame(
            rows, columns=['context_id', 'n_mismatches', 'locale'])
        expected_ids = {'c4', 'c2', 'c99'}

        for n_requested in (3, 4):
            sampled = _sample_cluster(
                cluster, n_requested, np.random.RandomState())
            self.assertEqual(set(sampled), expected_ids)
Ejemplo n.º 2
0
    def test_sample_cluster_missing_locales(self):
        # Rows whose locale is missing (NaN) are expected to be sampled more
        # often than rows that share the known locale "abc".
        columns = ['context_id', 'n_mismatches', 'locale']
        cluster = pd.DataFrame([['c4', 5, 'abc'],
                                ['c2', 0, float('nan')],
                                ['c99', 1, float('nan')],
                                ['c42', 2, 'abc']],
                               columns=columns)

        # Number of iterations in which each context id was sampled.
        counts = dict.fromkeys(('c4', 'c2', 'c99', 'c42'), 0)

        # A fresh, unseeded RandomState per iteration keeps the draws
        # independent so the frequency comparison below is meaningful.
        for _ in range(self._N_TEST_ITERATIONS):
            obs = _sample_cluster(cluster, 3, np.random.RandomState())
            self.assertEqual(len(obs), 3)
            for context_id in counts:
                if context_id in obs:
                    counts[context_id] += 1

        # c4 and c42 both have locale "abc" while c99 and c2 have an unknown
        # locale, so we expect to see c99 and c2 more frequently.
        # assertGreater reports both counts on failure, unlike assertTrue.
        self.assertGreater(counts['c99'], counts['c4'])
        self.assertGreater(counts['c99'], counts['c42'])
        self.assertGreater(counts['c2'], counts['c4'])
        self.assertGreater(counts['c2'], counts['c42'])
Ejemplo n.º 3
0
    def test_sample_cluster_multiple_locales_2_samples_per_cluster(self):
        # The row with a unique locale is expected to be sampled more often
        # than any of the rows that share a locale.
        columns = ['context_id', 'n_mismatches', 'locale']
        cluster = pd.DataFrame([['c4', 5, 'abc'],
                                ['c2', 0, 'abc'],
                                ['c99', 1, 'def'],
                                ['c42', 2, 'abc']],
                               columns=columns)

        # Number of iterations in which each context id was sampled.
        counts = dict.fromkeys(('c4', 'c2', 'c99', 'c42'), 0)

        # A fresh, unseeded RandomState per iteration keeps the draws
        # independent so the frequency comparison below is meaningful.
        for _ in range(self._N_TEST_ITERATIONS):
            obs = _sample_cluster(cluster, 2, np.random.RandomState())
            self.assertEqual(len(obs), 2)
            for context_id in counts:
                if context_id in obs:
                    counts[context_id] += 1

        # c4, c2, and c42 all have locale "abc" and c99 has locale "def",
        # so we expect to see c99 more frequently than any of the other three.
        # assertGreater reports both counts on failure, unlike assertTrue.
        self.assertGreater(counts['c99'], counts['c4'])
        self.assertGreater(counts['c99'], counts['c2'])
        self.assertGreater(counts['c99'], counts['c42'])
Ejemplo n.º 4
0
    def test_sample_cluster_single_locale_2_samples_per_cluster(self):
        # All four rows share locale "abc"; drawing two samples is expected
        # to return exactly c2 and c4.
        cluster = pd.DataFrame(
            [['c4', 5, 'abc'],
             ['c2', 0, 'abc'],
             ['c99', 1, 'abc'],
             ['c42', 2, 'abc']],
            columns=['context_id', 'n_mismatches', 'locale'])
        rng = np.random.RandomState()

        sampled = _sample_cluster(cluster, 2, rng)

        self.assertEqual(set(sampled), {'c2', 'c4'})
Ejemplo n.º 5
0
    def test_sample_cluster_no_locale_1_sample_per_cluster(self):
        # No row has a locale (all None); drawing a single sample is expected
        # to return exactly c2.
        cluster = pd.DataFrame(
            [['c4', 5, None],
             ['c2', 0, None],
             ['c99', 1, None],
             ['c42', 2, None]],
            columns=['context_id', 'n_mismatches', 'locale'])
        rng = np.random.RandomState()

        sampled = _sample_cluster(cluster, 1, rng)

        self.assertEqual(set(sampled), {'c2'})
Ejemplo n.º 6
0
 def test_sample_cluster_empty(self):
     # Sampling from an empty frame is expected to return an empty list.
     empty_cluster = pd.DataFrame([])
     rng = np.random.RandomState()
     self.assertEqual(_sample_cluster(empty_cluster, 1, rng), [])