def test_sample_cluster_select_all(self):
    # Requesting as many samples as the cluster has rows (3), or more
    # than it has (4), should hand back every context_id. Results are
    # compared as sets, so ordering is not checked.
    rows = [['c4', 5, 'cd'],
            ['c2', 0, 'ab'],
            ['c99', 1, 'gh']]
    cluster = pd.DataFrame(
        rows, columns=['context_id', 'n_mismatches', 'locale'])
    expected_ids = {'c4', 'c2', 'c99'}

    # First n == cluster size, then n > cluster size.
    for n_requested in (3, 4):
        sampled = _sample_cluster(
            cluster, n_requested, np.random.RandomState())
        self.assertEqual(set(sampled), expected_ids)
def test_sample_cluster_missing_locales(self):
    # Statistical test: draw 3 of 4 rows many times, each time with a
    # fresh unseeded RandomState, and compare how often each context_id
    # is selected.
    columns = ['context_id', 'n_mismatches', 'locale']
    cluster = pd.DataFrame([['c4', 5, 'abc'],
                            ['c2', 0, float('nan')],
                            ['c99', 1, float('nan')],
                            ['c42', 2, 'abc']],
                           columns=columns)

    # Number of iterations in which each context_id was sampled.
    counts = dict.fromkeys(['c4', 'c2', 'c99', 'c42'], 0)

    for _ in range(self._N_TEST_ITERATIONS):
        obs = _sample_cluster(cluster, 3, np.random.RandomState())
        self.assertEqual(len(obs), 3)
        for context_id in counts:
            if context_id in obs:
                counts[context_id] += 1

    # c4 and c42 both have locale "abc" while c99 and c2 have an unknown
    # (NaN) locale, so we expect to see c99 and c2 more frequently.
    # assertGreater reports both counts when the expectation fails.
    for unknown_locale_id in ('c99', 'c2'):
        for shared_locale_id in ('c4', 'c42'):
            self.assertGreater(counts[unknown_locale_id],
                               counts[shared_locale_id])
def test_sample_cluster_multiple_locales_2_samples_per_cluster(self):
    # Statistical test: draw 2 of 4 rows many times, each time with a
    # fresh unseeded RandomState, and compare how often each context_id
    # is selected.
    columns = ['context_id', 'n_mismatches', 'locale']
    cluster = pd.DataFrame([['c4', 5, 'abc'],
                            ['c2', 0, 'abc'],
                            ['c99', 1, 'def'],
                            ['c42', 2, 'abc']],
                           columns=columns)

    # Number of iterations in which each context_id was sampled.
    counts = dict.fromkeys(['c4', 'c2', 'c99', 'c42'], 0)

    for _ in range(self._N_TEST_ITERATIONS):
        obs = _sample_cluster(cluster, 2, np.random.RandomState())
        self.assertEqual(len(obs), 2)
        for context_id in counts:
            if context_id in obs:
                counts[context_id] += 1

    # c4, c2, and c42 all have locale "abc" and c99 is the only row with
    # locale "def", so we expect to see c99 more frequently than any of
    # the other three. assertGreater reports both counts on failure.
    for shared_locale_id in ('c4', 'c2', 'c42'):
        self.assertGreater(counts['c99'], counts[shared_locale_id])
def test_sample_cluster_single_locale_2_samples_per_cluster(self):
    # All four rows share locale "abc"; asking for 2 samples is expected
    # to yield exactly c2 and c4 (compared as sets, order ignored).
    # NOTE(review): c2 has the lowest n_mismatches (0) and c4 the
    # highest (5) -- presumably that drives the selection; confirm
    # against _sample_cluster.
    rows = [['c4', 5, 'abc'],
            ['c2', 0, 'abc'],
            ['c99', 1, 'abc'],
            ['c42', 2, 'abc']]
    cluster = pd.DataFrame(
        rows, columns=['context_id', 'n_mismatches', 'locale'])

    sampled = _sample_cluster(cluster, 2, np.random.RandomState())

    self.assertEqual(set(sampled), {'c2', 'c4'})
def test_sample_cluster_no_locale_1_sample_per_cluster(self):
    # Every row has locale None; asking for a single sample is expected
    # to yield only c2, the row with n_mismatches == 0.
    rows = [['c4', 5, None],
            ['c2', 0, None],
            ['c99', 1, None],
            ['c42', 2, None]]
    cluster = pd.DataFrame(
        rows, columns=['context_id', 'n_mismatches', 'locale'])

    sampled = _sample_cluster(cluster, 1, np.random.RandomState())

    self.assertEqual(set(sampled), {'c2'})
def test_sample_cluster_empty(self):
    # A cluster with no rows (and no columns) should produce an empty
    # list, even when one sample is requested.
    empty_cluster = pd.DataFrame([])

    sampled = _sample_cluster(empty_cluster, 1, np.random.RandomState())

    self.assertEqual(sampled, [])