def test_sampler_number_of_samples(self):
    """Check the size of the sampled subset for several requested counts.

    For each sampling method the sampler is run on the "train" subset with
    a count of 500 and a count of 10; in both cases the "sample" subset is
    asserted to contain as many items as the original "train" subset.
    Finally a count of 9 is requested and exactly 9 items are expected.
    """
    config = {
        "label1": 10,
        "label2": 10,
        "label3": 10,
    }

    source = self._generate_classification_dataset(config)
    num_pre_train_subset = len(source.get_subset("train"))

    def sampled_size(sampling_method, count):
        """Run the sampler on "train" and return the size of "sample"."""
        result = RelevancySampler(
            source,
            algorithm="entropy",
            input_subset="train",
            sampled_subset="sample",
            unsampled_subset="unsampled",
            sampling_method=sampling_method,
            count=count,
            output_file=None,
        )
        return len(result.get_subset("sample"))

    # (value passed as sampling_method, spelling used in the subtest label)
    methods = (
        ("topk", "top-k"),
        ("lowk", "low-k"),
        ("randk", "rand-k"),
        ("mixk", "mix-k"),
        ("randtopk", "randtop-k"),
    )
    # NOTE(review): count=10 is labelled "k == num of data", which presumes
    # the generated "train" subset holds 10 items - confirm against
    # _generate_classification_dataset.
    cases = (
        ("k > num of data", 500),
        ("k == num of data", 10),
    )

    for relation, num_sample in cases:
        for sampling_method, method_label in methods:
            with self.subTest("{} with {}".format(relation, method_label)):
                self.assertEqual(
                    num_pre_train_subset,
                    sampled_size(sampling_method, num_sample),
                )

    # The original check reused whatever sampling_method the previous
    # subtest had left behind ("randtopk"); the method is now explicit so
    # the check no longer depends on the order of the cases above.
    with self.subTest("k < num of data with randtop-k"):
        self.assertEqual(sampled_size("randtopk", 9), 9)
def test_sampler_unaccumulated_sampling(self):
    """Chain three sampler runs per scenario and check sizes after each run.

    Every run passes the same name as ``input_subset`` and
    ``unsampled_subset`` and a fresh ``sampleN`` name as ``sampled_subset``.
    After each run the test asserts that every ``sampleN`` subset created
    so far has the requested size, and that the input subset of that run
    is smaller than it was in the source dataset by the total count
    requested from it so far.
    """
    config = {
        "label1": 10,
        "label2": 10,
        "label3": 10,
    }

    source = self._generate_classification_dataset(config)
    # Sizes before any sampling, keyed by subset name.
    initial_sizes = {
        subset_name: len(source.get_subset(subset_name))
        for subset_name in ("train", "val", "test")
    }
    sample_names = ("sample1", "sample2", "sample3")

    def run_chain(steps):
        """Apply one sampler run per (input subset, count) pair, in order."""
        dataset = source
        expected_samples = []  # (sampled subset name, requested count)
        drawn = {}  # total count requested so far, per input subset
        for sample_name, (subset_name, count) in zip(sample_names, steps):
            dataset = RelevancySampler(
                dataset,
                algorithm="entropy",
                input_subset=subset_name,
                sampled_subset=sample_name,
                unsampled_subset=subset_name,
                sampling_method="topk",
                count=count,
                output_file=None,
            )
            expected_samples.append((sample_name, count))
            drawn[subset_name] = drawn.get(subset_name, 0) + count

            # Earlier samples must be untouched by later runs.
            for name, size in expected_samples:
                self.assertEqual(len(dataset.get_subset(name)), size)
            self.assertEqual(
                len(dataset.get_subset(subset_name)),
                initial_sizes[subset_name] - drawn[subset_name],
            )

    scenarios = (
        (
            "Same Subset, Same number of datas 3times",
            (("train", 3), ("train", 3), ("train", 3)),
        ),
        (
            "Same Subset, 2, 3, 4 sampling",
            (("train", 2), ("train", 3), ("train", 4)),
        ),
        (
            "Different Subset, Same number of datas 3times",
            (("train", 3), ("val", 3), ("test", 3)),
        ),
        (
            "Different Subset, 2, 3, 4 sampling",
            (("train", 2), ("val", 3), ("test", 4)),
        ),
    )

    for title, steps in scenarios:
        with self.subTest(title):
            run_chain(steps)
def test_sampler_get_sample_classification(self):
    """Check subset sizes and, where pinned, the selected image IDs.

    For every sampling method the "sample" subset must hold the requested
    number of items and "unsampled" must hold the rest of "train". For the
    "topk", "lowk" and "mixk" methods the sorted ``ImageID`` values in
    ``result.result`` are also compared against fixed expected lists; no
    IDs are checked for "randk" and "randtopk".
    """
    config = {
        "label1": 10,
        "label2": 10,
        "label3": 10,
    }

    source = self._generate_classification_dataset(config, ["train"])
    num_pre_train_subset = len(source.get_subset("train"))
    num_sample = 5

    def sample_and_check(sampling_method, count):
        """Run the sampler, assert both subset sizes, return the result."""
        result = RelevancySampler(
            source,
            algorithm="entropy",
            input_subset="train",
            sampled_subset="sample",
            unsampled_subset="unsampled",
            sampling_method=sampling_method,
            count=count,
            output_file=None,
        )
        self.assertEqual(count, len(result.get_subset("sample")))
        self.assertEqual(
            len(result.get_subset("unsampled")),
            num_pre_train_subset - len(result.get_subset("sample")),
        )
        return result

    def picked_ids(result):
        """Return the ``ImageID`` column of the result as sorted ints."""
        return sorted(int(image_id) for image_id in result.result["ImageID"].to_list())

    with self.subTest("Top-K method"):
        topk = sample_and_check("topk", num_sample)
        self.assertEqual(picked_ids(topk), [1, 4, 9, 10, 26])

    with self.subTest("Low-K method"):
        lowk = sample_and_check("lowk", num_sample)
        self.assertEqual(picked_ids(lowk), [2, 6, 14, 21, 23])

    with self.subTest("Rand-K method"):
        sample_and_check("randk", num_sample)

    with self.subTest("Mix-K method"):
        mixk = sample_and_check("mixk", num_sample)
        self.assertEqual(picked_ids(mixk), [2, 4, 10, 23, 26])

        # Same method again with a count of 6.
        mixk = sample_and_check("mixk", 6)
        self.assertEqual(picked_ids(mixk), [2, 4, 6, 10, 23, 26])

    with self.subTest("Randtop-K method"):
        sample_and_check("randtopk", num_sample)