コード例 #1
0
    def test_sampler_number_of_samples(self):
        """Check the size of the "sample" subset for boundary values of ``count``.

        Every sampling method is run against the "train" subset twice: once
        with ``count=500`` (subtests labelled "k > num of data") and once with
        ``count=10`` (subtests labelled "k == num of data"). In both cases the
        sampled subset is expected to be as large as the original "train"
        subset. The last "k == num of data" subtest (randtopk) additionally
        requests 9 items and expects exactly 9 back.
        """
        label_counts = {
            "label1": 10,
            "label2": 10,
            "label3": 10,
        }
        source = self._generate_classification_dataset(label_counts)
        train_size = len(source.get_subset("train"))

        def sampled_size(method, count):
            # Build a fresh sampler over "train" and report how many items
            # end up in the "sample" subset.
            sampler = RelevancySampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=method,
                count=count,
                output_file=None,
            )
            return len(sampler.get_subset("sample"))

        # (human-readable name used in the subtest message, sampling_method)
        methods = (
            ("top-k", "topk"),
            ("low-k", "lowk"),
            ("rand-k", "randk"),
            ("mix-k", "mixk"),
            ("randtop-k", "randtopk"),
        )
        # (subtest message prefix, requested count)
        requests = (
            ("k > num of data", 500),
            ("k == num of data", 10),
        )

        for relation, count in requests:
            for label, method in methods:
                with self.subTest("%s with %s" % (relation, label)):
                    self.assertEqual(train_size, sampled_size(method, count))

                    # Only the randtop-k "k ==" case also probes a count of 9;
                    # this mirrors the original layout of the test.
                    if count == 10 and method == "randtopk":
                        self.assertEqual(sampled_size(method, 9), 9)
コード例 #2
0
    def test_sampler_unaccumulated_sampling(self):
        """Chain three top-k samplers and check subset sizes after each step.

        Each step wraps the previous result in a new ``RelevancySampler`` that
        reads from one subset, writes the picked items to "sample1" /
        "sample2" / "sample3", and writes the leftover items back under the
        same name as the input subset. After every step the test checks that
        all sample subsets created so far keep their requested sizes and that
        the input subset shrank by the number of items drawn from it so far.
        """
        label_counts = {
            "label1": 10,
            "label2": 10,
            "label3": 10,
        }
        source = self._generate_classification_dataset(label_counts)

        # Sizes of the untouched subsets, measured once up front.
        initial_size = {
            name: len(source.get_subset(name))
            for name in ("train", "val", "test")
        }

        # (subtest message, ((input subset, count), ...)) - one pair per step.
        scenarios = (
            (
                "Same Subset, Same number of datas 3times",
                (("train", 3), ("train", 3), ("train", 3)),
            ),
            (
                "Same Subset, 2, 3, 4 sampling",
                (("train", 2), ("train", 3), ("train", 4)),
            ),
            (
                "Different Subset, Same number of datas 3times",
                (("train", 3), ("val", 3), ("test", 3)),
            ),
            (
                "Different Subset, 2, 3, 4 sampling",
                (("train", 2), ("val", 3), ("test", 4)),
            ),
        )

        for title, steps in scenarios:
            with self.subTest(title):
                dataset = source
                taken_from = {}  # input subset -> items drawn from it so far
                produced = []  # (sample subset name, requested count)

                for step_no, (subset, count) in enumerate(steps, start=1):
                    target = "sample%d" % step_no
                    dataset = RelevancySampler(
                        dataset,
                        algorithm="entropy",
                        input_subset=subset,
                        sampled_subset=target,
                        unsampled_subset=subset,
                        sampling_method="topk",
                        count=count,
                        output_file=None,
                    )
                    produced.append((target, count))
                    taken_from[subset] = taken_from.get(subset, 0) + count

                    # Earlier samples must survive later sampling steps.
                    for sample_name, expected in produced:
                        self.assertEqual(
                            len(dataset.get_subset(sample_name)), expected)

                    # The subset just sampled from lost exactly what was drawn.
                    self.assertEqual(
                        len(dataset.get_subset(subset)),
                        initial_size[subset] - taken_from[subset],
                    )
コード例 #3
0
    def test_sampler_get_sample_classification(self):
        """Run every sampling method on a train-only dataset and check results.

        For each method the "sample" subset must hold the requested number of
        items and "unsampled" must hold the rest of "train". For topk, lowk
        and mixk the sorted ``ImageID`` values stored in the sampler's
        ``result`` table are also compared against fixed expected ids; randk
        and randtopk are only checked for sizes.
        """
        label_counts = {
            "label1": 10,
            "label2": 10,
            "label3": 10,
        }
        source = self._generate_classification_dataset(label_counts, ["train"])
        train_size = len(source.get_subset("train"))

        # (subtest message, sampling_method, ((count, expected ids or None), ...))
        cases = (
            ("Top-K method", "topk", ((5, [1, 4, 9, 10, 26]),)),
            ("Low-K method", "lowk", ((5, [2, 6, 14, 21, 23]),)),
            ("Rand-K method", "randk", ((5, None),)),
            (
                "Mix-K method",
                "mixk",
                (
                    (5, [2, 4, 10, 23, 26]),
                    (6, [2, 4, 6, 10, 23, 26]),
                ),
            ),
            ("Randtop-K method", "randtopk", ((5, None),)),
        )

        for title, method, runs in cases:
            with self.subTest(title):
                for count, expected_ids in runs:
                    sampler = RelevancySampler(
                        source,
                        algorithm="entropy",
                        input_subset="train",
                        sampled_subset="sample",
                        unsampled_subset="unsampled",
                        sampling_method=method,
                        count=count,
                        output_file=None,
                    )

                    self.assertEqual(count, len(sampler.get_subset("sample")))
                    self.assertEqual(
                        len(sampler.get_subset("unsampled")),
                        train_size - len(sampler.get_subset("sample")),
                    )

                    if expected_ids is None:
                        continue

                    # NOTE(review): `.result` is read only after the subsets
                    # have been fetched, as in the original ordering -
                    # presumably it is filled in while sampling runs; confirm.
                    picked_ids = [
                        int(image_id)
                        for image_id in sampler.result["ImageID"].to_list()
                    ]
                    self.assertEqual(sorted(picked_ids), expected_ids)