コード例 #1
0
ファイル: test_sampler.py プロジェクト: Eric2370/datumaro-1
    def test_sampler_unaccumulated_sampling(self):
        config = {
            "label1": 10,
            "label2": 10,
            "label3": 10,
        }

        source = self._generate_classification_dataset(config)

        num_pre_train_subset = len(source.get_subset("train"))
        num_pre_val_subset = len(source.get_subset("val"))
        num_pre_test_subset = len(source.get_subset("test"))

        with self.subTest("Same Subset, Same number of datas 3times"):
            num_sample = 3

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample1",
                unsampled_subset="train",
                sampling_method="topk",
                count=num_sample,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), num_sample)
            self.assertEqual(len(result.get_subset("train")),
                             num_pre_train_subset - num_sample)

            result = Sampler(
                result,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample2",
                unsampled_subset="train",
                sampling_method="topk",
                count=num_sample,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), num_sample)
            self.assertEqual(len(result.get_subset("sample2")), num_sample)
            self.assertEqual(len(result.get_subset("train")),
                             num_pre_train_subset - num_sample * 2)

            result = Sampler(
                result,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample3",
                unsampled_subset="train",
                sampling_method="topk",
                count=num_sample,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), num_sample)
            self.assertEqual(len(result.get_subset("sample2")), num_sample)
            self.assertEqual(len(result.get_subset("sample3")), num_sample)
            self.assertEqual(len(result.get_subset("train")),
                             num_pre_train_subset - num_sample * 3)

        with self.subTest("Same Subset, 2, 3, 4 sampling"):
            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample1",
                unsampled_subset="train",
                sampling_method="topk",
                count=2,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), 2)
            self.assertEqual(len(result.get_subset("train")),
                             num_pre_train_subset - 2)

            result = Sampler(
                result,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample2",
                unsampled_subset="train",
                sampling_method="topk",
                count=3,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), 2)
            self.assertEqual(len(result.get_subset("sample2")), 3)
            self.assertEqual(len(result.get_subset("train")),
                             num_pre_train_subset - 5)

            result = Sampler(
                result,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample3",
                unsampled_subset="train",
                sampling_method="topk",
                count=4,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), 2)
            self.assertEqual(len(result.get_subset("sample2")), 3)
            self.assertEqual(len(result.get_subset("sample3")), 4)
            self.assertEqual(len(result.get_subset("train")),
                             num_pre_train_subset - 9)

        with self.subTest("Different Subset, Same number of datas 3times"):
            num_sample = 3

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample1",
                unsampled_subset="train",
                sampling_method="topk",
                count=num_sample,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), num_sample)
            self.assertEqual(len(result.get_subset("train")),
                             num_pre_train_subset - num_sample)

            result = Sampler(
                result,
                algorithm="entropy",
                input_subset="val",
                sampled_subset="sample2",
                unsampled_subset="val",
                sampling_method="topk",
                count=num_sample,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), num_sample)
            self.assertEqual(len(result.get_subset("sample2")), num_sample)
            self.assertEqual(len(result.get_subset("val")),
                             num_pre_val_subset - num_sample)

            result = Sampler(
                result,
                algorithm="entropy",
                input_subset="test",
                sampled_subset="sample3",
                unsampled_subset="test",
                sampling_method="topk",
                count=num_sample,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), num_sample)
            self.assertEqual(len(result.get_subset("sample2")), num_sample)
            self.assertEqual(len(result.get_subset("sample3")), num_sample)
            self.assertEqual(len(result.get_subset("test")),
                             num_pre_test_subset - num_sample)

        with self.subTest("Different Subset, 2, 3, 4 sampling"):
            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample1",
                unsampled_subset="train",
                sampling_method="topk",
                count=2,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), 2)
            self.assertEqual(len(result.get_subset("train")),
                             num_pre_train_subset - 2)

            result = Sampler(
                result,
                algorithm="entropy",
                input_subset="val",
                sampled_subset="sample2",
                unsampled_subset="val",
                sampling_method="topk",
                count=3,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), 2)
            self.assertEqual(len(result.get_subset("sample2")), 3)
            self.assertEqual(len(result.get_subset("val")),
                             num_pre_val_subset - 3)

            result = Sampler(
                result,
                algorithm="entropy",
                input_subset="test",
                sampled_subset="sample3",
                unsampled_subset="test",
                sampling_method="topk",
                count=4,
                output_file=None,
            )

            self.assertEqual(len(result.get_subset("sample1")), 2)
            self.assertEqual(len(result.get_subset("sample2")), 3)
            self.assertEqual(len(result.get_subset("sample3")), 4)
            self.assertEqual(len(result.get_subset("test")),
                             num_pre_test_subset - 4)
コード例 #2
0
ファイル: test_sampler.py プロジェクト: Eric2370/datumaro-1
    def test_sampler_get_sample_classification(self):
        config = {
            "label1": 10,
            "label2": 10,
            "label3": 10,
        }

        source = self._generate_classification_dataset(config, ["train"])
        num_pre_train_subset = len(source.get_subset("train"))

        num_sample = 5

        with self.subTest("Top-K method"):
            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method="topk",
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_sample, len(result.get_subset("sample")))
            self.assertEqual(
                len(result.get_subset("unsampled")),
                num_pre_train_subset - len(result.get_subset("sample")),
            )
            topk_expected_result = [1, 4, 9, 10, 26]
            topk_result = list(map(int, result.result["ImageID"].to_list()))
            self.assertEqual(sorted(topk_result), topk_expected_result)

        with self.subTest("Low-K method"):
            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method="lowk",
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_sample, len(result.get_subset("sample")))
            self.assertEqual(
                len(result.get_subset("unsampled")),
                num_pre_train_subset - len(result.get_subset("sample")),
            )
            lowk_expected_result = [2, 6, 14, 21, 23]
            lowk_result = list(map(int, result.result["ImageID"].to_list()))
            self.assertEqual(sorted(lowk_result), lowk_expected_result)

        with self.subTest("Rand-K method"):
            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method="randk",
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_sample, len(result.get_subset("sample")))
            self.assertEqual(
                len(result.get_subset("unsampled")),
                num_pre_train_subset - len(result.get_subset("sample")),
            )

        with self.subTest("Mix-K method"):
            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method="mixk",
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_sample, len(result.get_subset("sample")))
            self.assertEqual(
                len(result.get_subset("unsampled")),
                num_pre_train_subset - len(result.get_subset("sample")),
            )
            mixk_expected_result = [2, 4, 10, 23, 26]
            mixk_result = list(map(int, result.result["ImageID"].to_list()))
            self.assertEqual(sorted(mixk_result), mixk_expected_result)

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method="mixk",
                count=6,
                output_file=None,
            )
            self.assertEqual(6, len(result.get_subset("sample")))
            self.assertEqual(
                len(result.get_subset("unsampled")),
                num_pre_train_subset - len(result.get_subset("sample")),
            )
            mixk_expected_result = [2, 4, 6, 10, 23, 26]
            mixk_result = list(map(int, result.result["ImageID"].to_list()))
            self.assertEqual(sorted(mixk_result), mixk_expected_result)

        with self.subTest("Randtop-K method"):
            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method="randtopk",
                count=num_sample,
                output_file=None,
            )

            self.assertEqual(num_sample, len(result.get_subset("sample")))
            self.assertEqual(
                len(result.get_subset("unsampled")),
                num_pre_train_subset - len(result.get_subset("sample")),
            )
コード例 #3
0
ファイル: test_sampler.py プロジェクト: Eric2370/datumaro-1
    def test_sampler_number_of_samples(self):
        config = {
            "label1": 10,
            "label2": 10,
            "label3": 10,
        }

        source = self._generate_classification_dataset(config)
        num_pre_train_subset = len(source.get_subset("train"))

        with self.subTest("k > num of data with top-k"):
            num_sample = 500
            sampling_method = "topk"

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=sampling_method,
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_pre_train_subset,
                             len(result.get_subset("sample")))

        with self.subTest("k > num of data with low-k"):
            num_sample = 500
            sampling_method = "lowk"

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=sampling_method,
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_pre_train_subset,
                             len(result.get_subset("sample")))

        with self.subTest("k > num of data with rand-k"):
            num_sample = 500
            sampling_method = "randk"

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=sampling_method,
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_pre_train_subset,
                             len(result.get_subset("sample")))

        with self.subTest("k > num of data with mix-k"):
            num_sample = 500
            sampling_method = "mixk"

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=sampling_method,
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_pre_train_subset,
                             len(result.get_subset("sample")))

        with self.subTest("k > num of data with randtop-k"):
            num_sample = 500
            sampling_method = "randtopk"

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=sampling_method,
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_pre_train_subset,
                             len(result.get_subset("sample")))

        with self.subTest("k == num of data with top-k"):
            num_sample = 10
            sampling_method = "topk"

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=sampling_method,
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_pre_train_subset,
                             len(result.get_subset("sample")))

        with self.subTest("k == num of data with low-k"):
            num_sample = 10
            sampling_method = "lowk"

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=sampling_method,
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_pre_train_subset,
                             len(result.get_subset("sample")))

        with self.subTest("k == num of data with rand-k"):
            num_sample = 10
            sampling_method = "randk"

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=sampling_method,
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_pre_train_subset,
                             len(result.get_subset("sample")))

        with self.subTest("k == num of data with mix-k"):
            num_sample = 10
            sampling_method = "mixk"

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=sampling_method,
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_pre_train_subset,
                             len(result.get_subset("sample")))

        with self.subTest("k == num of data with randtop-k"):
            num_sample = 10
            sampling_method = "randtopk"

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=sampling_method,
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(num_pre_train_subset,
                             len(result.get_subset("sample")))

            num_sample = 9

            result = Sampler(
                source,
                algorithm="entropy",
                input_subset="train",
                sampled_subset="sample",
                unsampled_subset="unsampled",
                sampling_method=sampling_method,
                count=num_sample,
                output_file=None,
            )
            self.assertEqual(len(result.get_subset("sample")), 9)
コード例 #4
0
ファイル: test_sampler.py プロジェクト: Eric2370/datumaro-1
    def test_sampler_get_invalid_data(self):
        with self.subTest("empty dataset"):
            config = {
                "label1": 0,
                "label2": 0,
                "label3": 0,
            }

            source = self._generate_classification_dataset(config)
            with self.assertRaisesRegex(Exception, "Unknown subset"):
                result = Sampler(
                    source,
                    algorithm="entropy",
                    input_subset="train",
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method="topk",
                    count=5,
                    output_file=None,
                )
                result = iter(result)
                next(result)

        with self.subTest("Dataset without Scores (Probability)"):
            config = {
                "label1": 10,
                "label2": 10,
                "label3": 10,
            }

            source = self._generate_classification_dataset(config,
                                                           empty_scores=True)
            with self.assertRaisesRegex(Exception, "ClassProbability"):
                result = Sampler(
                    source,
                    algorithm="entropy",
                    input_subset="train",
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method="topk",
                    count=5,
                    output_file=None,
                )
                result = iter(result)
                next(result)

        with self.subTest(
                "Out of range, probability (Less than 0 or more than 1)"):
            config = {
                "label1": 10,
                "label2": 10,
                "label3": 10,
            }

            source = self._generate_classification_dataset(config,
                                                           empty_scores=False,
                                                           out_range=True)
            with self.assertRaisesRegex(Exception, "Invalid data"):
                result = Sampler(
                    source,
                    algorithm="entropy",
                    input_subset="train",
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method="topk",
                    count=5,
                    output_file=None,
                )
                result = iter(result)
                next(result)

        with self.subTest("No Scores Attribute Data"):
            config = {
                "label1": 10,
                "label2": 10,
                "label3": 10,
            }

            source = self._generate_classification_dataset(config,
                                                           no_attr=True)
            with self.assertRaisesRegex(Exception, "does not have 'scores'"):
                result = Sampler(
                    source,
                    algorithm="entropy",
                    input_subset="train",
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method="topk",
                    count=5,
                    output_file=None,
                )
                result = iter(result)
                next(result)

        with self.subTest("No Image Data"):
            config = {
                "label1": 10,
                "label2": 10,
                "label3": 10,
            }

            source = self._generate_classification_dataset(config, no_img=True)
            with self.assertRaisesRegex(Exception, "does not have image info"):
                result = Sampler(
                    source,
                    algorithm="entropy",
                    input_subset="train",
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method="topk",
                    count=5,
                    output_file=None,
                )
                result = iter(result)
                next(result)
コード例 #5
0
ファイル: test_sampler.py プロジェクト: Eric2370/datumaro-1
    def test_sampler_gives_error(self):
        config = {
            "label1": 10,
            "label2": 10,
            "label3": 10,
        }
        num_sample = 5

        source = self._generate_classification_dataset(config)

        with self.subTest("Not found"):
            with self.assertRaisesRegex(Exception, "Unknown subset"):
                subset = "hello"
                result = Sampler(
                    source,
                    algorithm="entropy",
                    input_subset=subset,
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method="topk",
                    count=num_sample,
                    output_file=None,
                )
                result = iter(result)
                next(result)

            with self.assertRaisesRegex(Exception, "Unknown algorithm"):
                algorithm = "hello"
                result = Sampler(
                    source,
                    algorithm=algorithm,
                    input_subset="train",
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method="topk",
                    count=num_sample,
                    output_file=None,
                )
                result = iter(result)
                next(result)

            with self.assertRaisesRegex(Exception, "Unknown sampling method"):
                sampling_method = "hello"
                result = Sampler(
                    source,
                    algorithm="entropy",
                    input_subset="train",
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method=sampling_method,
                    count=num_sample,
                    output_file=None,
                )
                result = iter(result)
                next(result)

        with self.subTest("Invalid Value"):
            with self.assertRaisesRegex(Exception, "Invalid value"):
                k = 0
                result = Sampler(
                    source,
                    algorithm="entropy",
                    input_subset="train",
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method="topk",
                    count=k,
                    output_file=None,
                )
                result = iter(result)
                next(result)

            with self.assertRaisesRegex(Exception, "Invalid value"):
                k = -1
                result = Sampler(
                    source,
                    algorithm="entropy",
                    input_subset="train",
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method="topk",
                    count=k,
                    output_file=None,
                )
                result = iter(result)
                next(result)

            with self.assertRaisesRegex(Exception, "Invalid value"):
                k = "string"
                result = Sampler(
                    source,
                    algorithm="entropy",
                    input_subset="train",
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method="topk",
                    count=k,
                    output_file=None,
                )
                result = iter(result)
                next(result)

            with self.assertRaisesRegex(Exception, "extension"):
                output_file = "string.xml"
                result = Sampler(
                    source,
                    algorithm="entropy",
                    input_subset="train",
                    sampled_subset="sample",
                    unsampled_subset="unsampled",
                    sampling_method="topk",
                    count=num_sample,
                    output_file=output_file,
                )
                result = iter(result)
                next(result)

            with self.assertRaisesRegex(
                    Exception, "Invalid Data, ImageID not found in data"):
                sub = source.get_subset("train")

                data_df = defaultdict(list)
                infer_df = defaultdict(list)

                for data in sub:
                    width, height = data.image.size
                    data_df["Width"].append(width)
                    data_df["Height"].append(height)
                    data_df["ImagePath"].append(data.image.path)

                    for annotation in data.annotations:
                        probs = annotation.attributes["scores"]
                        infer_df["ImageID"].append(data.id)

                        for prob_idx, prob in enumerate(probs):
                            infer_df[f"ClassProbability{prob_idx+1}"].append(
                                prob)

                data_df = pd.DataFrame(data_df)
                infer_df = pd.DataFrame(infer_df)

                entropy(data_df, infer_df)

            with self.assertRaisesRegex(
                    Exception, "Invalid Data, ImageID not found in inference"):
                sub = source.get_subset("train")

                data_df = defaultdict(list)
                infer_df = defaultdict(list)

                for data in sub:
                    width, height = data.image.size
                    data_df["ImageID"].append(data.id)
                    data_df["Width"].append(width)
                    data_df["Height"].append(height)
                    data_df["ImagePath"].append(data.image.path)

                    for annotation in data.annotations:
                        probs = annotation.attributes["scores"]

                        for prob_idx, prob in enumerate(probs):
                            infer_df[f"ClassProbability{prob_idx+1}"].append(
                                prob)

                data_df = pd.DataFrame(data_df)
                infer_df = pd.DataFrame(infer_df)

                entropy(data_df, infer_df)
コード例 #6
0
ファイル: test_sampler.py プロジェクト: Eric2370/datumaro-1
    def test_sampler_parser(self):
        from argparse import ArgumentParser

        assert isinstance(Sampler.build_cmdline_parser(), ArgumentParser)