Exemple #1
0
    def test_ndr_over_sample(self):
        """Check subset sizes when NDR is run with an ``over_sample`` mode.

        NDR is applied to the 'train' subset with ``num_cut=10`` once with
        ``over_sample='random'`` and once with ``over_sample='similarity'``.
        In both cases the result is expected to hold 10 items in 'train',
        290 in 'duplicated' and 300 each in 'val' and 'test', while the
        source dataset still reports 300 items per subset.
        """
        np.random.seed(1234)
        # Expected source layout: train : 300, val : 300, test : 300
        label_counts = {"label1": 100, "label2": 100, "label3": 100}
        source = self._generate_classification_dataset(label_counts, 10)

        expected_result_sizes = (("train", 10), ("duplicated", 290),
                                 ("val", 300), ("test", 300))
        expected_source_sizes = (("train", 300), ("val", 300), ("test", 300))

        for mode in ('random', 'similarity'):
            result = ndr.NDR(source,
                             working_subset='train',
                             num_cut=10,
                             over_sample=mode,
                             seed=12145)

            for subset_name, size in expected_result_sizes:
                self.assertEqual(size, len(result.get_subset(subset_name)))
            # The source dataset must still report its original sizes
            for subset_name, size in expected_source_sizes:
                self.assertEqual(size, len(source.get_subset(subset_name)))
Exemple #2
0
    def test_ndr_gradient_fails_on_invalid_parameters(self):
        """Check that NDR with ``algorithm='gradient'`` rejects bad arguments.

        Every case passes one invalid parameter combination and expects a
        ValueError matching the paired message pattern, raised either while
        building the transform or while taking ``len()`` of it.
        """
        source = self._generate_dataset({'label1': 5}, 10)

        # (expected error pattern, offending keyword arguments)
        invalid_cases = (
            ("Invalid block_shape",
             {'block_shape': (3, 6, 6)}),
            ("block_shape should be positive",
             {'block_shape': (-1, 0)}),
            ("sim_threshold should be large than 0",
             {'sim_threshold': 0, 'block_shape': (8, 8)}),
            ("hash_dim should be smaller than feature shape",
             {'hash_dim': 1024, 'block_shape': (8, 8)}),
            ("hash_dim should be positive",
             {'hash_dim': -5, 'block_shape': (8, 8)}),
        )

        for pattern, bad_kwargs in invalid_cases:
            with self.assertRaisesRegex(ValueError, pattern):
                result = ndr.NDR(source,
                                 working_subset='train',
                                 over_sample='random',
                                 algorithm='gradient',
                                 **bad_kwargs)
                len(result)
Exemple #3
0
    def test_ndr_seed(self):
        """Check that the ``seed`` argument determines the NDR outcome.

        Two runs sharing a seed must produce equal 'train' subsets, and a
        run with a different seed must produce a different 'train' subset.
        """
        np.random.seed(1234)
        # Expected source layout: train : 300, val : 300, test : 300
        label_counts = {"label1": 100, "label2": 100, "label3": 100}
        source = self._generate_classification_dataset(label_counts, 10)

        def train_items(dataset):
            # Materialize the 'train' subset so it can be compared by value
            return tuple(dataset.get_subset("train"))

        same_seed_a = ndr.NDR(source, working_subset="train", seed=12345)
        same_seed_b = ndr.NDR(source, working_subset="train", seed=12345)
        other_seed = ndr.NDR(source, working_subset="train", seed=12)

        self.assertEqual(train_items(same_seed_a), train_items(same_seed_b))
        self.assertNotEqual(train_items(same_seed_a), train_items(other_seed))
        self.assertNotEqual(train_items(same_seed_b), train_items(other_seed))
Exemple #4
0
    def test_ndr_without_cut(self):
        """Check NDR subset sizes when ``num_cut`` is not passed.

        With only ``working_subset`` and ``seed`` given, the result is
        expected to hold 2 items in 'train', 298 in 'duplicated' and 300
        each in 'val' and 'test'; the source dataset must still report 300
        items per subset.
        """
        np.random.seed(1234)
        # Expected source layout: train : 300, val : 300, test : 300
        label_counts = {"label1": 100, "label2": 100, "label3": 100}
        source = self._generate_classification_dataset(label_counts, 10)

        result = ndr.NDR(source, working_subset='train', seed=12145)

        expected_result_sizes = (("train", 2), ("duplicated", 298),
                                 ("val", 300), ("test", 300))
        for subset_name, size in expected_result_sizes:
            self.assertEqual(size, len(result.get_subset(subset_name)))

        # The source dataset must still report its original sizes
        for subset_name in ("train", "val", "test"):
            self.assertEqual(300, len(source.get_subset(subset_name)))
Exemple #5
0
    def test_ndr_with_error(self):
        """Check that NDR raises ValueError for invalid arguments.

        Each case creates a fresh dataset and then expects a ValueError
        matching the paired message pattern, raised either while building
        the NDR transform or while taking ``len()`` of it.

        The dataset is created *outside* the ``assertRaisesRegex`` scope so
        that only errors coming from NDR itself can satisfy the assertion;
        a ValueError raised by the fixture generator now fails the test
        instead of potentially being mistaken for the expected error.
        """
        config = {"label1": 100, "label2": 100, "label3": 100}
        # Expected source layout: train : 300, val : 300, test : 300
        np.random.seed(1234)

        # (expected error pattern,
        #  second positional argument for the dataset generator,
        #  keyword arguments passed to ndr.NDR)
        error_cases = (
            ("Invalid working_subset name", 3,
             {'working_subset': "no_such_subset"}),
            ("working_subset == duplicated_subset", 3,
             {'working_subset': "train", 'duplicated_subset': "train"}),
            ("Invalid algorithm name", 3,
             {'working_subset': "train", 'algorithm': "no_such_algo"}),
            ("The number of images is smaller than the cut you want", 3,
             {'working_subset': 'train', 'num_cut': 10000}),
            ("Invalid over_sample", 10,
             {'working_subset': 'train', 'num_cut': 100, 'seed': 12145,
              'over_sample': "no_such_sampling"}),
            ("Invalid under_sample", 10,
             {'working_subset': 'train', 'num_cut': 1, 'seed': 12145,
              'under_sample': "no_such_sampling"}),
        )

        for pattern, dataset_arg, ndr_kwargs in error_cases:
            # Fixture setup stays outside the assertRaises scope; the
            # generator is still called once per case, in the same order.
            source = self._generate_classification_dataset(config,
                                                           dataset_arg)
            with self.assertRaisesRegex(ValueError, pattern):
                result = ndr.NDR(source, **ndr_kwargs)
                len(result)
Exemple #6
0
    def test_ndr_gradient_can_use_block(self):
        """Check NDR subset sizes when ``block_shape=(8, 8)`` is passed.

        With ``over_sample='random'`` and ``block_shape=(8, 8)`` the result
        is expected to hold 1 item in 'train', 299 in 'duplicated' and 300
        each in 'val' and 'test'; the source dataset must still report 300
        items per subset.
        """
        np.random.seed(1234)
        # Expected source layout: train : 300, val : 300, test : 300
        label_counts = {"label1": 100, "label2": 100, "label3": 100}
        source = self._generate_dataset(label_counts, 10)

        result = ndr.NDR(source, working_subset='train',
                         over_sample='random', block_shape=(8, 8),
                         seed=12145)

        expected_result_sizes = (("train", 1), ("duplicated", 299),
                                 ("val", 300), ("test", 300))
        for subset_name, size in expected_result_sizes:
            self.assertEqual(size, len(result.get_subset(subset_name)))

        # The source dataset must still report its original sizes
        for subset_name in ("train", "val", "test"):
            self.assertEqual(300, len(source.get_subset(subset_name)))
Exemple #7
0
    def test_ndr_can_use_undersample_inverse(self):
        """Check NDR subset sizes with ``under_sample='inverse'``.

        With ``num_cut=1`` and ``under_sample='inverse'`` the result is
        expected to hold 1 item in 'train', 299 in 'duplicated' and 300
        each in 'val' and 'test'; the source dataset must still report 300
        items per subset.
        """
        np.random.seed(1234)
        # Expected source layout: train : 300, val : 300, test : 300
        label_counts = {"label1": 100, "label2": 100, "label3": 100}
        source = self._generate_dataset(label_counts, 10)

        result = ndr.NDR(source, working_subset='train', num_cut=1,
                         under_sample='inverse', seed=12145)

        expected_result_sizes = (("train", 1), ("duplicated", 299),
                                 ("val", 300), ("test", 300))
        for subset_name, size in expected_result_sizes:
            self.assertEqual(size, len(result.get_subset(subset_name)))

        # The source dataset must still report its original sizes
        for subset_name in ("train", "val", "test"):
            self.assertEqual(300, len(source.get_subset(subset_name)))
Exemple #8
0
    def test_ndr_gradient_specific(self):
        """Exercise the ``block_shape``, ``hash_dim`` and ``sim_threshold``
        arguments of NDR.

        Invalid values are expected to raise ValueError with a matching
        message; valid values are expected to produce the listed 'train' /
        'duplicated' sizes, keep 300 items in 'val' and 'test', and leave
        the source dataset reporting 300 items per subset. All runs use
        ``working_subset='train'``, ``over_sample='random'`` and
        ``seed=12145``.
        """
        np.random.seed(1234)
        # Expected source layout: train : 300, val : 300, test : 300
        label_counts = {"label1": 100, "label2": 100, "label3": 100}
        source = self._generate_classification_dataset(label_counts, 10)

        def run_ndr(**extra):
            # Arguments shared by every scenario, plus the ones under test
            return ndr.NDR(source,
                           working_subset='train',
                           over_sample='random',
                           seed=12145,
                           **extra)

        def assert_rejected(pattern, **extra):
            # The error may surface at construction or on len()
            with self.assertRaisesRegex(ValueError, pattern):
                result = run_ndr(**extra)
                len(result)

        def assert_sizes(n_train, n_duplicated, **extra):
            result = run_ndr(**extra)
            expected = (("train", n_train), ("duplicated", n_duplicated),
                        ("val", 300), ("test", 300))
            for subset_name, size in expected:
                self.assertEqual(size, len(result.get_subset(subset_name)))
            # The source dataset must still report its original sizes
            for subset_name in ("train", "val", "test"):
                self.assertEqual(300, len(source.get_subset(subset_name)))

        # block_shape
        assert_rejected("Invalid block_shape", block_shape=(3, 6, 6))
        assert_rejected("block_shape should be positive",
                        block_shape=(-1, 0))
        assert_sizes(1, 299, block_shape=(8, 8))

        # hash_dim
        assert_rejected("hash_dim should be smaller than feature shape",
                        hash_dim=1024, block_shape=(8, 8))
        assert_rejected("hash_dim should be positive",
                        hash_dim=-5, block_shape=(8, 8))
        assert_sizes(2, 298, hash_dim=16)

        # sim_threshold
        assert_rejected("sim_threshold should be large than 0",
                        sim_threshold=0, block_shape=(8, 8))
        assert_sizes(2, 298, sim_threshold=0.7)