コード例 #1
0
    def test_multiple_actions(self):
        """Check that each nested flow alters at least one of the sample texts.

        Two pipelines are exercised: a ``Sequential`` flow wrapping a
        ``Sometimes`` stage plus a named ``Sequential`` stage, and a
        ``Sometimes`` flow wrapping the same kind of stages. Every flow is
        verified on its own, so a flow that never changes any text cannot be
        masked by a later flow that does.
        """
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential([
                naf.Sometimes([
                    nac.RandomCharAug(action=Action.INSERT),
                    nac.RandomCharAug(action=Action.DELETE)
                ],
                              pipeline_p=0.5),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.QwertyAug(aug_min=1),
                    nac.RandomCharAug(action=Action.SUBSTITUTE,
                                      aug_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ],
                               name='Sub_Seq')
            ]),
            naf.Sometimes([
                naf.Sometimes([
                    nac.RandomCharAug(action=Action.INSERT),
                    nac.RandomCharAug(action=Action.DELETE)
                ]),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.QwertyAug(aug_min=1),
                    nac.RandomCharAug(action=Action.SUBSTITUTE,
                                      aug_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ])
            ],
                          pipeline_p=0.5)
        ]

        # Validate the fixtures first so the loops below cannot pass vacuously.
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))

        # The pipeline probability may be low enough that a single call applies
        # no augmentation at all, so each flow gets up to 5 passes over the
        # texts before it is considered failing.
        for flow in flows:
            at_least_one_not_equal = False
            for _ in range(5):
                for text in texts:
                    self.assertLess(0, len(text))
                    augmented_text = flow.augment(text)

                    # The output, not the untouched input, must be non-empty.
                    self.assertLess(0, len(augmented_text))

                    if text != augmented_text:
                        at_least_one_not_equal = True

                if at_least_one_not_equal:
                    break

            # Asserted per flow: the flag is reset at the top of each
            # iteration, so checking it after the outer loop would only ever
            # cover the last flow.
            self.assertTrue(
                at_least_one_not_equal,
                msg='{!r} left every text unchanged in 5 passes'.format(flow))
コード例 #2
0
    def test_multiple_actions(self):
        """Check that each sequential flow changes every sample text.

        Two ``Sequential`` pipelines are exercised: one chaining a character
        insert augmenter with a word augmenter, and one chaining OCR, keyboard
        and character-substitution augmenters. For every (flow, text) pair the
        output must differ from the input and must not be empty.
        """
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential(
                [nac.RandomCharAug(action=Action.INSERT),
                 naw.RandomWordAug()]),
            naf.Sequential([
                nac.OcrAug(),
                nac.QwertyAug(aug_min=1),
                nac.RandomCharAug(action=Action.SUBSTITUTE,
                                  aug_min=1,
                                  aug_char_p=0.6,
                                  aug_word_p=0.6)
            ])
        ]

        # Validate the fixtures first so the loops below cannot pass vacuously.
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))

        for flow in flows:
            for text in texts:
                self.assertLess(0, len(text))
                augmented_text = flow.augment(text)

                self.assertNotEqual(text, augmented_text)
                # Check the augmented output; re-checking the unchanged input
                # after the call would verify nothing.
                self.assertLess(0, len(augmented_text))
コード例 #3
0
    def test_empty(self):
        """Empty and ``None`` inputs must come back from ``augment`` unchanged."""
        blank_inputs = ['', None]
        augmenters = [nac.OcrAug(), nac.QwertyAug()]

        # Text-major pairing: every augmenter handles '' before any of them
        # handles None.
        pairs = [(blank, augmenter)
                 for blank in blank_inputs
                 for augmenter in augmenters]

        for blank, augmenter in pairs:
            result = augmenter.augment(blank)
            self.assertEqual(blank, result)
コード例 #4
0
    def test_tokenizer(self):
        """Check the tokens produced by a custom tokenizer on each augmenter.

        Each character augmenter is built with
        ``text_tokenizer.split_sentence`` as its tokenizer, and the tokenizer
        exposed on the augmenter must split the sample sentences into the
        expected token lists.
        """
        splitter = text_tokenizer.split_sentence
        augmenters = [
            nac.OcrAug(tokenizer=splitter),
            nac.QwertyAug(tokenizer=splitter),
            nac.RandomCharAug(tokenizer=splitter),
        ]

        # (input sentence, tokens expected from the tokenizer)
        cases = [
            (
                'The quick brown fox, jumps over lazy dog.',
                [
                    'The', ' quick', ' brown', ' fox', ', ', 'jumps', ' over',
                    ' lazy', ' dog', '.'
                ],
            ),
            (
                'The quick !brown fox, jumps # over lazy dog .',
                [
                    'The', ' quick', ' !', 'brown', ' fox', ', ', 'jumps',
                    ' # ', 'over', ' lazy', ' dog', ' .'
                ],
            ),
        ]

        # Sentence-major order: all augmenters are checked against one
        # sentence before moving on to the next.
        for sentence, expected_tokens in cases:
            for augmenter in augmenters:
                self.assertEqual(augmenter.tokenizer(sentence),
                                 expected_tokens)