Example #1
    def test_special_char(self):
        text = '#'
        aug = nac.KeyboardAug(min_char=1)
        augmented_text = aug.augment(text)
        self.assertNotEqual(text, augmented_text)

        # No mapping, return original value
        text = '~'
        augs = [nac.KeyboardAug(min_char=1), nac.OcrAug(min_char=1)]
        for aug in augs:
            augmented_text = aug.augment(text)
            self.assertEqual(text, augmented_text)
Example #2
    def test_load_custom_model_fail(self):
        try:
            aug = nac.KeyboardAug(
                model_path='test_load_custom_model_fail.json')
            self.assertTrue(False)
        except ValueError:
            self.assertTrue(True)
Example #3
    def test_multiple_actions(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential(
                [nac.RandomCharAug(action=Action.INSERT),
                 naw.RandomWordAug()]),
            naf.Sequential([
                nac.OcrAug(),
                nac.KeyboardAug(aug_min=1),
                nac.RandomCharAug(action=Action.SUBSTITUTE,
                                  aug_min=1,
                                  aug_char_p=0.6,
                                  aug_word_p=0.6)
            ])
        ]

        for flow in flows:
            for text in texts:
                augmented_text = flow.augment(text)

                self.assertNotEqual(text, augmented_text)
                self.assertLess(0, len(text))

            self.assertLess(0, len(texts))

        self.assertLess(0, len(flows))
Example #4
def word_augmentation(df: pd.DataFrame, n: int):
    """Perform data augmentation with KeyboardAug by simulating keyboard typos.

    Args:
        df (DataFrame): dataset with 'input', 'target' and 'code' columns
        n (int): number of augmented copies per row (TODO: change to a probability of augmenting)

    Returns:
        (DataFrame): the dataset extended with the augmented rows
    """
    aug = nac.KeyboardAug()
    augmented_rows = []

    for i in range(len(df)):
        row = df.iloc[i]
        for _ in range(n):
            augmented_data = aug.augment(row["input"])
            augmented_rows.append({
                "input": augmented_data,
                "target": row["target"],
                "code": row["code"]
            })

    # DataFrame.append was removed in pandas 2.0; build the new rows separately and concat
    aug_df = pd.DataFrame(augmented_rows, columns=['input', 'target', 'code'])
    return pd.concat([df, aug_df], ignore_index=True)
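A minimal usage sketch (not part of the original example): the toy DataFrame below is assumed, containing only the 'input', 'target' and 'code' columns that word_augmentation expects, and it asks for two augmented copies per row.

import pandas as pd
import nlpaug.augmenter.char as nac

toy_df = pd.DataFrame({
    'input': ['the quick brown fox', 'jumps over the lazy dog'],
    'target': ['animal', 'animal'],
    'code': [0, 1],
})

augmented_df = word_augmentation(toy_df, n=2)
print(len(augmented_df))  # 2 original rows + 2 * 2 augmented rows = 6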
Example #5
    def test_no_aug(self):
        aug = nac.KeyboardAug(aug_word_min=0.0, aug_word_p=0.05)
        text = '| 4 ||  || ½ || 0 || ½ || - || 1 || 1 || 1 || 0 || 0 || 0 || 1 || 1 || 1 || 1 || 1 || 1 || 10 || 67.75'

        augmented_data = aug.augment(text)
        self.assertEqual(text.replace(' ', ''),
                         augmented_data.replace(' ', ''))
Example #6
    def test_multi_words(self):
        texts = ['The quick brown fox jumps over the lazy dog']
        aug = nac.KeyboardAug()
        for text in texts:
            augmented_text = aug.augment(text)
            self.assertNotEqual(text, augmented_text)

        self.assertTrue(len(texts) > 0)
Example #7
    def test_single_word(self):
        texts = ['Zoology', 'roku123456']
        aug = nac.KeyboardAug()
        for text in texts:
            augmented_text = aug.augment(text)
            self.assertNotEqual(text, augmented_text)

        self.assertTrue(len(texts) > 0)
Example #8
    def test_multiple_actions(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ],
                              pipeline_p=0.9),
                naf.Sequential(
                    [
                        # nac.OcrAug(), nac.QwertyAug(aug_min=1),
                        nac.RandomCharAug(action="substitute",
                                          aug_char_min=1,
                                          aug_char_p=0.6,
                                          aug_word_p=0.6)
                    ],
                    name='Sub_Seq')
            ]),
            naf.Sometimes([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ]),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.KeyboardAug(aug_char_min=1),
                    nac.RandomCharAug(action="substitute",
                                      aug_char_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ])
            ],
                          pipeline_p=0.9)
        ]

        # The probability may be low, so augmentation may not be applied. Retry up to 5 times
        for flow in flows:
            for text in texts:
                at_least_one_not_equal = False
                for _ in range(5):
                    augmented_text = flow.augment(text, n=1)

                    if text != augmented_text:
                        at_least_one_not_equal = True
                        break

                self.assertTrue(at_least_one_not_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
Example #9
    def test_multi_thread(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        n = 3
        augs = [
            nac.KeyboardAug(tokenizer=text_tokenizer.split_sentence),
            nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
        ]

        for num_thread in [1, 3]:
            for aug in augs:
                augmented_data = aug.augment(text, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)
Example #10
def nlpaug(word):
    aug = naf.Sometimes([
        nac.OcrAug(),
        nac.KeyboardAug(),
        nac.RandomCharAug(action="insert"),
        nac.RandomCharAug(action="substitute"),
        nac.RandomCharAug(action="swap"),
        nac.RandomCharAug(action="delete"),
        naw.SpellingAug(),
    ])
    word = aug.augment(word)
    return word
Example #11
    def test_empty(self):
        texts = ['', None]

        augs = [
            nac.OcrAug(),
            nac.KeyboardAug(),
        ]

        for text in texts:
            for aug in augs:
                augmented_text = aug.augment(text)
                self.assertEqual(text, augmented_text)
Example #12
    def test_lang_it(self):
        text = 'llllllllllllllllll lllllll'
        aug = nac.KeyboardAug(lang='it')

        augmented = False
        # make sure it converts to at least one of the Italian-specific chars
        for _ in range(10):
            augmented_text = aug.augment(text)
            if 'ò' in augmented_text or 'ç' in augmented_text:
                augmented = True
                self.assertNotEqual(text, augmented_text)

        self.assertTrue(augmented)
Example #13
    def test_stopwords(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']

        augs = [
            nac.RandomCharAug(stopwords=stopwords),
            nac.KeyboardAug(stopwords=stopwords),
            nac.OcrAug(stopwords=stopwords)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)
Example #14
    def test_stopwords_regex(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords_regex = "( [a-zA-Z]{1}ox | [a-z]{1}og|(brown)|[a-zA-Z]{1}he)|[a-z]{2}mps "

        augs = [
            nac.RandomCharAug(action="delete", stopwords_regex=stopwords_regex),
            nac.KeyboardAug(stopwords_regex=stopwords_regex),
            nac.OcrAug(stopwords_regex=stopwords_regex)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)
Example #15
    def test_empty_input_for_substitute(self):
        texts = ['', '           ']
        augs = [
            nac.RandomCharAug(action='substitute'),
            nac.KeyboardAug(),
            nac.OcrAug()
        ]

        for aug in augs:
            for text in texts:
                augmented_text = aug.augment(text)
                self.assertTrue(augmented_text is None or augmented_text.strip() == '')

            augmented_texts = aug.augment(texts)
            for augmented_text in augmented_texts:
                self.assertTrue(augmented_text is None or augmented_text.strip() == '')
Example #16
    def test_augment_detail(self):
        text = 'The quick brown fox jumps over the lazy dog'

        flows = [
            naf.Sequential([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ],
                              pipeline_p=0.5),
                naf.Sequential([
                    nac.RandomCharAug(action="substitute",
                                      aug_char_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ],
                               name='Sub_Seq')
            ],
                           include_detail=True),
            naf.Sometimes([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ]),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.KeyboardAug(aug_char_min=1),
                    nac.RandomCharAug(action="substitute",
                                      aug_char_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ])
            ],
                          pipeline_p=1,
                          include_detail=True)
        ]

        for flow in flows:
            augmented_text, augment_details = flow.augment(text)

            self.assertNotEqual(text, augmented_text)
            self.assertGreater(len(augment_details), 0)
            for augment_detail in augment_details:
                self.assertGreater(augment_detail['orig_start_pos'], -1)
                self.assertGreater(augment_detail['new_start_pos'], -1)
                self.assertGreater(augment_detail['change_seq'], 0)
                self.assertIn(augment_detail['action'], Action.getall())
Example #17
    def __init__(self, root_path, seq_length, embed_dim, embed_vec_space):
        """Initialize the dataset"""

        self.root_path = root_path
        self.seq_length = seq_length
        self.tokenized_data = self.tokenize(root_path)
        self.embed = BPEmb(lang="en", vs=embed_vec_space, add_pad_emb=True)
        self.pad = 1002
        self.sos = 1001
        self.eos = 1000

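        # KeyboardAug config: edit up to ~40% of the characters in each chosen word,
        # augment up to half of the words (at most seq_length // 5 of them), and do not
        # substitute special characters into the text (special_char=False)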
        self.augmentator = nac.KeyboardAug(aug_char_min=0,
                                           aug_char_p=0.4,
                                           aug_word_p=0.5,
                                           aug_word_min=0,
                                           aug_word_max=self.seq_length // 5,
                                           special_char=False)
Example #18
    def test_min_char(self):
        text = 'He eats apple'
        augs = [
            nac.RandomCharAug(min_char=5),
            nac.KeyboardAug(min_char=5),
            nac.OcrAug(min_char=5)
        ]

        for aug in augs:
            augmented = False
            for i in range(10):
                augmented_text = aug.augment(text)
                if 'apple' not in augmented_text:
                    augmented = True
                    break

            self.assertTrue(augmented)
Example #19
    def test_multiple_actions(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sometimes([
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE)
            ],
                          aug_p=0.8),
            naf.Sometimes([
                nac.OcrAug(),
                nac.KeyboardAug(aug_char_min=1),
                nac.RandomCharAug(action=Action.SUBSTITUTE,
                                  aug_char_min=1,
                                  aug_char_p=0.6,
                                  aug_word_p=0.6),
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE)
            ],
                          aug_p=0.6)
        ]

        # The probability may be low, so augmentation may not be applied. Retry up to 5 times
        for flow in flows:
            at_least_one_not_equal = False
            for _ in range(0, 5):
                for text in texts:
                    self.assertLess(0, len(text))
                    augmented_text = flow.augment(text)

                    if text != augmented_text:
                        at_least_one_not_equal = True

                    self.assertLess(0, len(text))

                if at_least_one_not_equal:
                    break

        self.assertTrue(at_least_one_not_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
Example #20
    def test_tokenizer(self):
        augs = [
            nac.OcrAug(tokenizer=text_tokenizer.split_sentence),
            nac.KeyboardAug(tokenizer=text_tokenizer.split_sentence),
            nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
        ]

        text = 'The quick brown fox, jumps over lazy dog.'
        expected_tokens = ['The', ' quick', ' brown', ' fox', ', ', 'jumps', ' over', ' lazy', ' dog', '.']
        for aug in augs:
            tokens = aug.tokenizer(text)
            self.assertEqual(tokens, expected_tokens)

        text = 'The quick !brown fox, jumps # over lazy dog .'
        expected_tokens = ['The', ' quick', ' !', 'brown', ' fox', ', ', 'jumps', ' # ', 'over', ' lazy', ' dog', ' .']
        for aug in augs:
            tokens = aug.tokenizer(text)
            self.assertEqual(tokens, expected_tokens)
Example #21
    def test_custom_model(self):
        custom_model = {
            'a': '1',
            'b': '2',
        }

        custom_model_file_path = 'char_keyboard_custom_model.json'

        with open(custom_model_file_path, 'w') as outfile:
            json.dump(custom_model, outfile)

        text = 'ababab'
        aug = nac.KeyboardAug(model_path=custom_model_file_path)
        augmented_text = aug.augment(text)

        self.assertTrue('1' in augmented_text or '2' in augmented_text)

        if os.path.exists(custom_model_file_path):
            os.remove(custom_model_file_path)
Example #22
    def test_multi_inputs(self):
        texts = [
            'The quick brown fox jumps over the lazy dog.',
            'The quick brown fox jumps over the lazy dog.',
            'nac KeyboardAug ( tokenizer = text_tokenizer . split_sentence )',
            'nac KeyboardAug ( tokenizer = text_tokenizer . split_sentence )'
        ]
        augs = [
            nac.KeyboardAug(tokenizer=text_tokenizer.split_sentence),
            nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
        ]

        num_thread = 2
        for aug in augs:
            augmented_data = aug.augment(texts, num_thread=num_thread)
            self.assertEqual(len(augmented_data), len(texts))

        num_thread = 1
        for aug in augs:
            augmented_data = aug.augment(texts, num_thread=num_thread)
            self.assertEqual(len(augmented_data), len(texts))
Example #23
    def test_augment_detail(self):
        text = 'The quick brown fox jumps over the lazy dog'
        augs = [
            nac.KeyboardAug(min_char=1, include_detail=True),
            nac.OcrAug(min_char=1, include_detail=True),
            nac.RandomCharAug(min_char=2, include_detail=True)
        ]

        for aug in augs:
            augmented_text, augment_details = aug.augment(text)

            self.assertNotEqual(text, augmented_text)
            self.assertGreater(len(augment_details), 0)
            for augment_detail in augment_details:
                self.assertTrue(augment_detail['orig_token'] in text)
                self.assertGreater(augment_detail['orig_start_pos'], -1)
                self.assertGreater(augment_detail['new_start_pos'], -1)
                self.assertGreater(augment_detail['change_seq'], 0)
                self.assertIn(augment_detail['action'], Action.getall())

            # Get back original input by re-engineering
            reengineering_text = augmented_text
            for change_obj in sorted(augment_details, key=lambda item: item['orig_start_pos'], reverse=True):
                if change_obj['action'] == Action.DELETE:
                    text_prefix = reengineering_text[:change_obj['new_start_pos']]
                    text_core = change_obj['orig_token'] + ' '
                    text_suffix = reengineering_text[change_obj['new_start_pos']:]

                elif change_obj['action'] in [Action.INSERT, Action.SUBSTITUTE]:
                    text_prefix = reengineering_text[:change_obj['new_start_pos']]
                    text_core = reengineering_text[change_obj['new_start_pos']:].replace(
                        change_obj['new_token'], change_obj['orig_token'], 1)
                    text_suffix = ''
                # TODO
                # elif change_obj['action'] in Action.SWAP:

                reengineering_text = text_prefix + text_core + text_suffix
                reengineering_text = reengineering_text.strip()

            self.assertEqual(text, reengineering_text)
Example #24
    def __init__(self, root_path, seq_length, predefined=False):
        """Initialize the dataset"""
        if not predefined:
            self.chars = None
            self.root_path = root_path
            self.seq_length = seq_length
            self.tokenized_data = self.tokenize(root_path)
            self.tokenized_data = self.tokenized_data[:len(
                self.tokenized_data) - len(self.tokenized_data) %
                                                      self.seq_length]
            self.data = np.array(self.tokenized_data).reshape(-1, seq_length)

            self.int2char = dict(enumerate(self.chars))
            self.char2int = {ch: ii for ii, ch in self.int2char.items()}

            self.words = tuple(set(self.tokenized_data))
            self.int2word = dict(enumerate(self.words))
            self.word2int = {ch: ii for ii, ch in self.int2word.items()}
            self.augmentator = nac.KeyboardAug(aug_char_min=0,
                                               aug_char_p=0.4,
                                               aug_word_p=0.4,
                                               aug_word_min=0,
                                               aug_word_max=int(
                                                   0.7 * self.seq_length),
                                               special_char=False,
                                               tokenizer=lambda x: x.split(),
                                               reverse_tokenizer=lambda x: x)

        else:
            self.int2char = predefined["int2char"]
            self.char2int = predefined["char2int"]

            self.words = predefined["words"]
            self.int2word = predefined["int2word"]
            self.word2int = predefined["word2int"]
            self.chars = predefined["chars"]
Example #25
def keyboard_aug(corpus):
    aug = nac.KeyboardAug(tokenizer=whitespace_tokenizer)
    # go through all train and dev sentences
    augmented_sentences = []
    for sentence in corpus.train:
        augmented_texts = aug.augment(sentence.to_tokenized_string(), n=3)
        for augmented_text in augmented_texts:
            augmented_sentence: Sentence = Sentence()
            augmented_token_texts = augmented_text.split(" ")
            for augmented_token_text, original_token in zip(augmented_token_texts, sentence):
                # make a new token
                augmented_token = Token(augmented_token_text)
                # transfer annotations over to augmented token
                augmented_token.annotation_layers = original_token.annotation_layers
                # add augmented token to augmented sentence
                augmented_sentence.add_token(augmented_token)
            # add augmented sentence to list of all augmented sentences
            augmented_sentences.append(augmented_sentence)

    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev,
                    test=corpus.test)

    return corpus
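whitespace_tokenizer is not defined in the snippet above; nlpaug's tokenizer argument accepts any callable that splits a string into tokens (see the lambda in Example #24), so a plain whitespace split is one plausible, assumed definition:

def whitespace_tokenizer(text):
    # hypothetical helper assumed by the example above: split on single spaces so the
    # augmented tokens stay aligned one-to-one with the original flair tokens
    return text.split(' ')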
Example #26
    def test_lang_uk(self):
        text = 'планувалося провести'
        aug = nac.KeyboardAug(lang='uk')
        augmented_text = aug.augment(text)
        self.assertNotEqual(text, augmented_text)
Example #27
    def test_non_support_lang(self):
        try:
            nac.KeyboardAug(lang='non_exist')
            self.assertTrue(False)
        except ValueError:
            self.assertTrue(True)
Example #28
    def test_lang_he(self):
        text = 'את המערכה בתנופה'
        aug = nac.KeyboardAug(lang='he')
        augmented_text = aug.augment(text)
        self.assertNotEqual(text, augmented_text)
Example #29
    def test_no_special_character(self):
        text = 'qwertyuioplmnbvcxza'
        for i in range(10):
            aug = nac.KeyboardAug(special_char=False)
            augmented_text = aug.augment(text)
            self.assertTrue(re.match("^[a-zA-Z0-9]*$", augmented_text))
Example #30
    def test_lang_nl(self):
        text = 'jjjjjjjjjjjjjjjjjjjjjjjjj jjjjjjjj'
        aug = nac.KeyboardAug(lang='nl')
        augmented_text = aug.augment(text)
        self.assertNotEqual(text, augmented_text)