def test_special_char(self):
    """Check how single special characters are handled by char augmenters.

    '#' is expected to be altered by ``KeyboardAug``; '~' is expected to be
    returned unchanged by both ``KeyboardAug`` and ``OcrAug``.
    """
    hash_char = '#'
    keyboard_aug = nac.KeyboardAug(min_char=1)
    self.assertNotEqual(hash_char, keyboard_aug.augment(hash_char))

    # No mapping, return original value
    tilde_char = '~'
    for augmenter in (nac.KeyboardAug(min_char=1), nac.OcrAug(min_char=1)):
        self.assertEqual(tilde_char, augmenter.augment(tilde_char))
def test_load_custom_model_fail(self):
    """Expect ``ValueError`` when ``KeyboardAug`` is given this model path.

    The path 'test_load_custom_model_fail.json' is presumably a file that
    does not exist when the test runs.
    """
    # assertRaises fails the test with a descriptive message when no
    # ValueError is raised, replacing the manual assertTrue(False) pattern.
    with self.assertRaises(ValueError):
        nac.KeyboardAug(model_path='test_load_custom_model_fail.json')
def test_multiple_actions(self):
    """Run two ``Sequential`` pipelines over sample texts and expect changes.

    Every (pipeline, text) pair must produce output that differs from the
    input. The trailing length checks guard against empty fixtures.
    """
    samples = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584',
    ]

    # Character insertion followed by a random word-level augmentation.
    char_then_word = naf.Sequential([
        nac.RandomCharAug(action=Action.INSERT),
        naw.RandomWordAug(),
    ])
    # Three character-level augmenters chained together.
    char_only = naf.Sequential([
        nac.OcrAug(),
        nac.KeyboardAug(aug_min=1),
        nac.RandomCharAug(action=Action.SUBSTITUTE,
                          aug_min=1,
                          aug_char_p=0.6,
                          aug_word_p=0.6),
    ])
    pipelines = [char_then_word, char_only]

    for pipeline in pipelines:
        for sample in samples:
            self.assertNotEqual(sample, pipeline.augment(sample))
            self.assertLess(0, len(sample))

        self.assertLess(0, len(samples))

    self.assertLess(0, len(pipelines))
def word_augmentation(df: pd.DataFrame, n: int):
    """Extend a dataset with keyboard-typo variants of each row's input.

    For every row of ``df``, ``n`` augmented rows are produced by running
    ``nac.KeyboardAug`` on the row's ``"input"`` value; the row's
    ``"target"`` and ``"code"`` values are copied unchanged.

    Args:
        df (DataFrame): dataset providing "input", "target" and "code"
        n (int): number of augmented rows generated per original row

    Returns:
        (DataFrame): the original rows (index untouched) followed by the
            augmented rows (indexed from 0)
    """
    aug = nac.KeyboardAug()

    # Collect the new rows in a plain list and build the frame once:
    # DataFrame.append was removed in pandas 2.0, and calling it per row
    # copied the whole frame on every iteration.
    new_rows = [
        {
            "input": aug.augment(row["input"]),
            "target": row["target"],
            "code": row["code"],
        }
        for _index, row in df.iterrows()
        for _copy in range(n)
    ]

    # Passing the columns explicitly keeps the expected schema even when no
    # rows were generated (empty df or n == 0).
    aug_df = pd.DataFrame(new_rows, columns=['input', 'target', 'code'])

    return pd.concat([df, aug_df])
def test_no_aug(self):
    """Expect a table-like row of symbols and digits to survive augmentation.

    The comparison ignores spaces, so only non-space characters must match.
    """
    table_row = '| 4 || || ½ || 0 || ½ || - || 1 || 1 || 1 || 0 || 0 || 0 || 1 || 1 || 1 || 1 || 1 || 1 || 10 || 67.75'
    augmenter = nac.KeyboardAug(aug_word_min=0.0, aug_word_p=0.05)

    result = augmenter.augment(table_row)

    # Strip spaces on both sides so that spacing differences are not counted.
    self.assertEqual(table_row.replace(' ', ''), result.replace(' ', ''))
def test_multi_words(self):
    """Expect ``KeyboardAug`` with default settings to change a sentence."""
    sentences = ['The quick brown fox jumps over the lazy dog']
    augmenter = nac.KeyboardAug()

    for sentence in sentences:
        self.assertNotEqual(sentence, augmenter.augment(sentence))

    # Guard against the loop above passing vacuously on an empty fixture.
    self.assertTrue(len(sentences) > 0)
def test_single_word(self):
    """Expect ``KeyboardAug`` with default settings to change single words."""
    words = ['Zoology', 'roku123456']
    augmenter = nac.KeyboardAug()

    for word in words:
        self.assertNotEqual(word, augmenter.augment(word))

    # Guard against the loop above passing vacuously on an empty fixture.
    self.assertTrue(len(words) > 0)
def test_multiple_actions(self):
    """Exercise nested ``Sequential``/``Sometimes`` flows and expect a change.

    The flows are probabilistic, so every text is augmented up to five times
    and only one of those attempts has to differ from the input.
    """
    samples = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584',
    ]

    # First flow: a Sequential wrapping a Sometimes and a named Sequential.
    insert_or_delete = naf.Sometimes([
        nac.RandomCharAug(action="insert"),
        nac.RandomCharAug(action="delete"),
    ], pipeline_p=0.9)
    substitute_only = naf.Sequential([
        # nac.OcrAug(), nac.QwertyAug(aug_min=1),
        nac.RandomCharAug(action="substitute",
                          aug_char_min=1,
                          aug_char_p=0.6,
                          aug_word_p=0.6),
    ], name='Sub_Seq')
    sequential_flow = naf.Sequential([insert_or_delete, substitute_only])

    # Second flow: a Sometimes wrapping a Sometimes and a Sequential.
    inner_sometimes = naf.Sometimes([
        nac.RandomCharAug(action="insert"),
        nac.RandomCharAug(action="delete"),
    ])
    inner_sequential = naf.Sequential([
        nac.OcrAug(),
        nac.KeyboardAug(aug_char_min=1),
        nac.RandomCharAug(action="substitute",
                          aug_char_min=1,
                          aug_char_p=0.6,
                          aug_word_p=0.6),
    ])
    sometimes_flow = naf.Sometimes([inner_sometimes, inner_sequential],
                                   pipeline_p=0.9)

    flows = [sequential_flow, sometimes_flow]

    # Since prob may be low and causing do not perform data augmentation. Retry 5 times
    for flow in flows:
        for sample in samples:
            # any() stops at the first differing attempt, like an early break.
            changed = any(sample != flow.augment(sample, n=1)
                          for _ in range(5))
            self.assertTrue(changed)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(samples))
def test_multi_thread(self):
    """Expect ``n`` outputs regardless of how many threads are requested."""
    sentence = 'The quick brown fox jumps over the lazy dog.'
    copies = 3
    augmenters = [
        nac.KeyboardAug(tokenizer=text_tokenizer.split_sentence),
        nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
    ]

    for thread_count in [1, 3]:
        for augmenter in augmenters:
            outputs = augmenter.augment(sentence, n=copies,
                                        num_thread=thread_count)
            self.assertEqual(len(outputs), copies)
def nlpaug(word):
    """Return ``word`` after passing it through a ``naf.Sometimes`` pipeline.

    The pipeline is rebuilt on every call and combines OCR, keyboard, random
    character (insert / substitute / swap / delete) and spelling augmenters.
    """
    augmenters = [
        nac.OcrAug(),
        nac.KeyboardAug(),
        nac.RandomCharAug(action="insert"),
        nac.RandomCharAug(action="substitute"),
        nac.RandomCharAug(action="swap"),
        nac.RandomCharAug(action="delete"),
        naw.SpellingAug(),
    ]
    pipeline = naf.Sometimes(augmenters)
    return pipeline.augment(word)
def test_empty(self):
    """Expect empty string and ``None`` inputs to be returned unchanged."""
    empty_inputs = ['', None]
    augmenters = [
        nac.OcrAug(),
        nac.KeyboardAug(),
    ]

    for value in empty_inputs:
        for augmenter in augmenters:
            self.assertEqual(value, augmenter.augment(value))
def test_lang_it(self):
    """Expect the 'it' keyboard model to yield 'ò' or 'ç' within ten tries."""
    text = 'llllllllllllllllll lllllll'
    augmenter = nac.KeyboardAug(lang='it')
    found = False

    # Make sure at least one attempt produces one of the expected characters
    # for the 'it' keyboard model.
    for _ in range(10):
        candidate = augmenter.augment(text)
        if any(char in candidate for char in ('ò', 'ç')):
            found = True
            self.assertNotEqual(text, candidate)

    self.assertTrue(found)
def test_stopwords(self):
    """Expect a non-stopword to change when the other words are stopwords.

    For each augmenter, ten runs are made and each run must alter at least
    one of 'quick', 'over' or 'lazy' (the words not listed as stopwords).
    """
    text = 'The quick brown fox jumps over the lazy dog.'
    stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']
    candidates = ('quick', 'over', 'lazy')

    augmenters = [
        nac.RandomCharAug(stopwords=stopwords),
        nac.KeyboardAug(stopwords=stopwords),
        nac.OcrAug(stopwords=stopwords),
    ]

    for augmenter in augmenters:
        for _ in range(10):
            result = augmenter.augment(text)
            self.assertTrue(any(word not in result for word in candidates))
def test_stopwords_regex(self):
    """Expect a change outside the tokens excluded via ``stopwords_regex``.

    For each augmenter, ten runs are made and each run must alter at least
    one of 'quick', 'over' or 'lazy'.
    """
    text = 'The quick brown fox jumps over the lazy dog.'
    stopwords_regex = "( [a-zA-Z]{1}ox | [a-z]{1}og|(brown)|[a-zA-z]{1}he)|[a-z]{2}mps "
    candidates = ('quick', 'over', 'lazy')

    augmenters = [
        nac.RandomCharAug(action="delete", stopwords_regex=stopwords_regex),
        nac.KeyboardAug(stopwords_regex=stopwords_regex),
        nac.OcrAug(stopwords_regex=stopwords_regex),
    ]

    for augmenter in augmenters:
        for _ in range(10):
            result = augmenter.augment(text)
            self.assertTrue(any(word not in result for word in candidates))
def test_empty_input_for_substitute(self):
    """Expect blank inputs to stay blank, one at a time and as a batch."""

    def is_blank(value):
        # An output counts as blank when it is None or only whitespace.
        return value is None or value.strip() == ''

    blank_inputs = ['', ' ']
    augmenters = [
        nac.RandomCharAug(action='substitute'),
        nac.KeyboardAug(),
        nac.OcrAug(),
    ]

    for augmenter in augmenters:
        # One input per call.
        for blank in blank_inputs:
            self.assertTrue(is_blank(augmenter.augment(blank)))

        # The whole list in a single call.
        for output in augmenter.augment(blank_inputs):
            self.assertTrue(is_blank(output))
def test_augment_detail(self):
    """Check the change records returned by flows built with ``include_detail``.

    Each flow must change the text and return at least one detail record
    with non-negative positions, a positive ``change_seq`` and an action
    listed by ``Action.getall()``.
    """
    text = 'The quick brown fox jumps over the lazy dog'

    # Sequential flow: optional insert/delete followed by a named Sequential.
    sequential_flow = naf.Sequential([
        naf.Sometimes([
            nac.RandomCharAug(action="insert"),
            nac.RandomCharAug(action="delete"),
        ], pipeline_p=0.5),
        naf.Sequential([
            nac.RandomCharAug(action="substitute",
                              aug_char_min=1,
                              aug_char_p=0.6,
                              aug_word_p=0.6),
        ], name='Sub_Seq'),
    ], include_detail=True)

    # Sometimes flow constructed with pipeline_p=1.
    sometimes_flow = naf.Sometimes([
        naf.Sometimes([
            nac.RandomCharAug(action="insert"),
            nac.RandomCharAug(action="delete"),
        ]),
        naf.Sequential([
            nac.OcrAug(),
            nac.KeyboardAug(aug_char_min=1),
            nac.RandomCharAug(action="substitute",
                              aug_char_min=1,
                              aug_char_p=0.6,
                              aug_word_p=0.6),
        ]),
    ], pipeline_p=1, include_detail=True)

    for flow in [sequential_flow, sometimes_flow]:
        result, details = flow.augment(text)

        self.assertNotEqual(text, result)
        self.assertGreater(len(details), 0)

        for detail in details:
            self.assertGreater(detail['orig_start_pos'], -1)
            self.assertGreater(detail['new_start_pos'], -1)
            self.assertGreater(detail['change_seq'], 0)
            self.assertIn(detail['action'], Action.getall())
def __init__(self, root_path, seq_length, embed_dim, embed_vec_space):
    """Initialize the dataset.

    Stores the configuration, tokenizes the data via ``self.tokenize``,
    creates an English ``BPEmb`` instance and a ``KeyboardAug`` augmenter.

    Args:
        root_path: location handed to ``self.tokenize``.
        seq_length: sequence length; also caps the number of augmented
            words at ``seq_length // 5``.
        embed_dim: not used by this initializer.
        embed_vec_space: forwarded to ``BPEmb`` as ``vs``.
    """
    self.root_path = root_path
    self.seq_length = seq_length
    self.tokenized_data = self.tokenize(root_path)
    self.embed = BPEmb(lang="en", vs=embed_vec_space, add_pad_emb=True)

    # Special token ids -- presumably tied to the BPEmb vocabulary size in
    # use; TODO confirm against the caller's embed_vec_space.
    self.pad = 1002
    self.sos = 1001
    self.eos = 1000

    max_augmented_words = seq_length // 5
    self.augmentator = nac.KeyboardAug(
        aug_char_min=0,
        aug_char_p=0.4,
        aug_word_p=0.5,
        aug_word_min=0,
        aug_word_max=max_augmented_words,
        special_char=False,
    )
def test_min_char(self):
    """Expect 'apple' to be altered within ten tries when ``min_char=5``."""
    text = 'He eats apple'
    augmenters = [
        nac.RandomCharAug(min_char=5),
        nac.KeyboardAug(min_char=5),
        nac.OcrAug(min_char=5),
    ]

    for augmenter in augmenters:
        # any() stops at the first run in which 'apple' no longer appears.
        changed = any('apple' not in augmenter.augment(text)
                      for _ in range(10))
        self.assertTrue(changed)
def test_multiple_actions(self):
    """Run two ``Sometimes`` pipelines and expect each to change some text.

    The pipelines are probabilistic, so the whole text list is retried up to
    five times per pipeline until at least one output differs.
    """
    samples = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584',
    ]

    pipelines = [
        naf.Sometimes([
            nac.RandomCharAug(action=Action.INSERT),
            nac.RandomCharAug(action=Action.INSERT),
            nac.RandomCharAug(action=Action.DELETE),
        ], aug_p=0.8),
        naf.Sometimes([
            nac.OcrAug(),
            nac.KeyboardAug(aug_char_min=1),
            nac.RandomCharAug(action=Action.SUBSTITUTE,
                              aug_char_min=1,
                              aug_char_p=0.6,
                              aug_word_p=0.6),
            nac.RandomCharAug(action=Action.INSERT),
            nac.RandomCharAug(action=Action.DELETE),
        ], aug_p=0.6),
    ]

    # Since prob may be low and causing do not perform data augmentation. Retry 5 times
    for pipeline in pipelines:
        changed = False
        for _ in range(5):
            for sample in samples:
                self.assertLess(0, len(sample))

                if sample != pipeline.augment(sample):
                    changed = True

                self.assertLess(0, len(sample))

            # Stop retrying once a full pass over the texts saw a change.
            if changed:
                break

        self.assertTrue(changed)

    self.assertLess(0, len(pipelines))
    self.assertLess(0, len(samples))
def test_tokenizer(self):
    """Check that augmenters expose ``split_sentence`` as their tokenizer.

    Each augmenter's ``tokenizer`` is called on two sentences and must
    return the listed tokens.
    """
    augmenters = [
        nac.OcrAug(tokenizer=text_tokenizer.split_sentence),
        nac.KeyboardAug(tokenizer=text_tokenizer.split_sentence),
        nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
    ]

    # (input sentence, expected token list) pairs, checked in this order.
    cases = [
        (
            'The quick brown fox, jumps over lazy dog.',
            ['The', ' quick', ' brown', ' fox', ', ', 'jumps', ' over',
             ' lazy', ' dog', '.'],
        ),
        (
            'The quick !brown fox, jumps # over lazy dog .',
            ['The', ' quick', ' !', 'brown', ' fox', ', ', 'jumps', ' # ',
             'over', ' lazy', ' dog', ' .'],
        ),
    ]

    for sentence, expected in cases:
        for augmenter in augmenters:
            self.assertEqual(augmenter.tokenizer(sentence), expected)
def test_custom_model(self):
    """Expect ``KeyboardAug`` to use the mapping from a custom model file.

    A two-entry mapping ('a' -> '1', 'b' -> '2') is written to a JSON file
    and passed as ``model_path``; the augmented 'ababab' must then contain
    '1' or '2'.
    """
    custom_model = {
        'a': '1',
        'b': '2',
    }
    custom_model_file_path = 'char_keyboard_custom_model.json'

    try:
        with open(custom_model_file_path, 'w') as outfile:
            json.dump(custom_model, outfile)

        text = 'ababab'
        aug = nac.KeyboardAug(model_path=custom_model_file_path)
        augmented_text = aug.augment(text)

        self.assertTrue('1' in augmented_text or '2' in augmented_text)
    finally:
        # Remove the fixture even when construction, augmentation or the
        # assertion fails, so a failing run does not leave the file behind.
        if os.path.exists(custom_model_file_path):
            os.remove(custom_model_file_path)
def test_multi_inputs(self):
    """Expect one output per input text, with two threads and with one."""
    texts = [
        'The quick brown fox jumps over the lazy dog.',
        'The quick brown fox jumps over the lazy dog.',
        'nac KeyboardAug ( tokenizer = text_tokenizer . split_sentence )',
        'nac KeyboardAug ( tokenizer = text_tokenizer . split_sentence )',
    ]
    augmenters = [
        nac.KeyboardAug(tokenizer=text_tokenizer.split_sentence),
        nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
    ]

    # Multi-threaded run first, then single-threaded.
    for thread_count in (2, 1):
        for augmenter in augmenters:
            outputs = augmenter.augment(texts, num_thread=thread_count)
            self.assertEqual(len(outputs), len(texts))
def test_augment_detail(self):
    """Check change records of char augmenters and undo them.

    Each augmenter must change the text and report at least one change
    record. The records are then applied in reverse position order to the
    augmented text, which must reproduce the original input.
    """
    source = 'The quick brown fox jumps over the lazy dog'
    augmenters = [
        nac.KeyboardAug(min_char=1, include_detail=True),
        nac.OcrAug(min_char=1, include_detail=True),
        nac.RandomCharAug(min_char=2, include_detail=True),
    ]

    for augmenter in augmenters:
        result, details = augmenter.augment(source)

        self.assertNotEqual(source, result)
        self.assertGreater(len(details), 0)

        for detail in details:
            self.assertTrue(detail['orig_token'] in source)
            self.assertGreater(detail['orig_start_pos'], -1)
            self.assertGreater(detail['new_start_pos'], -1)
            self.assertGreater(detail['change_seq'], 0)
            self.assertIn(detail['action'], Action.getall())

        # Get back original input by re-engineering
        restored = result
        # Undo from the highest original position down to the lowest.
        ordered_changes = sorted(details,
                                 key=lambda item: item['orig_start_pos'],
                                 reverse=True)
        for change in ordered_changes:
            if change['action'] == Action.DELETE:
                # Re-insert the removed token (plus a space) at its position.
                head = restored[:change['new_start_pos']]
                middle = change['orig_token'] + ' '
                tail = restored[change['new_start_pos']:]
            elif change['action'] in [Action.INSERT, Action.SUBSTITUTE]:
                # Swap the first occurrence of the new token, from its
                # position onwards, back to the original token.
                head = restored[:change['new_start_pos']]
                middle = restored[change['new_start_pos']:].replace(
                    change['new_token'], change['orig_token'], 1)
                tail = ''
            # TODO
            # elif change_obj['action'] in Action.SWAP:

            restored = (head + middle + tail).strip()

        self.assertEqual(source, restored)
def __init__(self, root_path, seq_length, predefined=False):
    """Initialize the dataset.

    When ``predefined`` is truthy it is indexed like a mapping and only the
    vocabulary attributes are copied from it; ``self.tokenize`` is not
    called and no other attribute is set. Otherwise the data is tokenized,
    trimmed to a multiple of ``seq_length``, reshaped into rows, and the
    character / word vocabularies and the augmenter are built.

    Args:
        root_path: location handed to ``self.tokenize``.
        seq_length: number of tokens per row of ``self.data``.
        predefined: falsy, or a mapping with the keys "int2char",
            "char2int", "words", "int2word", "word2int" and "chars".
    """
    if predefined:
        for key in ("int2char", "char2int", "words", "int2word",
                    "word2int", "chars"):
            setattr(self, key, predefined[key])
        return

    # Reset before tokenizing; self.tokenize is presumably what fills
    # self.chars, since it is enumerated below -- TODO confirm.
    self.chars = None
    self.root_path = root_path
    self.seq_length = seq_length

    self.tokenized_data = self.tokenize(root_path)
    # Drop the trailing tokens that do not fill a complete row.
    usable_length = (len(self.tokenized_data)
                     - len(self.tokenized_data) % seq_length)
    self.tokenized_data = self.tokenized_data[:usable_length]
    self.data = np.array(self.tokenized_data).reshape(-1, seq_length)

    # Character vocabulary and its inverse.
    self.int2char = dict(enumerate(self.chars))
    self.char2int = {char: index for index, char in self.int2char.items()}

    # Word vocabulary (unique tokens) and its inverse.
    self.words = tuple(set(self.tokenized_data))
    self.int2word = dict(enumerate(self.words))
    self.word2int = {word: index for index, word in self.int2word.items()}

    self.augmentator = nac.KeyboardAug(
        aug_char_min=0,
        aug_char_p=0.4,
        aug_word_p=0.4,
        aug_word_min=0,
        aug_word_max=int(0.7 * seq_length),
        special_char=False,
        tokenizer=lambda x: x.split(),
        reverse_tokenizer=lambda x: x,
    )
def keyboard_aug(corpus):
    """Build a corpus whose train split holds keyboard-augmented sentences.

    Each sentence in ``corpus.train`` is augmented three times with
    ``nac.KeyboardAug``. Every augmented text is split on single spaces and
    paired token by token with the original sentence so annotations can be
    carried over. The returned corpus contains only the augmented sentences
    in its train split; ``dev`` and ``test`` are reused as they are.
    """
    augmenter = nac.KeyboardAug(tokenizer=whitespace_tokenizer)

    augmented_sentences = []
    for sentence in corpus.train:
        variants = augmenter.augment(sentence.to_tokenized_string(), n=3)

        for variant in variants:
            new_sentence: Sentence = Sentence()

            # zip() stops at the shorter side, so surplus tokens on either
            # side are dropped.
            for token_text, source_token in zip(variant.split(" "), sentence):
                new_token = Token(token_text)
                # The annotation layers object is assigned as is, not copied.
                new_token.annotation_layers = source_token.annotation_layers
                new_sentence.add_token(new_token)

            augmented_sentences.append(new_sentence)

    return Corpus(train=SentenceDataset(augmented_sentences),
                  dev=corpus.dev,
                  test=corpus.test)
def test_lang_uk(self):
    """Expect ``KeyboardAug(lang='uk')`` to change the sample text."""
    original = 'планувалося провести'
    augmenter = nac.KeyboardAug(lang='uk')

    self.assertNotEqual(original, augmenter.augment(original))
def test_non_support_lang(self):
    """Expect ``ValueError`` when ``KeyboardAug`` gets ``lang='non_exist'``."""
    # assertRaises fails the test with a descriptive message when no
    # ValueError is raised, replacing the manual assertTrue(False) pattern.
    with self.assertRaises(ValueError):
        nac.KeyboardAug(lang='non_exist')
def test_lang_he(self):
    """Expect ``KeyboardAug(lang='he')`` to change the sample text."""
    original = 'את המערכה בתנופה'
    augmenter = nac.KeyboardAug(lang='he')

    self.assertNotEqual(original, augmenter.augment(original))
def test_no_special_character(self):
    """Expect only letters and digits in output when ``special_char=False``.

    A fresh augmenter is created for each of the ten runs.
    """
    text = 'qwertyuioplmnbvcxza'

    for _ in range(10):
        augmenter = nac.KeyboardAug(special_char=False)
        result = augmenter.augment(text)
        self.assertTrue(re.match("^[a-zA-Z0-9]*$", result))
def test_lang_nl(self):
    """Expect ``KeyboardAug(lang='nl')`` to change the sample text."""
    original = 'jjjjjjjjjjjjjjjjjjjjjjjjj jjjjjjjj'
    augmenter = nac.KeyboardAug(lang='nl')

    self.assertNotEqual(original, augmenter.augment(original))