def test_n_output_without_augmentation(self):
    """Expect ``flow.augment(text, n=3)`` to return only the input text.

    Each flow is run up to five times per sample.  The scan stops at the
    first attempt in which any returned item differs from the input, and the
    test asserts that no such difference was seen.
    """
    samples = ['AAAAAAAAAAA AAAAAAAAAAAAAA']
    pipelines = [
        naf.Sequential([nac.OcrAug(), nac.OcrAug()]),
        naf.Sometimes(
            [nac.RandomCharAug(), nac.RandomCharAug()],
            pipeline_p=0.00001),
    ]

    for pipeline in pipelines:
        for sample in samples:
            for _ in range(5):
                outputs = pipeline.augment(sample, n=3)
                # True as soon as one returned item is not the input itself.
                found_difference = any(
                    candidate != sample for candidate in outputs)
                if found_difference:
                    break

            self.assertFalse(found_difference)

    # Guard against the fixtures above being emptied by accident.
    self.assertLess(0, len(pipelines))
    self.assertLess(0, len(samples))
def test_multiple_actions(self):
    """Check that every nested flow changes at least one sample text.

    Two flows are built from nested ``Sequential``/``Sometimes`` pipelines.
    Because ``Sometimes`` may skip its children, each flow gets up to five
    rounds over all sample texts; the flow passes as soon as one round
    produced an output that differs from its input.

    The change flag is asserted once per flow, so a flow that never changes
    anything cannot be hidden by a later flow that does.
    """
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584'
    ]

    flows = [
        naf.Sequential([
            naf.Sometimes([
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE)
            ], pipeline_p=0.5),
            naf.Sequential([
                nac.OcrAug(),
                nac.QwertyAug(aug_min=1),
                nac.RandomCharAug(action=Action.SUBSTITUTE, aug_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6)
            ], name='Sub_Seq')
        ]),
        naf.Sometimes([
            naf.Sometimes([
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE)
            ]),
            naf.Sequential([
                nac.OcrAug(),
                nac.QwertyAug(aug_min=1),
                nac.RandomCharAug(action=Action.SUBSTITUTE, aug_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6)
            ])
        ], pipeline_p=0.5)
    ]

    # The probability may be low enough that no augmentation is performed,
    # so retry up to 5 times per flow.
    for flow in flows:
        at_least_one_not_equal = False
        for _ in range(5):
            for text in texts:
                self.assertLess(0, len(text))
                augmented_text = flow.augment(text)

                if text != augmented_text:
                    at_least_one_not_equal = True

            if at_least_one_not_equal:
                break

        # Asserted inside the flow loop so each flow is verified on its own.
        self.assertTrue(at_least_one_not_equal)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(texts))
def augment_text_ocr(comment):
    """Run ``nac.OcrAug`` over ``comment`` and return the augmenter's output.

    The augmenter is configured with ``aug_char_p=0.3``, ``aug_word_p=0.4``
    and ``aug_word_min=len(comment)``.

    :param comment: text to augment; must support ``len()``
    :return: whatever ``aug.augment(comment, n=1)`` returns, or ``None`` when
        the augmentation call raises an ordinary exception
    """
    aug = nac.OcrAug(aug_char_p=0.3, aug_word_p=0.4, aug_word_min=len(comment))
    try:
        return aug.augment(comment, n=1)
    except Exception:
        # Best effort: a failed augmentation yields None.  Only Exception is
        # caught so KeyboardInterrupt / SystemExit still propagate.
        return None
def char_level(text, n):
    """Print ``n`` character-level augmentations of ``text``.

    :param text: input passed to ``nac.OcrAug().augment``
    :param n: number of augmented forms requested from the augmenter
    """
    ocr_augmenter = nac.OcrAug()
    # n is the number of augmented forms the caller wants back.
    variants = ocr_augmenter.augment(text, n=n)
    # Header and result are written on separate lines.
    print("Attacked Text:", variants, sep="\n")
def test_multiple_actions(self):
    """Every sequential flow must return something different from its input.

    Each flow is applied once to each sample text and the result is compared
    with the original text.
    """
    samples = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584',
    ]

    # Character insert followed by a random word-level augmentation.
    char_then_word = naf.Sequential(
        [nac.RandomCharAug(action=Action.INSERT), naw.RandomWordAug()])
    # Three character-level augmenters chained together.
    char_only = naf.Sequential([
        nac.OcrAug(),
        nac.KeyboardAug(aug_min=1),
        nac.RandomCharAug(action=Action.SUBSTITUTE, aug_min=1,
                          aug_char_p=0.6, aug_word_p=0.6),
    ])
    pipelines = [char_then_word, char_only]

    for pipeline in pipelines:
        for sample in samples:
            result = pipeline.augment(sample)

            self.assertNotEqual(sample, result)
            self.assertLess(0, len(sample))

        self.assertLess(0, len(samples))

    self.assertLess(0, len(pipelines))
def test_multi_thread(self):
    """``augment`` must return ``n`` items for both 1 and 3 threads.

    Two nested flows mixing character, word-embedding and contextual
    augmenters are each run with ``num_thread`` set to 1 and to 3.  The
    word2vec model path is read from the ``MODEL_DIR`` environment variable.
    """
    sample = 'The quick brown fox jumps over the lazy dog'
    expected_count = 3
    embeddings_path = os.path.join(
        os.environ["MODEL_DIR"], 'word', 'word_embs',
        'GoogleNews-vectors-negative300.bin')

    sequential_flow = naf.Sequential([
        naf.Sequential([
            nac.OcrAug(),
            naw.WordEmbsAug(model_type='word2vec',
                            model_path=embeddings_path),
        ]),
        naf.Sequential([
            nac.RandomCharAug(),
        ]),
        naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                  action="substitute", temperature=0.7,
                                  device='cpu'),
    ])
    sometimes_flow = naf.Sometimes([
        naf.Sequential([
            nac.OcrAug(),
            nac.RandomCharAug(),
        ]),
        naf.Sometimes([
            naw.WordEmbsAug(model_type='word2vec',
                            model_path=embeddings_path),
        ], pipeline_p=0.999),
        naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                  action="substitute", temperature=0.7,
                                  device='cpu'),
    ], pipeline_p=0.9999)

    for thread_count in (1, 3):
        for pipeline in (sequential_flow, sometimes_flow):
            outputs = pipeline.augment(
                sample, n=expected_count, num_thread=thread_count)
            self.assertEqual(len(outputs), expected_count)
def test_multiple_actions(self):
    """Each nested flow must change each sample within five attempts.

    ``Sometimes`` pipelines may skip their children, so every (flow, text)
    pair is retried up to five times and passes on the first attempt whose
    output differs from the input.
    """
    samples = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584',
    ]

    sequential_flow = naf.Sequential([
        naf.Sometimes([
            nac.RandomCharAug(action="insert"),
            nac.RandomCharAug(action="delete"),
        ], pipeline_p=0.9),
        naf.Sequential(
            [
                # nac.OcrAug(), nac.QwertyAug(aug_min=1),
                nac.RandomCharAug(action="substitute", aug_char_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6),
            ],
            name='Sub_Seq'),
    ])
    sometimes_flow = naf.Sometimes([
        naf.Sometimes([
            nac.RandomCharAug(action="insert"),
            nac.RandomCharAug(action="delete"),
        ]),
        naf.Sequential([
            nac.OcrAug(),
            nac.KeyboardAug(aug_char_min=1),
            nac.RandomCharAug(action="substitute", aug_char_min=1,
                              aug_char_p=0.6, aug_word_p=0.6),
        ]),
    ], pipeline_p=0.9)
    pipelines = [sequential_flow, sometimes_flow]

    # The probability may be low enough that no augmentation happens, so
    # retry up to 5 times; the generator stops at the first changed output.
    for pipeline in pipelines:
        for sample in samples:
            changed = any(
                sample != pipeline.augment(sample, n=1) for _ in range(5))
            self.assertTrue(changed)

    self.assertLess(0, len(pipelines))
    self.assertLess(0, len(samples))
def augment_text_ocr(comment):
    """
    OCRAug adds noise to a comment by replacing the target characters with
    a predefined mapping table.

    The augmenter is configured with ``aug_char_p=0.3``, ``aug_word_p=0.4``
    and ``aug_word_min=len(comment)``.

    :param comment: text to augment; must support ``len()``
    :return: whatever ``aug.augment(comment, n=1)`` returns, or ``None`` when
        the augmentation call raises an ordinary exception
    """
    aug = nac.OcrAug(aug_char_p=0.3, aug_word_p=0.4, aug_word_min=len(comment))
    try:
        return aug.augment(comment, n=1)
    except Exception:
        # Best effort: a failed augmentation yields None.  Only Exception is
        # caught so KeyboardInterrupt / SystemExit still propagate.
        return None
def nlpaug(word):
    """Pass ``word`` through a ``naf.Sometimes`` pipeline and return the result.

    The pipeline holds OCR, keyboard, four random character actions
    (insert, substitute, swap, delete) and a spelling augmenter.
    """
    candidate_augmenters = [
        nac.OcrAug(),
        nac.KeyboardAug(),
        nac.RandomCharAug(action="insert"),
        nac.RandomCharAug(action="substitute"),
        nac.RandomCharAug(action="swap"),
        nac.RandomCharAug(action="delete"),
        naw.SpellingAug(),
    ]
    pipeline = naf.Sometimes(candidate_augmenters)
    return pipeline.augment(word)
def __init__(self, template, output_file, augmentation_factor=5):
    """Store the configuration and create the two augmenters.

    :param template: stored as ``self.base_file``
    :param output_file: stored as ``self.output_file``
    :param augmentation_factor: must be at least 2 (checked with ``assert``)
    """
    assert augmentation_factor >= 2

    # Caller-supplied configuration.
    self.augmentation_factor = augmentation_factor
    self.base_file = template
    self.output_file = output_file

    # Both containers start out empty.
    self.dataset, self.intents = {}, {}

    # One character-level and one word-level augmenter, default settings.
    self.character_augmenter = nac.OcrAug()
    self.word_augmenter = naw.ContextualWordEmbsAug()
def test_n_output_without_augmentation(self):
    """A flow may return a single-item result equal to the input.

    For every (flow, text) pair ``augment(text, n=3)`` is tried up to five
    times; the pair passes on the first attempt whose result has exactly one
    item and that item equals the input text.
    """
    samples = ['AAAAAAAAAAA AAAAAAAAAAAAAA']
    pipelines = [
        naf.Sequential([nac.OcrAug(), nac.OcrAug()]),
        naf.Sometimes(
            [nac.RandomCharAug(), nac.RandomCharAug()],
            pipeline_p=0.00001),
    ]

    def returned_input_only(outputs, original):
        # Exactly one result, and it is the untouched input.
        return len(outputs) == 1 and outputs[0] == original

    for pipeline in pipelines:
        for sample in samples:
            # Lazy generator: stops calling augment after the first hit.
            echoed = any(
                returned_input_only(pipeline.augment(sample, n=3), sample)
                for _ in range(5))
            self.assertTrue(echoed)

    self.assertLess(0, len(pipelines))
    self.assertLess(0, len(samples))
def test_empty(self):
    """Empty string and ``None`` must come back from ``augment`` unchanged."""
    blank_inputs = ['', None]
    augmenters = [
        nac.OcrAug(),
        nac.KeyboardAug(),
    ]

    for blank in blank_inputs:
        for augmenter in augmenters:
            result = augmenter.augment(blank)
            self.assertEqual(blank, result)
def test_special_char(self):
    """Check augmentation of single special characters.

    ``'#'`` is expected to change under ``KeyboardAug``; ``'~'`` is expected
    to be returned as-is by both ``KeyboardAug`` and ``OcrAug``.
    """
    hash_sign = '#'
    keyboard = nac.KeyboardAug(min_char=1)
    self.assertNotEqual(hash_sign, keyboard.augment(hash_sign))

    # No mapping, return original value
    tilde = '~'
    augmenters = [nac.KeyboardAug(min_char=1), nac.OcrAug(min_char=1)]
    for augmenter in augmenters:
        result = augmenter.augment(tilde)
        self.assertEqual(tilde, result)
def test_stopwords_regex(self):
    """Augment with a ``stopwords_regex`` and check unprotected words change.

    Each augmenter is run ten times; every output must be missing at least
    one of ``'quick'``, ``'over'`` or ``'lazy'``.
    """
    sentence = 'The quick brown fox jumps over the lazy dog.'
    stopwords_regex = "( [a-zA-Z]{1}ox | [a-z]{1}og|(brown)|[a-zA-z]{1}he)|[a-z]{2}mps "
    watched_words = ('quick', 'over', 'lazy')

    augmenters = [
        nac.RandomCharAug(action="delete", stopwords_regex=stopwords_regex),
        nac.KeyboardAug(stopwords_regex=stopwords_regex),
        nac.OcrAug(stopwords_regex=stopwords_regex),
    ]

    for augmenter in augmenters:
        for _ in range(10):
            result = augmenter.augment(sentence)
            # Passes when any watched word no longer appears in the output.
            self.assertTrue(
                any(word not in result for word in watched_words))
def test_stopwords(self):
    """Augment with a ``stopwords`` list and check unprotected words change.

    Each augmenter is run ten times; every output must be missing at least
    one of ``'quick'``, ``'over'`` or ``'lazy'``.
    """
    sentence = 'The quick brown fox jumps over the lazy dog.'
    stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']
    watched_words = ('quick', 'over', 'lazy')

    augmenters = [
        nac.RandomCharAug(stopwords=stopwords),
        nac.KeyboardAug(stopwords=stopwords),
        nac.OcrAug(stopwords=stopwords),
    ]

    for augmenter in augmenters:
        for _ in range(10):
            result = augmenter.augment(sentence)
            # Passes when any watched word no longer appears in the output.
            self.assertTrue(
                any(word not in result for word in watched_words))
def test_empty_input_for_substitute(self):
    """Blank inputs must yield ``None`` or whitespace-only output.

    Every augmenter is exercised first with each blank text on its own and
    then with the whole list of blank texts in one call.
    """
    blank_inputs = ['', ' ']
    augmenters = [
        nac.RandomCharAug(action='substitute'),
        nac.KeyboardAug(),
        nac.OcrAug(),
    ]

    def is_blank(value):
        # None, or a string that is empty once stripped.
        return value is None or value.strip() == ''

    for augmenter in augmenters:
        # One text at a time.
        for blank in blank_inputs:
            self.assertTrue(is_blank(augmenter.augment(blank)))

        # All texts in a single call.
        for result in augmenter.augment(blank_inputs):
            self.assertTrue(is_blank(result))
def test_augment_detail(self):
    """Flows built with ``include_detail=True`` return text plus details.

    For each flow the output text must differ from the input, at least one
    detail record must be present, and each record must carry non-negative
    positions, a positive ``change_seq`` and a known action.
    """
    sentence = 'The quick brown fox jumps over the lazy dog'

    sequential_flow = naf.Sequential([
        naf.Sometimes([
            nac.RandomCharAug(action="insert"),
            nac.RandomCharAug(action="delete"),
        ], pipeline_p=0.5),
        naf.Sequential([
            nac.RandomCharAug(action="substitute", aug_char_min=1,
                              aug_char_p=0.6, aug_word_p=0.6),
        ], name='Sub_Seq'),
    ], include_detail=True)
    sometimes_flow = naf.Sometimes([
        naf.Sometimes([
            nac.RandomCharAug(action="insert"),
            nac.RandomCharAug(action="delete"),
        ]),
        naf.Sequential([
            nac.OcrAug(),
            nac.KeyboardAug(aug_char_min=1),
            nac.RandomCharAug(action="substitute", aug_char_min=1,
                              aug_char_p=0.6, aug_word_p=0.6),
        ]),
    ], pipeline_p=1, include_detail=True)

    for pipeline in (sequential_flow, sometimes_flow):
        result, details = pipeline.augment(sentence)

        self.assertNotEqual(sentence, result)
        self.assertGreater(len(details), 0)

        for record in details:
            self.assertGreater(record['orig_start_pos'], -1)
            self.assertGreater(record['new_start_pos'], -1)
            self.assertGreater(record['change_seq'], 0)
            self.assertIn(record['action'], Action.getall())
def test_min_char(self):
    """With ``min_char=5`` the word ``'apple'`` must get altered eventually.

    Each augmenter gets up to ten attempts and passes on the first output
    that no longer contains ``'apple'``.
    """
    sentence = 'He eats apple'
    augmenters = [
        nac.RandomCharAug(min_char=5),
        nac.KeyboardAug(min_char=5),
        nac.OcrAug(min_char=5),
    ]

    for augmenter in augmenters:
        # Lazy generator: stops augmenting after the first changed output.
        apple_changed = any(
            'apple' not in augmenter.augment(sentence) for _ in range(10))
        self.assertTrue(apple_changed)
def test_tokenizer(self):
    """A custom ``tokenizer`` passed to an augmenter is the one it exposes.

    Each augmenter is built with ``text_tokenizer.split_sentence`` and its
    ``tokenizer`` attribute is called on two sentences whose expected token
    lists are spelled out below.
    """
    augmenters = [
        nac.OcrAug(tokenizer=text_tokenizer.split_sentence),
        nac.KeyboardAug(tokenizer=text_tokenizer.split_sentence),
        nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
    ]

    # (input sentence, expected tokens) pairs, checked in this order.
    cases = [
        (
            'The quick brown fox, jumps over lazy dog.',
            ['The', ' quick', ' brown', ' fox', ', ', 'jumps', ' over',
             ' lazy', ' dog', '.'],
        ),
        (
            'The quick !brown fox, jumps # over lazy dog .',
            ['The', ' quick', ' !', 'brown', ' fox', ', ', 'jumps', ' # ',
             'over', ' lazy', ' dog', ' .'],
        ),
    ]

    for sentence, expected_tokens in cases:
        for augmenter in augmenters:
            self.assertEqual(augmenter.tokenizer(sentence), expected_tokens)
def test_augment_detail(self):
    """Validate the change records returned with ``include_detail=True``.

    For each character augmenter the test checks that the text changed and
    that every detail record is well formed, then undoes the recorded
    changes on the augmented text and expects to recover the original input.
    """
    text = 'The quick brown fox jumps over the lazy dog'
    augs = [
        nac.KeyboardAug(min_char=1, include_detail=True),
        nac.OcrAug(min_char=1, include_detail=True),
        nac.RandomCharAug(min_char=2, include_detail=True)
    ]

    for aug in augs:
        # With include_detail=True the result unpacks into (text, details).
        augmented_text, augment_details = aug.augment(text)

        self.assertNotEqual(text, augmented_text)
        self.assertGreater(len(augment_details), 0)
        for augment_detail in augment_details:
            self.assertTrue(augment_detail['orig_token'] in text)
            self.assertGreater(augment_detail['orig_start_pos'], -1)
            self.assertGreater(augment_detail['new_start_pos'], -1)
            self.assertGreater(augment_detail['change_seq'], 0)
            self.assertIn(augment_detail['action'], Action.getall())

        # Get back original input by re-engineering
        reengineering_text = augmented_text
        # Records are visited from the highest original position down.
        for change_obj in sorted(augment_details, key=lambda item: item['orig_start_pos'], reverse=True):
            if change_obj['action'] == Action.DELETE:
                # Re-insert the removed token (plus a space) at its new position.
                text_prefix = reengineering_text[:change_obj['new_start_pos']]
                text_core = change_obj['orig_token'] + ' '
                text_suffix = reengineering_text[change_obj['new_start_pos']:]
            elif change_obj['action'] in [Action.INSERT, Action.SUBSTITUTE]:
                # Swap the first occurrence of the new token, at or after its
                # position, back to the original token.
                text_prefix = reengineering_text[:change_obj['new_start_pos']]
                text_core = reengineering_text[change_obj['new_start_pos']:].replace(
                    change_obj['new_token'], change_obj['orig_token'], 1)
                text_suffix = ''
            # TODO: SWAP is not handled; for such a record the three pieces
            # keep whatever the previous iteration assigned (or are unbound
            # on the first iteration).
            # elif change_obj['action'] in Action.SWAP:

            reengineering_text = text_prefix + text_core + text_suffix
            reengineering_text = reengineering_text.strip()

        self.assertEqual(text, reengineering_text)
def ocr_aug(corpus):
    """Build a corpus whose train split holds OCR-augmented sentences.

    Every sentence in ``corpus.train`` is augmented three times.  Each
    augmented string is split on single spaces and paired token-by-token
    with the original sentence so that the original tokens' annotation
    layers are carried over.  ``dev`` and ``test`` are passed through as-is.

    :param corpus: object exposing ``train``, ``dev`` and ``test``
    :return: a new ``Corpus`` with the augmented sentences as its train split
    """
    augmenter = nac.OcrAug(tokenizer=whitespace_tokenizer)

    noisy_sentences = []
    # Only the train split is iterated.
    for original in corpus.train:
        variants = augmenter.augment(original.to_tokenized_string(), n=3)

        for variant in variants:
            rebuilt: Sentence = Sentence()

            # zip stops at the shorter side if the token counts differ.
            for noisy_text, source_token in zip(variant.split(" "), original):
                noisy_token = Token(noisy_text)
                # Carry the annotations over to the augmented token.
                noisy_token.annotation_layers = source_token.annotation_layers
                rebuilt.add_token(noisy_token)

            noisy_sentences.append(rebuilt)

    return Corpus(
        train=SentenceDataset(noisy_sentences),
        dev=corpus.dev,
        test=corpus.test)
def augment_image(
        dataset: list,
        augmenter,
):
    """
    Lazily apply an augmenter to every ``(data, label)`` pair of a dataset.

    :param dataset: iterable of ``(data, label)`` pairs
    :param augmenter: callable invoked as ``augmenter(images=data)``, or
        ``None`` to leave the data untouched
    :return: generator of ``(np.asarray(data), np.asarray(label))`` tuples
    """
    for data, label in dataset:
        if augmenter is not None:
            data = augmenter(images=data)
        yield np.asarray(data), np.asarray(label)


# NULL augmenter
none_augmenter = partial(augment_text, augmenter=None)

# OCR error augmenter
text_ocr_augmenter = partial(augment_text, augmenter=nac.OcrAug())

# flip_r image augmenter
image_flip_r_augmenter = partial(augment_image, augmenter=iaa.Fliplr())

# rotation image augmenter
image_rot_augmenter = partial(augment_image, augmenter=iaa.Rotate())
import re
import attr
import random
import pandas as pd
from faker import Faker
from .utility import formating
from .CONSTATNTS import *
import nlpaug.augmenter.char as nac

# Module-level helpers created once at import time.
fake = Faker()
aug = nac.OcrAug()


@attr.s
class DataGeneration:
    # Sequence of (template, label) pairs; one is picked at random per item.
    template_list = attr.ib()
    # Number of iterations run by the ``data`` property.
    number = attr.ib()

    @property
    def data(self):
        # NOTE(review): only part of this property is visible here; nothing
        # is appended to return_data or returned in the portion shown.
        return_data = []
        # constant_generation is defined outside the visible source; its
        # result is indexed by placeholder name below.
        const_data = self.constant_generation()
        for i in range(0, self.number):
            temp, class_3_label = random.choice(self.template_list)
            # The second label equals the first, except that
            # 'partial-addressline' is folded into 'addressline'.
            class_2_label = class_3_label
            if class_3_label == 'partial-addressline':
                class_2_label = 'addressline'
            # Collect every {placeholder} in the template (non-greedy match).
            # NOTE(review): the pattern is not a raw string.
            matcher = re.findall('\{.*?\}', temp)
            # NOTE(review): this loop variable reuses the outer loop's name i.
            for i in matcher:
                # Strip the braces to get the lookup key.
                key = i.replace('{', '').replace('}', '')
                tp = random.choice(const_data[key])
def augment_dataset(input_df):
    """
    Augment the dataset with the nlpaug library.

    If the dataset is small, this is a great way to boost things up. Only the
    ``context`` column is augmented; the Doctor's ``response`` is copied
    unchanged because it should not contain any spelling mistakes.

    Three augmentations are applied to every row:
        - OCR error augmentation (character level, ``n=3``)
        - Keyboard augmentation (character level)
        - Synonym augmentation (word level, WordNet)

    :param input_df: DataFrame with ``response`` and ``context`` columns; the
        index values are used with ``%`` for progress reporting
    :return: a new DataFrame holding the original rows followed by the OCR,
        keyboard and synonym rows, re-indexed from 0
    """
    print('Augmenting the dataset based on Synonyms...')

    ocr = nac.OcrAug()
    keyboard = nac.KeyboardAug()
    synonym = naw.SynonymAug(aug_src='wordnet')

    # The response column is identical for all three augmented frames.
    responses = []
    context_ocr = []
    context_keyboard = []
    context_synonym = []

    for i in input_df.index:
        if i % 10 == 0:
            print('processing {}th line'.format(i))

        response = input_df['response'][i]
        context = input_df['context'][i]
        responses.append(response)

        # NOTE(review): with n=3 the OCR result is stored as returned by the
        # augmenter, which may be a list of variants in a single cell;
        # confirm this is intended.
        context_ocr.append(ocr.augment(context, n=3))
        context_keyboard.append(keyboard.augment(context))
        context_synonym.append(synonym.augment(context))

    augmented_frames = [
        pd.DataFrame({'response': responses, 'context': contexts})
        for contexts in (context_ocr, context_keyboard, context_synonym)
    ]

    # DataFrame.append was removed in pandas 2.0; concat gives the same
    # stacked, re-indexed result.
    augmented = pd.concat([input_df] + augmented_frames, ignore_index=True)

    print('original dataset length: \n{}'.format(len(input_df)))
    # Report the length of the frame that is actually returned.
    print('Augmented dataset length: {}'.format(len(augmented)))
    return augmented