def test_case(self):
    # Swap
    aug = naw.RandomWordAug(action='swap')
    self.assertEqual('bB aA', aug.augment('aA bB'))

    data = 'I love McDonalds'
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 1, 0, 1).get_augmented_tokens()
    self.assertEqual(['Love', 'I', 'McDonalds'], augmented_tokens)
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 0, 1, 1).get_augmented_tokens()
    self.assertEqual(['Love', 'I', 'McDonalds'], augmented_tokens)

    data = 'He loves McDonalds'
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 1, 0, 1).get_augmented_tokens()
    self.assertEqual(['Loves', 'he', 'McDonalds'], augmented_tokens)
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 0, 1, 1).get_augmented_tokens()
    self.assertEqual(['Loves', 'he', 'McDonalds'], augmented_tokens)
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 2, 1, 1).get_augmented_tokens()
    self.assertEqual(['He', 'McDonalds', 'loves'], augmented_tokens)

    # Insert
    aug = naw.TfIdfAug(model_path=self.tfidf_model_path, action='insert')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('Good')
        if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
            expected = True
            break
    self.assertTrue(expected)

    # Substitute
    aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Abc love':
            expected = True
            break
    self.assertTrue(expected)

    aug = naw.AntonymAug()
    self.assertEqual('Unhappy', aug.augment('Happy'))

    # Do not change if target word is non-lower
    aug = naw.SpellingAug()
    self.assertEqual('RE', aug.augment('Re'))

    # Delete case
    aug = naw.RandomWordAug(action='delete')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Love':
            expected = True
            break
    self.assertTrue(expected)
def augmentation(text, insert=False, substitute=False, swap=True, delete=True):
    augs = []

    if insert:
        aug = naw.WordEmbsAug(
            model_type='word2vec',
            model_path='/media/jionie/my_disk/Kaggle/Tweet/model/word2vec/GoogleNews-vectors-negative300.bin',
            action="insert")
        augs.append(aug)

    if substitute:
        aug_sub = naw.SynonymAug(aug_src='wordnet')
        augs.append(aug_sub)

    if swap:
        aug_swap = naw.RandomWordAug(action="swap")
        augs.append(aug_swap)

    if delete:
        aug_del = naw.RandomWordAug()
        augs.append(aug_del)

    aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
    # print("before aug:", text)
    text = aug.augment(text, n=1)
    # print("after aug:", text)

    return text
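# Hypothetical usage sketch for augmentation() above (not part of the original
# source): it assumes nlpaug is imported as `naw`/`naf` and that nltk's wordnet
# data is available for SynonymAug; the word2vec binary is only needed when
# insert=True is requested.
sample = "the quick brown fox jumps over the lazy dog"
print(augmentation(sample, insert=False, substitute=True, swap=True, delete=True))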
def data_augment(corpus, label):
    syn_aug = naw.SynonymAug(aug_src="wordnet")
    rand_aug = naw.RandomWordAug(action="swap")

    data_struc = {'emotion_label': [], 'emotion_text': []}
    aug_dataframe = pd.DataFrame(data_struc)

    print('Augmenting data')
    # NOTE: DataFrame.append was removed in pandas 2.0; this code assumes an older pandas.
    for label, sentence in zip(label, corpus):
        if sentence.find("\n") > 0:
            sentence = sentence.replace("\n", "")

        # Original sentence
        aug_dataframe = aug_dataframe.append(
            {'emotion_label': label, 'emotion_text': sentence},
            ignore_index=True)

        # WordNet synonym substitution
        augmented_sent = syn_aug.augment(sentence)
        aug_dataframe = aug_dataframe.append(
            {'emotion_label': label, 'emotion_text': augmented_sent},
            ignore_index=True)

        # Random word swap
        augmented_sent1 = rand_aug.augment(sentence)
        aug_dataframe = aug_dataframe.append(
            {'emotion_label': label, 'emotion_text': augmented_sent1},
            ignore_index=True)

    print('Augmentation Completed')
    return aug_dataframe['emotion_text'], aug_dataframe['emotion_label']
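# Hypothetical call site (not in the original source): assumes parallel lists of
# texts and labels, pandas imported as pd, and nltk's wordnet corpus downloaded
# for SynonymAug.
texts = ["i am so happy today", "this is terrible news"]
labels = ["joy", "sadness"]
aug_texts, aug_labels = data_augment(texts, labels)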
def test_empty_input_for_delete(self):
    text = ' '
    augs = [
        naw.RandomWordAug(action="delete"),
        naw.RandomWordAug(action="delete", stopwords=['a', 'an', 'the'])
    ]

    for aug in augs:
        augmented_text = aug.augment(text)
        # FIXME: standardize return
        is_equal = augmented_text == '' or augmented_text == ' '
        self.assertTrue(is_equal)
def train_eval_dataset(dataset: pd.DataFrame, lang="ita", expansion=10):
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')
    nltk.download('omw')

    flow = naf.Sometimes([
        naw.SynonymAug(lang=lang, aug_min=10),
        naw.RandomWordAug("swap"),
        naw.RandomWordAug("delete"),
        nac.KeyboardAug()
    ])

    train_after_exp = []
    dev_after_exp = []
    for idx, row in dataset.iterrows():
        logging.info("[{}/{}] {}".format(idx, len(dataset), row["question"]))
        new_text = [new for new in flow.augment(row["question"], n=expansion)]
        train_after_exp.append({"label": row["question_id"], "text": row["question"]})

        # Keep 80% of the augmented variants for training, the rest for dev.
        th = int(len(new_text) * 0.8)
        for text in new_text[:th]:
            train_after_exp.append({"label": row["question_id"], "text": text})
        for text in new_text[th:]:
            dev_after_exp.append({"label": row["question_id"], "text": text})

    train = pd.DataFrame(train_after_exp).sample(frac=1.0)
    dev = pd.DataFrame(dev_after_exp).sample(frac=1.0)
    return train, dev
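# Hypothetical usage (not in the original source): assumes a DataFrame with
# "question" and "question_id" columns and that nlpaug/nltk are importable as
# naf, naw, nac, nltk in this module. The function itself triggers the nltk
# downloads it needs.
faq = pd.DataFrame({
    "question_id": [0, 1],
    "question": ["come posso resettare la password?", "dove trovo la fattura?"]
})
train_df, dev_df = train_eval_dataset(faq, lang="ita", expansion=4)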
def main(config):
    infile = Path(config.infile)
    if not infile.is_file():
        raise FileNotFoundError

    pattern = re.compile('(train|test|val|dev).txt')
    phase = pattern.findall(infile.name)[0]

    fin = codecs.open(infile, 'r', 'utf-8')
    txt = fin.read()
    fin.close()

    outdir = Path(config.outdir)
    if not outdir.is_dir():
        outdir.mkdir(parents=True, exist_ok=True)
    outfile = outdir / f'randswap_aug_{phase}.txt'
    fout = codecs.open(outfile, 'w', 'utf-8')

    aug = naw.RandomWordAug(action='swap')
    lines = []
    for line in txt.split('\n'):
        lines.append(line)
        for _ in range(config.num_swaps):
            augmented_text = aug.augment(line)
            lines.append(augmented_text)

    fout.writelines(f"{line}\n" for line in lines)
    fout.close()
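# Hypothetical entry point (not in the original source) showing one way the
# `config` object above could be built; the flag names mirror the attributes
# main() reads (infile, outdir, num_swaps) but are otherwise assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Random-swap augmentation of a text file')
    parser.add_argument('--infile', required=True, help='path to a train/dev/val/test .txt file')
    parser.add_argument('--outdir', required=True, help='directory for randswap_aug_<phase>.txt')
    parser.add_argument('--num_swaps', type=int, default=1, help='augmented copies per line')
    main(parser.parse_args())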
def test_swap(self): texts = ['The quick brown fox jumps over the lazy dog'] aug = naw.RandomWordAug(action="swap") for text in texts: augmented_text = aug.augment(text) self.assertNotEqual(text, augmented_text)
def test_empty_input_substitute(self):
    texts = ['', ' ']
    self.word2vec_model.action = 'substitute'
    self.context_word_embs_model.action = 'substitute'
    augs = [
        naw.SpellingAug(),
        naw.AntonymAug(),
        naw.RandomWordAug(action='substitute'),
        naw.SynonymAug(aug_src='wordnet'),
        naw.TfIdfAug(model_path=self.tfidf_model_path, action="substitute"),
        self.word2vec_model,
        self.context_word_embs_model
    ]

    for aug in augs:
        for text in texts:
            augmented_text = aug.augment(text)
            self.assertTrue(augmented_text is None or augmented_text.strip() == '')

        augmented_texts = aug.augment(texts)
        for augmented_text in augmented_texts:
            self.assertTrue(augmented_text is None or augmented_text.strip() == '')
def test_swap(self):
    texts = [
        'The quick brown fox jumps over the lazy dog'
    ]
    aug = naw.RandomWordAug(action="swap")

    for text in texts:
        tokens = text.lower().split(' ')
        orig_token_freq = {}
        for w in tokens:
            orig_token_freq[w] = tokens.count(w)

        augmented_text = text
        # https://github.com/makcedward/nlpaug/issues/77
        for i in range(10):
            augmented_text = aug.augment(augmented_text)

        aug_tokens = augmented_text.lower().split(' ')
        aug_token_freq = {}
        for w in tokens:
            aug_token_freq[w] = aug_tokens.count(w)

        for orig_token, orig_freq in orig_token_freq.items():
            self.assertTrue(orig_token in aug_token_freq)
            self.assertTrue(aug_token_freq[orig_token] == orig_freq)

        self.assertNotEqual(text, augmented_text)
def random_word_swap(text):
    # Random Word Augmenter
    # Swap word randomly
    aug = naw.RandomWordAug(action="swap")
    attacked_text = aug.augment(text)
    print("Attacked Text:")
    print(attacked_text)
def random_deletion(text, p=ALPHA):
    """
    Randomly remove each word in the sentence with probability p=0.05
    """
    aug = naw.RandomWordAug(action='delete', aug_p=p)
    augmented_text = aug.augment(text)
    return augmented_text
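# Hypothetical usage (not in the original source). ALPHA is defined elsewhere in
# the original module; the docstring above suggests it is the deletion
# probability 0.05, so the probability is passed explicitly here.
print(random_deletion("the quick brown fox jumps over the lazy dog", p=0.05))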
def test_multiple_actions(self):
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584'
    ]

    flows = [
        naf.Sequential(
            [nac.RandomCharAug(action=Action.INSERT), naw.RandomWordAug()]),
        naf.Sequential([
            nac.OcrAug(),
            nac.KeyboardAug(aug_min=1),
            nac.RandomCharAug(action=Action.SUBSTITUTE, aug_min=1, aug_char_p=0.6, aug_word_p=0.6)
        ])
    ]

    for flow in flows:
        for text in texts:
            augmented_text = flow.augment(text)

            self.assertNotEqual(text, augmented_text)
            self.assertLess(0, len(text))

    self.assertLess(0, len(texts))
    self.assertLess(0, len(flows))
def augment_by_class(df, max_n):
    # Count token frequencies over the 'Body' column.
    word_index = {}
    for phrase in df['Body']:
        words = phrase.split(' ')
        for word in words:
            word_index[word] = word_index.get(word, 0) + 1

    index_df = pd.DataFrame([{'token': i, 'count': word_index[i]} for i in word_index])
    index_df = index_df[index_df['count'] >= 10]

    # Frequent tokens are passed as stopwords so SynonymAug leaves them untouched.
    # Converted to a plain list: `in` on a pandas Series checks the index, not the values.
    aug = naw.SynonymAug(stopwords=index_df['token'].tolist())
    aug2 = naw.RandomWordAug()
    factor = (max_n // len(df)) + 2

    result = set()
    for phrase in df['Body']:
        result.add(phrase)
        print(f'Augmenting for {phrase}')
        for item in aug.augment(phrase, n=factor):
            result.add(item)
        for item in aug2.augment(phrase, n=2):
            result.add(item)

    return list(result)
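# Hypothetical usage (not in the original source), assuming a DataFrame with a
# 'Body' text column, pandas imported as pd, and wordnet available for SynonymAug.
emails = pd.DataFrame({'Body': ['please reset my password', 'invoice attached for march']})
augmented_bodies = augment_by_class(emails, max_n=10)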
def random_swap_helper(text):
    """
    Randomly choose two words in the sentence and swap their positions.
    """
    aug = naw.RandomWordAug(action='swap', aug_min=1, aug_max=1)
    augmented_text = aug.augment(text)
    return augmented_text
def augment(lines, params):
    """
    Contextual WordEmbs Augmentation with nlpaug for a list of Strings

    Args:
        lines: (List of Strings)
        params: (Dictionary) aug_max arguments
    Returns:
        (List of Strings) new strings
    """
    # Contextual WordEmbs augmentation pipeline
    aug = naf.Sequential([
        ContextualWordEmbsAug(action=Action.INSERT, aug_max=params['contextual_max']),
        ContextualWordEmbsAug(action=Action.SUBSTITUTE, aug_max=params['contextual_max']),
        naw.RandomWordAug(aug_max=params['ramdom_max'])
    ])

    augmented = []
    total_batches = len(lines) // 100 + 1
    for i in range(total_batches):
        if i % 100 == 0:
            logger.info("Augmenting the {} th batch, {}%".format(i, round(i / total_batches * 100)))
        # The last batch takes whatever lines remain.
        if i == total_batches - 1:
            sub_line = lines[100 * i:]
        else:
            sub_line = lines[100 * i: 100 * (i + 1)]
        sub_aug = aug.augment(sub_line, num_thread=mp.cpu_count() - 1)
        augmented = augmented + sub_aug

    return augmented
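# Hypothetical usage (not in the original source): the dictionary keys match what
# augment() reads above, including the 'ramdom_max' spelling it expects. Running
# this downloads the default contextual model used by ContextualWordEmbsAug.
params = {'contextual_max': 3, 'ramdom_max': 2}
new_lines = augment(["the quick brown fox", "a lazy dog sleeps"], params)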
def test_delete_one_token(self):
    texts = ['The']
    aug = naw.RandomWordAug(action='delete')
    for text in texts:
        augmented_text = aug.augment(text)
        self.assertEqual(text, augmented_text)
def test_case(self):
    # Swap
    aug = naw.RandomWordAug(action='swap')
    self.assertEqual('bB aA', aug.augment('aA bB'))

    self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 1, 0))
    self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 0, 1))
    self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 1, 0))
    self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 0, 1))
    self.assertEqual(['He', 'McDonalds', 'loves'], aug.change_case('He loves McDonalds'.split(' '), 2, 1))

    # Insert
    aug = naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action='insert')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('Good')
        if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
            expected = True
            break
    self.assertTrue(expected)

    # Substitute
    aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Abc love':
            expected = True
            break
    self.assertTrue(expected)

    aug = naw.AntonymAug()
    self.assertEqual('Unhappy', aug.augment('Happy'))

    # Do not change if target word is non-lower
    aug = naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
    self.assertEqual('RE', aug.augment('Re'))

    # Delete case
    aug = naw.RandomWordAug(action='delete')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Love':
            expected = True
            break
    self.assertTrue(expected)
def test_empty(self):
    texts = ['', None, []]
    aug = naw.RandomWordAug()
    for text in texts:
        augmented_text = aug.augment(text)
        self.assertEqual(text, augmented_text)
def test_empty_input_for_delete(self):
    texts = ['', ' ', None]
    augs = [
        naw.RandomWordAug(action="delete"),
        naw.RandomWordAug(action="delete", stopwords=['a', 'an', 'the'])
    ]

    for aug in augs:
        for text in texts:
            augmented_text = aug.augment(text)
            self.assertTrue(augmented_text is None or augmented_text.strip() == '')

        augmented_texts = aug.augment(texts)
        for augmented_text in augmented_texts:
            self.assertTrue(augmented_text is None or augmented_text.strip() == '')
def test_empty_input_for_swap(self):
    texts = [' ']
    aug = naw.RandomWordAug(action=Action.SWAP)
    for text in texts:
        augmented_text = aug.augment(text)
        self.assertEqual(text, augmented_text)

    self.assertEqual(1, len(texts))

    tokens = [None]
    aug = naw.RandomWordAug(action=Action.SWAP)
    for t in tokens:
        augmented_text = aug.augment(t)
        self.assertEqual(augmented_text, None)

    self.assertEqual(len(tokens), 1)
def augmentation(self, text, insert=False, substitute=False, swap=True, delete=True):
    augs = []

    if insert:
        # aug = naw.ContextualWordEmbsAug(
        #     model_path=self.model_type, action="insert", device='cuda')
        # wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
        aug = naw.WordEmbsAug(
            model_type='word2vec',
            model_path='/C:/Users/admin/Documents/Nitin/mycodes/kaggle_google_quest_qna/data/helpers/word2vec/GoogleNews-vectors-negative300.bin',
            action="insert")
        augs.append(aug)

    if substitute:
        # aug = naw.ContextualWordEmbsAug(
        #     model_path=self.model_type, action="substitute", device='cuda')
        # aug = naw.WordEmbsAug(
        #     model_type='word2vec',
        #     model_path='/media/jionie/my_disk/Kaggle/Google_Quest_Answer/model/word2vec/GoogleNews-vectors-negative300.bin',
        #     action="substitute")
        aug_sub = naw.SynonymAug(aug_src='wordnet')
        augs.append(aug_sub)
        # text = aug.augment(text)

    if swap:
        aug_swap = naw.RandomWordAug(action="swap")
        augs.append(aug_swap)
        # text = aug.augment(text)

    if delete:
        aug_del = naw.RandomWordAug()
        augs.append(aug_del)
        # text = aug.augment(text)

    aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
    # print("before aug:", text)
    text = aug.augment(text, n=1)
    # print("after aug:", text)

    return text
def test_substitute_without_target_word(self):
    texts = ['The quick brown fox jumps over the lazy dog']
    aug = naw.RandomWordAug(action='substitute')
    for text in texts:
        augmented_text = aug.augment(text)
        self.assertIn('_', augmented_text)
        self.assertNotEqual(text, augmented_text)
def __init__(self):
    aug0 = naw.RandomWordAug()
    aug1 = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
    aug2 = naw.SynonymAug(aug_src='wordnet')
    aug3 = naw.SplitAug()
    aug4 = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")
    self.augs = [aug0, aug1, aug2, aug3, aug4]
def augment(self, example):
    augs = [self.augs.augment(example) for _ in range(self.num_of_samples)]
    if self.swap:
        swap_aug = naw.RandomWordAug(action="swap")
        augs_ = list(augs)
        for i in augs_:
            for _ in range(self.num_of_samples):
                swapped = swap_aug.augment(i)
                augs.append(swapped)
    return augs
def test_crop(self):
    texts = ['The quick brown fox jumps over the lazy dog']
    aug = naw.RandomWordAug(action='crop')
    for text in texts:
        orig_tokens = text.split(' ')
        augmented_text = aug.augment(text)
        aug_tokens = augmented_text.split(' ')
        self.assertGreater(len(orig_tokens), len(aug_tokens))
def test_empty_input_for_delete(self):
    texts = ['']
    aug = naw.RandomWordAug(action=Action.DELETE)
    for text in texts:
        augmented_text = aug.augment(text)
        self.assertEqual(text, augmented_text)

    self.assertEqual(1, len(texts))
    self.assertEqual(0, len(texts[0]))
def prepare_aug():
    # Contextual Word Embeddings Augmenter, Substitute word by contextual word embeddings
    neu_aug = []
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute"))

    # Synonym Augmenter, Substitute word by WordNet's synonym
    syn_aug = []
    syn_aug.append(naw.SynonymAug(aug_src='wordnet'))
    syn_aug.append(
        naw.SynonymAug(
            aug_src='ppdb',
            model_path='/home/ubuntu/sentiment_analysis/bert-sentiment/syn_model/ppdb-2.0-tldr'))

    # Antonym Augmenter
    ant_aug = []
    ant_aug.append(naw.AntonymAug())

    # Random Word Augmenter
    random_aug = []
    random_aug.append(naw.RandomWordAug(action="swap"))
    random_aug.append(naw.RandomWordAug())

    print('augmenter initialization finished ...')

    aug = []
    aug.extend(neu_aug)
    aug.extend(syn_aug)
    aug.extend(ant_aug)
    aug.extend(random_aug)
    return aug
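# Hypothetical usage (not in the original source): prepare_aug() returns a plain
# list of augmenters, so each one is applied separately here. The BERT/DistilBERT/
# RoBERTa downloads and the ppdb model path above are assumed to be available.
augmenters = prepare_aug()
for a in augmenters:
    print(a.augment("the service was surprisingly good"))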
def test_random_word_delete(self):
    texts = ['The quick brown fox jumps over the lazy dog']
    aug = naw.RandomWordAug()
    for text in texts:
        self.assertLess(0, len(text))
        augmented_text = aug.augment(text)
        self.assertNotEqual(text, augmented_text)

    self.assertLess(0, len(texts))
def test_swap(self):
    texts = ['The quick brown fox jumps over the lazy dog']
    aug = naw.RandomWordAug(action=Action.SWAP)
    for text in texts:
        self.assertLess(0, len(text))
        augmented_text = aug.augment(text)
        self.assertNotEqual(text, augmented_text)

    self.assertLess(0, len(texts))
def test_empty_input_for_swap(self):
    texts = ['', ' ', None]
    aug = naw.RandomWordAug(action="swap")
    for text in texts:
        augmented_text = aug.augment(text)
        self.assertTrue(augmented_text is None or augmented_text.strip() == '')

    augmented_texts = aug.augment(texts)
    for augmented_text in augmented_texts:
        self.assertTrue(augmented_text is None or augmented_text.strip() == '')