def test_empty_input_substitute(self):
    texts = ['', ' ']
    self.word2vec_model.action = 'substitute'
    self.context_word_embs_model.action = 'substitute'
    augs = [
        naw.SpellingAug(),
        naw.AntonymAug(),
        naw.RandomWordAug(action='substitute'),
        naw.SynonymAug(aug_src='wordnet'),
        naw.TfIdfAug(model_path=self.tfidf_model_path, action="substitute"),
        self.word2vec_model,
        self.context_word_embs_model
    ]

    for aug in augs:
        for text in texts:
            augmented_text = aug.augment(text)
            self.assertTrue(augmented_text is None or augmented_text.strip() == '')

        augmented_texts = aug.augment(texts)
        for augmented_text in augmented_texts:
            self.assertTrue(augmented_text is None or augmented_text.strip() == '')
def test_case(self):
    # Swap
    aug = naw.RandomWordAug(action='swap')
    self.assertEqual('bB aA', aug.augment('aA bB'))

    data = 'I love McDonalds'
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 1, 0, 1).get_augmented_tokens()
    self.assertEqual(['Love', 'I', 'McDonalds'], augmented_tokens)

    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 0, 1, 1).get_augmented_tokens()
    self.assertEqual(['Love', 'I', 'McDonalds'], augmented_tokens)

    data = 'He loves McDonalds'
    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 1, 0, 1).get_augmented_tokens()
    self.assertEqual(['Loves', 'he', 'McDonalds'], augmented_tokens)

    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 0, 1, 1).get_augmented_tokens()
    self.assertEqual(['Loves', 'he', 'McDonalds'], augmented_tokens)

    doc = Doc(data, aug.tokenizer(data))
    augmented_tokens = aug.change_case(doc, 2, 1, 1).get_augmented_tokens()
    self.assertEqual(['He', 'McDonalds', 'loves'], augmented_tokens)

    # Insert
    aug = naw.TfIdfAug(model_path=self.tfidf_model_path, action='insert')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('Good')
        if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
            expected = True
            break
    self.assertTrue(expected)

    # Substitute
    aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Abc love':
            expected = True
            break
    self.assertTrue(expected)

    aug = naw.AntonymAug()
    self.assertEqual('Unhappy', aug.augment('Happy'))

    # Do not change if target word is non-lower
    aug = naw.SpellingAug()
    self.assertEqual('RE', aug.augment('Re'))

    # Delete case
    aug = naw.RandomWordAug(action='delete')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Love':
            expected = True
            break
    self.assertTrue(expected)
def antonym_subsi(text):
    # Antonym Augmenter
    # Substitute word by antonym
    aug = naw.AntonymAug()
    attacked_text = aug.augment(text)
    print("Attacked Text:")
    print(attacked_text)
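# Usage sketch for the helper above; the sample sentence is illustrative and not taken
# from the original source.
antonym_subsi('The quick brown fox jumps over the lazy dog')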
def test_non_strip_input(self):
    text = ' Good boy '

    augs = [
        naw.ContextualWordEmbsAug(action='insert'),
        naw.AntonymAug(),
        naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action="substitute")
    ]

    for aug in augs:
        augmented_text = aug.augment(text)
        self.assertNotEqual(text, augmented_text)
def test_skip_punctuation(self):
    text = '. . . . ! ? # @'

    augs = [
        naw.ContextualWordEmbsAug(action='insert'),
        naw.AntonymAug(),
        naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action="substitute")
    ]

    for aug in augs:
        augmented_text = aug.augment(text)
        self.assertEqual(text, augmented_text)
def test_skip_punctuation(self):
    text = '. . . . ! ? # @'

    augs = [
        # naw.ContextualWordEmbsAug(action='insert'),  # After using convert_tokens_to_ids and the decode function, it cannot keep the original format.
        naw.AntonymAug(),
        naw.TfIdfAug(model_path=self.tfidf_model_path, action="substitute")
    ]

    for aug in augs:
        augmented_text = aug.augment(text)
        self.assertEqual(text, augmented_text)
def test_excessive_space(self):
    # https://github.com/makcedward/nlpaug/issues/48
    text = 'The quick brown fox jumps over the lazy dog . 1 2 '
    expected_result = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', '1', '2']

    augs = [
        naw.ContextualWordEmbsAug(action='insert'),
        naw.AntonymAug(),
        naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action="substitute")
    ]

    for aug in augs:
        tokenized_text = aug._tokenizer(text)
        self.assertEqual(tokenized_text, expected_result)
def test_case(self):
    # Swap
    aug = naw.RandomWordAug(action='swap')
    self.assertEqual('bB aA', aug.augment('aA bB'))
    self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 1, 0))
    self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 0, 1))
    self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 1, 0))
    self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 0, 1))
    self.assertEqual(['He', 'McDonalds', 'loves'], aug.change_case('He loves McDonalds'.split(' '), 2, 1))

    # Insert
    aug = naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action='insert')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('Good')
        if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
            expected = True
            break
    self.assertTrue(expected)

    # Substitute
    aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Abc love':
            expected = True
            break
    self.assertTrue(expected)

    aug = naw.AntonymAug()
    self.assertEqual('Unhappy', aug.augment('Happy'))

    # Do not change if target word is non-lower
    aug = naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
    self.assertEqual('RE', aug.augment('Re'))

    # Delete case
    aug = naw.RandomWordAug(action='delete')
    expected = False
    for i in range(10):
        augmented_text = aug.augment('I love')
        if augmented_text == 'Love':
            expected = True
            break
    self.assertTrue(expected)
def __init__(self):
    antAug = naw.AntonymAug()
    synAug = naw.SynonymAug(aug_src='wordnet')
    embAug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")

    self.model_dict = {
        0: antAug,
        1: synAug,
        2: embAug
    }
    self.output_data = {
        'Sentence1': [],
        'Sentence2': [],
        'Label': []
    }
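# Usage sketch (an assumption about how the model_dict above might be used; not taken
# from the original class): pick one augmenter at random per sentence pair.
#
#   idx = random.randrange(len(self.model_dict))
#   aug_sentence1 = self.model_dict[idx].augment(sentence1)
#   aug_sentence2 = self.model_dict[idx].augment(sentence2)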
def prepare_aug():
    # Contextual Word Embeddings Augmenter, Substitute word by contextual word embeddings
    neu_aug = []
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute"))

    # Synonym Augmenter, Substitute word by WordNet's synonym
    syn_aug = []
    syn_aug.append(naw.SynonymAug(aug_src='wordnet'))
    syn_aug.append(
        naw.SynonymAug(
            aug_src='ppdb',
            model_path='/home/ubuntu/sentiment_analysis/bert-sentiment/syn_model/ppdb-2.0-tldr'))

    # Antonym Augmenter
    ant_aug = []
    ant_aug.append(naw.AntonymAug())

    # Random Word Augmenter
    random_aug = []
    random_aug.append(naw.RandomWordAug(action="swap"))
    random_aug.append(naw.RandomWordAug())

    print('augmenter initialization finished ...')

    aug = []
    aug.extend(neu_aug)
    aug.extend(syn_aug)
    aug.extend(ant_aug)
    aug.extend(random_aug)
    return aug
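# Usage sketch for prepare_aug() above: apply each augmenter once to every input sentence
# to expand a small dataset. The sample sentence is illustrative and not from the original.
if __name__ == '__main__':
    augmenters = prepare_aug()
    sentences = ['The quick brown fox jumps over the lazy dog']
    augmented = []
    for augmenter in augmenters:
        for sentence in sentences:
            augmented.append(augmenter.augment(sentence))
    print(augmented)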
def setUpClass(cls):
    env_config_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env'))
    load_dotenv(env_config_path)

    cls.augs = [naw.AntonymAug()]
import os

import nlpaug.augmenter.word as naw
from nlpaug.util import Action

os.environ["MODEL_DIR"] = '../model'
model_dir = os.environ.get("MODEL_DIR")

aug = naw.AntonymAug()
_text = 'The quick brown fox jumps over the lazy dog'
augmented_text = aug.augment(_text)
print("Original:")
print(_text)
print("Antonym Text:")
print(augmented_text)

aug = naw.SynonymAug(aug_src='wordnet')
text = 'The quick brown fox jumps over the lazy dog.'
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Synonym Text:")
print(augmented_text)

aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
text = 'The quick brown fox jumps over the lazy dog.'
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("BERT Embed Text:")
print(augmented_text)
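# Optional sketch: nlpaug word augmenters such as AntonymAug also accept parameters like
# aug_p and aug_max to control how many words are changed; the values below are
# illustrative only, not from the original script.
aug = naw.AntonymAug(aug_p=0.3, aug_max=3)
print(aug.augment(_text))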
import random

import torch

import nlpaug.augmenter.word as nas
from util import *

if __name__ == '__main__':
    random_seed(random.randint(0, 100000))

    # create configuration file
    config = Config()

    # create text embedder
    embedder = Embedder(config.embedding_length)

    augmenter = None
    if config.augment:
        augmenter = nas.AntonymAug()

    # preprocess data and create wos2class.text.json and wos2class.train.json
    if not config.use_existing_data:
        data_manager = DataManager(config, augmenter)
        data_manager.preprocess_data()
        data_manager.create_train_test_jsonfile()
        data_manager.count_labels()

    # create dataset and dataloaders
    train_dataset = WOSDataset(config, embedder, is_train=True)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    train_iter = iter(train_dataloader)

    test_dataset = WOSDataset(config, embedder, is_train=False)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=config.batch_size, shuffle=True)