Example #1
    def test_multilingual(self):
        # import nltk
        # nltk.download('omw')
        # French
        text = 'chien'
        expected_texts = [
            'cliquer', 'clic', 'aboyeur', 'hot dog', 'franc',
            'canis familiaris', 'achille', 'toutou', 'cliquet', 'clébard',
            'talon', 'chienchien', 'quignon', 'chien de chasse'
        ]
        aug = naw.SynonymAug(aug_src='wordnet', lang='fra')
        augmented_text = aug.augment(text)
        self.assertTrue(augmented_text in expected_texts)

        expected_texts = [
            'toutou', 'maître chien', 'clébard', 'dog', 'chienne', 'chiens',
            'chiot', 'cynophiles', 'clebs'
        ]
        model_path = os.path.join(os.environ.get("MODEL_DIR"), 'word', 'ppdb',
                                  'ppdb-1.0-s-lexical-french')
        aug = naw.SynonymAug(aug_src='ppdb', model_path=model_path)
        augmented_text = aug.augment(text)
        self.assertTrue(augmented_text in expected_texts)

        # Spanish
        text = 'Un rápido zorro marrón salta sobre el perro perezoso'
        aug = naw.SynonymAug(aug_src='wordnet', lang='spa')
        for _ in range(10):
            augmented_text = aug.augment(text)
            if augmented_text != text:
                break

        self.assertNotEqual(augmented_text, text)
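The commented-out nltk.download calls at the top of this test hint at the prerequisite for multilingual lookups: the Open Multilingual WordNet corpus must be installed. A minimal standalone sketch, assuming nlpaug and nltk are available (the exact synonyms returned depend on the installed WordNet data and nlpaug version):

import nltk
import nlpaug.augmenter.word as naw

nltk.download('wordnet')
nltk.download('omw-1.4')  # Open Multilingual WordNet; older nltk releases use 'omw'

aug = naw.SynonymAug(aug_src='wordnet', lang='fra')
print(aug.augment('chien'))  # e.g. 'toutou'; recent nlpaug versions return a list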
Example #2
    def test_reload(self):
        text = 'The quick brown fox jumps over the lazy dog'

        aug = naw.SynonymAug(aug_src='wordnet')
        self.assertNotEqual(text, aug.augment(text))

        model_path = os.path.join(os.environ.get("MODEL_DIR"), 'word', 'ppdb', 'ppdb-2.0-s-all')
        aug2 = naw.SynonymAug(aug_src='ppdb', model_path=model_path)
        self.assertNotEqual(text, aug2.augment(text))
Example #3
    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)

        cls.augs = [
            naw.SynonymAug(aug_src='wordnet'),
            naw.SynonymAug(aug_src='ppdb',
                           model_path=os.path.join(
                               os.environ.get("MODEL_DIR"), 'ppdb-2.0-s-all.txt'))
        ]
Example #4
class QuestionGenerator:
    """Class contains logic for augmenting text"""

    aug = naw.ContextualWordEmbsAug(model_path="bert-base-uncased",
                                    action="substitute")
    aug_single = naw.SynonymAug(aug_src="wordnet")

    @staticmethod
    def augment(text):
        """
        checks whether to apply synonym or contextual augmentation

        :param text:
        :return:
        """
        tokens = split_sentence(re.sub("[^a-zA-Z0-9 ]+", "", text))
        if len(tokens) > 1:
            return QuestionGenerator.aug.augment(text, n=10, num_thread=4)
        else:
            return QuestionGenerator.aug_single.augment(tokens[0], n=10)

    @staticmethod
    async def generateQuestions(texts):
        """
        generates a list of variations for a given sentence/question

        :param texts: list of text
        :return: list of variations
        """
        if isinstance(texts, str):
            texts = [texts]

        result = [QuestionGenerator.augment(text) for text in texts]

        return sum(result, [])
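A hypothetical usage sketch for the class above (it assumes the module defining QuestionGenerator and split_sentence is importable; the generated variations depend on the downloaded BERT model):

import asyncio

async def demo():
    variations = await QuestionGenerator.generateQuestions('How do I reset my password?')
    for variation in variations:
        print(variation)

asyncio.run(demo())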
Example #5
    def test_empty_input_substitute(self):
        texts = ['', '           ']

        self.word2vec_model.action = 'substitute'
        self.context_word_embs_model.action = 'substitute'

        augs = [
            naw.SpellingAug(),
            naw.AntonymAug(),
            naw.RandomWordAug(action='substitute'),
            naw.SynonymAug(aug_src='wordnet'),
            naw.TfIdfAug(model_path=self.tfidf_model_path,
                         action="substitute"), self.word2vec_model,
            self.context_word_embs_model
        ]

        for aug in augs:
            for text in texts:
                augmented_text = aug.augment(text)
                self.assertTrue(augmented_text is None
                                or augmented_text.strip() == '')

            augmented_texts = aug.augment(texts)
            for augmented_text in augmented_texts:
                self.assertTrue(augmented_text is None
                                or augmented_text.strip() == '')
Example #6
class QuestionGenerator:
    aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                    action="substitute")
    aug_single = naw.SynonymAug(aug_src='wordnet')

    @staticmethod
    def augment(text):
        tokens = split_sentence(re.sub('[^a-zA-Z0-9 ]+', '', text))
        if len(tokens) > 1:
            return QuestionGenerator.aug.augment(text, n=10, num_thread=4)
        else:
            return QuestionGenerator.aug_single.augment(tokens[0], n=10)

    @staticmethod
    async def generateQuestions(texts):
        """ This function generates a list of variations for a given sentence/question.
            E.g. await QuestionGenerator.generateQuestions('your question') will return the list
            of variations for that particular question """

        if isinstance(texts, str):
            texts = [texts]

        result = [QuestionGenerator.augment(text) for text in texts]

        return sum(result, [])
Example #7
def augmentation(text, insert=False, substitute=False, swap=True, delete=True):
    augs = []

    if insert:
        aug = naw.WordEmbsAug(
            model_type='word2vec',
            model_path=
            '/media/jionie/my_disk/Kaggle/Tweet/model/word2vec/GoogleNews-vectors-negative300.bin',
            action="insert")
        augs.append(aug)

    if substitute:
        aug_sub = naw.SynonymAug(aug_src='wordnet')
        augs.append(aug_sub)

    if swap:
        aug_swap = naw.RandomWordAug(action="swap")
        augs.append(aug_swap)

    if delete:
        aug_del = naw.RandomWordAug()
        augs.append(aug_del)

    aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
    # print("before aug:", text)
    text = aug.augment(text, n=1)
    # print("after aug:", text)

    return text
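A minimal usage sketch for the pipeline above (assumes nlpaug is installed; passing insert=True additionally requires the GoogleNews word2vec binary at the hard-coded path):

text = 'The quick brown fox jumps over the lazy dog'
print(augmentation(text))                   # swap + delete enabled by default
print(augmentation(text, substitute=True))  # also substitute WordNet synonyms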
Example #8
def augment_dataset(csv, model_dir):
    """
    Conduct two processes of augmentation:
    1. Synonym augmentation
    2. Word embedding augmentation
    """
    original = pd.read_csv(csv)

    syn_df = original.copy()
    syn_aug = naw.SynonymAug(aug_src='wordnet')

    # synonym augmenter (simple version)
    for i, query in enumerate(syn_df.src):
        synonym = syn_aug.augment(query)
        syn_df.at[i, 'src'] = synonym

    #word embedding augmenter
    word_df = original.copy()
    embed_aug = naw.WordEmbsAug(model_type='fasttext',
                                model_path=model_dir +
                                '/wiki-news-300d-1M.vec',
                                action="insert")

    for i, query in enumerate(word_df.src):
        insertion = embed_aug.augment(query)
        word_df.at[i, 'src'] = insertion

    a1 = pd.concat([original, syn_df])
    a2 = pd.concat([a1, word_df])

    a2.to_csv(os.path.join(model_dir, 'augmented.csv'), index=False)

    return a2
Example #9
def train_eval_dataset(dataset: pd.DataFrame, lang="ita", expansion=10):
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')
    nltk.download('omw')
    flow = naf.Sometimes([
        naw.SynonymAug(lang=lang, aug_min=10),
        naw.RandomWordAug(action="swap"),
        naw.RandomWordAug(action="delete"),
        nac.KeyboardAug(),
    ])

    train_after_exp = []
    dev_after_exp = []

    for idx, row in dataset.iterrows():
        logging.info("[{}/{}] {}".format(idx, len(dataset), row["question"]))
        new_text = flow.augment(row["question"], n=expansion)
        train_after_exp.append({"label": row["question_id"], "text": row["question"]})
        th = int(len(new_text) * 0.8)
        for text in new_text[:th]:
            train_after_exp.append({"label": row["question_id"], "text": text})
        for text in new_text[th:]:
            dev_after_exp.append({"label": row["question_id"], "text": text})

    train = train_after_exp
    dev = dev_after_exp

    train = pd.DataFrame(train).sample(frac=1.0)
    dev = pd.DataFrame(dev).sample(frac=1.0)

    return train, dev
Example #10
def augment_by_class(df, max_n):
    word_index = {}

    for phrase in df['Body']:
        words = phrase.split(' ')
        for word in words:
            word_index[word] = word_index.get(word, 0) + 1

    index_df = pd.DataFrame([{
        'token': i,
        'count': word_index[i]
    } for i in word_index])
    index_df = index_df[index_df['count'] >= 10]

    aug = naw.SynonymAug(stopwords=index_df['token'].tolist())
    aug2 = naw.RandomWordAug()
    factor = (max_n // len(df)) + 2

    result = set()
    for phrase in df['Body']:
        result.add(phrase)
        print(f'Augmenting for {phrase}')
        for item in aug.augment(phrase, n=factor):
            result.add(item)
        for item in aug2.augment(phrase, n=2):
            result.add(item)

    return list(result)
Example #11
def synonym_wordnet(text):
    #Synonym Augmenter
    #Substitute word by WordNet's synonym
    aug = naw.SynonymAug(aug_src='wordnet')
    attacked_text = aug.augment(text)
    print("Attacked Text:")
    print(attacked_text)
Example #12
        def word_substitution(text, aug_src='wordnet'):
            # import nlpaug.flow as naf
            import nlpaug.augmenter.word as naw

            aug = naw.SynonymAug(aug_src=aug_src)
            augmented_text = aug.augment(text)
            return augmented_text
Example #13
    def test_multilingual(self):
        # French
        text = 'chien'
        expected_texts = [
            'cliquer', 'clic', 'aboyeur', 'hot dog', 'franc',
            'canis familiaris', 'achille', 'toutou', 'cliquet', 'clébard',
            'talon', 'chienchien', 'quignon', 'chien de chasse'
        ]
        aug = naw.SynonymAug(aug_src='wordnet', lang='fra')
        augmented_text = aug.augment(text)
        self.assertTrue(augmented_text in expected_texts)

        # Spanish
        text = 'Un rápido zorro marrón salta sobre el perro perezoso'
        aug = naw.SynonymAug(aug_src='wordnet', lang='spa')
        augmented_text = aug.augment(text)
        self.assertNotEqual(augmented_text, text)
Example #14
def data_augment(corpus, label):
    syn_aug = naw.SynonymAug(aug_src="wordnet")
    rand_aug = naw.RandomWordAug(action="swap")
    data_struc = {'emotion_label': [], 'emotion_text': []}
    aug_dataframe = pd.DataFrame(data_struc)
    print('Augmenting data')
    for label, sentence in zip(label, corpus):
        if sentence.find("\n") > 0:
            sentence = sentence.replace("\n", "")

            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': sentence
                },
                ignore_index=True)

            augmented_sent = syn_aug.augment(sentence)
            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': augmented_sent
                },
                ignore_index=True)

            augmented_sent1 = rand_aug.augment(sentence)
            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': augmented_sent1
                },
                ignore_index=True)
        else:
            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': sentence
                },
                ignore_index=True)
            augmented_sent = syn_aug.augment(sentence)
            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': augmented_sent
                },
                ignore_index=True)
            augmented_sent1 = rand_aug.augment(sentence)
            aug_dataframe = aug_dataframe.append(
                {
                    'emotion_label': label,
                    'emotion_text': augmented_sent1
                },
                ignore_index=True)
    print('Augmentation Completed')
    return aug_dataframe['emotion_text'], aug_dataframe['emotion_label']
Example #15
    def __init__(self):
        aug0 = naw.RandomWordAug()
        aug1 = naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                         action="substitute")
        aug2 = naw.SynonymAug(aug_src='wordnet')
        aug3 = naw.SplitAug()
        aug4 = naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                         action="insert")

        self.augs = [aug0, aug1, aug2, aug3, aug4]
Example #16
def prepare_aug():
    # Contextual Word Embeddings Augmenter, Substitute word by contextual word embeddings
    neu_aug = []
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                  action="insert"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                  action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased',
                                  action="substitute"))
    neu_aug.append(
        naw.ContextualWordEmbsAug(model_path='roberta-base',
                                  action="substitute"))

    # Synonym Augmenter, Substitute word by WordNet's synonym
    syn_aug = []
    syn_aug.append(naw.SynonymAug(aug_src='wordnet'))
    syn_aug.append(
        naw.SynonymAug(
            aug_src='ppdb',
            model_path=
            '/home/ubuntu/sentiment_analysis/bert-sentiment/syn_model/ppdb-2.0-tldr'
        ))

    # Antonym Augmenter
    ant_aug = []
    ant_aug.append(naw.AntonymAug())

    # Random Word Augmenter
    random_aug = []
    random_aug.append(naw.RandomWordAug(action="swap"))
    random_aug.append(naw.RandomWordAug())

    print('augmenter initialization finished ...')
    aug = []
    aug.extend(neu_aug)
    aug.extend(syn_aug)
    aug.extend(ant_aug)
    aug.extend(random_aug)
    return aug
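A short usage sketch iterating over the returned augmenters (assumes the transformer models can be downloaded; the hard-coded ppdb path would need to exist, otherwise that entry should be dropped):

augmenters = prepare_aug()
sample = 'The quick brown fox jumps over the lazy dog'
for aug in augmenters:
    print(type(aug).__name__, aug.augment(sample))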
Example #17
def synonym_replacement(text, n=N):
    """
    Randomly choose n words from the sentence that are not stop words. Replace each of these words with one of its
    synonyms chosen at random.
    """
    aug = naw.SynonymAug(aug_src='wordnet',
                         aug_min=n,
                         aug_max=n,
                         stopwords=english_stopwords)
    augmented_text = aug.augment(text)
    return augmented_text
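A usage sketch, assuming english_stopwords is defined in the same module (e.g. from NLTK's stopword list) and that this call runs there; the values shown are stand-ins:

from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

print(synonym_replacement('The quick brown fox jumps over the lazy dog', n=2))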
Example #18
def augment_data(
        num_new_class_0,
        num_new_class_1,
        clear_old_augmented_data=False,
        write_to_path='data/synonym_augmented_reddit_submissions.csv'):
    """
    Generates augmented data by producing new samples for class 0 and/or class 1, the two
    classes that are underrepresented in our dataset, and writes them to the file given by
    write_to_path (default: 'data/synonym_augmented_reddit_submissions.csv').
    Takes in:
    - num_new_class_0: Integer representing how many new samples of class 0 to generate
    - num_new_class_1: Integer representing how many new samples of class 1 to generate
    - clear_old_augmented_data: Boolean; if set to True, will overwrite the old augmented data rather than appending to it
    - write_to_path: The path of the file to write or append the new samples to.

    This function makes use of the nlpaug library's word augmenter. 
    """

    # We experimented with a couple other nlpaug models, but we ended up choosing SynonymAug
    # because it gave us the most natural-sounding and least noisy samples.
    # Other models we tried were:
    #   naw.WordEmbsAug             this one uses word2vec to find similar words for augmentation; it
    #                               ended up giving us very noisy data that made the performance of
    #                               all models decrease.
    #   naw.ContextualWordEmbsAug   this one uses BERT to do the same as the above; it was slightly
    #                               better, but still pretty noisy.
    aug = naw.SynonymAug(aug_src='wordnet')

    new_rows = []
    with open('data/reddit_submissions.csv') as f:
        reader = csv.reader(f)
        # Skip the first row that just has column names
        rows = list(reader)[1:]
        print('unfiltered rows: {}'.format(len(rows)))

        seed_rows_with_class_0 = list(
            filter(lambda r: CLASSES[r[0]] == 0, rows))
        seed_rows_with_class_1 = list(
            filter(lambda r: CLASSES[r[0]] == 1, rows))
        print('filtered rows: {}'.format(
            len(seed_rows_with_class_0) + len(seed_rows_with_class_1)))

        print('generating new data with class 0')
        create_new_rows(seed_rows_with_class_0, num_new_class_0, new_rows, aug)
        print('generating new data with class 1')
        create_new_rows(seed_rows_with_class_1, num_new_class_1, new_rows, aug)

    file_open_mode = 'w' if clear_old_augmented_data else 'a'

    with open(write_to_path, file_open_mode) as f:
        writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC, delimiter=',')
        print('writing new rows')
        writer.writerows(new_rows)
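A quick comparison sketch for the trade-off described in the comment above (WordEmbsAug is omitted because it needs a local word2vec binary; the BERT model is downloaded on first use, and outputs vary from run to run):

import nlpaug.augmenter.word as naw

sample = 'I finally managed to quit smoking after ten years'
syn_aug = naw.SynonymAug(aug_src='wordnet')
ctx_aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action='substitute')
print('wordnet:', syn_aug.augment(sample))
print('bert   :', ctx_aug.augment(sample))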
Example #19
    def test_language(self):
        text = 'chien'

        expected_texts = [
            'cliquer', 'clic', 'aboyeur', 'hot dog', 'franc',
            'canis familiaris', 'achille', 'toutou', 'cliquet', 'clébard',
            'talon', 'chienchien', 'quignon', 'chien de chasse'
        ]
        aug = naw.SynonymAug(aug_src='wordnet', lang='fra')

        augmented_text = aug.augment(text)
        self.assertTrue(augmented_text in expected_texts)
Example #20
    def fn_synonym_replacement(self):
        product_choices = list(
            product(self.database_choice, self.aug_p_choices))
        for item in product_choices:
            aug = naw.SynonymAug(aug_src=item[0][1],
                                 aug_p=item[1],
                                 stopwords=self.stopwords)
            print("\nmodelname-action-words augmented: {}-{}-{}\n".format(
                item[0][0], "substitute", item[1]))
            # `text` is assumed to be defined at module level in the original script
            augmented_text = aug.augment(text, n=self.n_words)
            print(augmented_text, "\n")

            self.write_excel(augmented_text, item, item[1])
        self.workbook.close()
Example #21
    def __init__(self,
                 aug_min=1,
                 aug_max=10,
                 aug_p=0.3,
                 tokenizer=None,
                 always_apply=False,
                 p=0.5):
        super().__init__(always_apply, p)

        self.aug = naw.SynonymAug(
            aug_min=aug_min,
            aug_max=aug_max,
            aug_p=aug_p,
            tokenizer=tokenizer,
        )
Example #22
    def augmentation(self,
                     text,
                     insert=False,
                     substitute=False,
                     swap=True,
                     delete=True):

        augs = []

        if insert:
            # aug = naw.ContextualWordEmbsAug(
            #     model_path=self.model_type, action="insert", device='cuda')
            # wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
            aug = naw.WordEmbsAug(
                model_type='word2vec',
                model_path=
                '/C:/Users/admin/Documents/Nitin/mycodes/kaggle_google_quest_qna/data/helpers/word2vec/GoogleNews-vectors-negative300.bin',
                action="insert")
            augs.append(aug)

        if substitute:
            # aug = naw.ContextualWordEmbsAug(
            #     model_path=self.model_type, action="substitute", device='cuda')
            # aug = naw.WordEmbsAug(
            #     model_type='word2vec', model_path='/media/jionie/my_disk/Kaggle/Google_Quest_Answer/model/word2vec/GoogleNews-vectors-negative300.bin',
            #     action="substitute")
            aug_sub = naw.SynonymAug(aug_src='wordnet')
            augs.append(aug_sub)
            # text = aug.augment(text)

        if swap:
            aug_swap = naw.RandomWordAug(action="swap")
            augs.append(aug_swap)
            # text = aug.augment(text)

        if delete:
            aug_del = naw.RandomWordAug()
            augs.append(aug_del)
            # text = aug.augment(text)

        aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
        # print("before aug:", text)
        text = aug.augment(text, n=1)
        # print("after aug:", text)

        return text
Example #23
    def __init__(self):
        antAug = naw.AntonymAug()
        synAug = naw.SynonymAug(aug_src='wordnet')
        embAug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                           action="substitute")

        self.model_dict = {
            0: antAug,
            1: synAug,
            2: embAug
        }

        self.output_data = {
            'Sentence1': [],
            'Sentence2': [],
            'Label': []
        }
Example #24
def augment_dataset(dataset, prob=0.3, augment_type='wordnet'):
    '''Augment data using wordnet'''
    data_targets = list(zip(dataset.data, dataset.target))
    sample_size = int(np.ceil(len(dataset.data) * prob))

    LOGGER.info(f'Augmenting {sample_size} datasets')
    sample_data = random.choices(data_targets, k=sample_size)

    if augment_type == 'wordnet':
        aug = naw.SynonymAug(aug_src='wordnet')
    else:
        raise ValueError(f'Unsupported augment_type: {augment_type}')

    aug_data, aug_targets = list(
        zip(*[(aug.augment(text), target) for text, target in sample_data]))

    dataset.data = [*dataset.data, *aug_data]
    dataset.target = [*dataset.target, *aug_targets]

    LOGGER.info(
        f'Total dataset size after augmentation: {len(dataset.target)}')

    return dataset
Example #25
def main():
    print("Loading ppdb dataset...")
    aug = naw.SynonymAug(aug_p=0.5,
                         aug_src="ppdb",
                         model_path="../data/ppdb-2.0-tldr",
                         aug_max=100)
    print("Augmentor initialized")
    dataset = load_dataset('cnn_dailymail', "3.0.0")

    train_data = dataset['train']
    inputs = train_data['article']
    targets = train_data['highlights']
    ids = train_data['id']
    print(len(train_data))
    #    articles = [nltk.sent_tokenize(inp) for inp in tqdm(inputs)]
    #    pickle.dump(articles, open('articles_sentences.json', 'wb'))
    articles = pickle.load(open('articles_sentences.json', 'rb'))[:80000]

    d = {}
    for i, art in enumerate(tqdm(articles)):
        augmented = aug.augment(art)
        #        d[ids[i]] = ' '.join(augmented)
        d[ids[i]] = augmented


#    num_splits = 10
#    split = len(articles)//num_splits
#    pool = mp.Pool(processes=num_splits)
#    results = []
#    for i in range(num_splits):
#        data = articles[i*split:(i+1)*split] if i < num_splits-1 else articles[i*split:]
#        ids = ids[i*split:(i+1)*split] if i < num_splits - 1 else ids[i*split:]

#        results.append(pool.apply_async(paraphrase, args=(aug, data, ids)))
#    results = [pool.apply_async(paraphrase, args=(aug, articles[i*split:(i+1)*split] if i < num_splits-1 else articles[i*split:], ids[i*split:(i+1)*split] if i < num_splits-1 else ids[i*split:])) for i in range(num_splits)]

#   outputs = [p.get() for p in results]
#   for x in outputs:
#       d = {**d, **x}
    pickle.dump(d, open('ppdb_paraphrase.pkl', 'wb'))
Example #26
    def augment_text(self, data):
        op = random.choice(self.all_transform)

        # use specified operation magnitude if available
        if isinstance(op, tuple):
            op, scale = op
        else:
            scale = random.uniform(0, self.max_strength)

        if op == "identity":
            return data
        elif op == "syn_replacement":
            op = naw.SynonymAug(aug_src="wordnet", aug_p=scale, aug_max=None)
        elif op == "random_swap":
            op = naw.RandomWordAug(action="swap", aug_p=scale, aug_max=None)
        elif op == "random_delete":
            op = naw.RandomWordAug(action="delete", aug_p=scale, aug_max=None)
        elif op == "insert_punc":
            op = InsertPunctuation()  # scale will be randomized inside function
        else:
            raise NotImplementedError
        return op.augment(data)
Example #27
def augment_n(data, N=1):
    pbar = tqdm(desc='Augmenting Data N={}'.format(N),
                total=data.shape[0],
                leave=False)

    # random synonym replacement
    # aug = naw.SynonymAug(aug_max=4, stopwords=stop_words())
    aug = naf.Sequential([
        # naw.ContextualWordEmbsAug(
        #     'bert-base-uncased',
        #     aug_max=5,
        #     stopwords=stop_words(),
        #     device='cuda',
        #     optimize=True
        # ),
        naw.ContextualWordEmbsAug('bert-base-uncased',
                                  aug_max=3,
                                  stopwords=stop_words(),
                                  device='cuda',
                                  optimize=True,
                                  action='insert'),
        naw.SynonymAug(aug_max=4, stopwords=stop_words())
    ])
    results = []
    for row in data:
        t, s = augment(row[1], row[2], aug, N)
        augs = []

        for j, aug_t in enumerate(t):
            augs.append([row[0] + str(j), aug_t, s[j], row[3]])
        if len(augs) > 0:
            results.append(np.array(augs))
        pbar.update()

    results.append(data)
    pbar.clear()
    pbar.close()
    return np.concatenate(results, axis=0)
Example #28
def get_augmenter(method: str, stopwords: List[str] = None) -> naw.SynonymAug:
    """
    Initialize an augmenter depending on the given method.

    Parameters
    ----------
    method : str (supported methods: wordnet_synonym and aug_sub_bert)
    stopwords : list
        list of words to freeze throughout the augmentation

    Returns
    -------
    Initialized nlpaug augmenter
    """
    if method == 'wordnet_synonym':
        return naw.SynonymAug(aug_src='wordnet', stopwords=stopwords)
    if method == 'aug_sub_bert':
        return naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                         action="substitute",
                                         stopwords=stopwords)
    raise UnavailableAugmenter(
        'The given augmenter is not supported. You must choose one \
        of the following: wordnet_synonym or aug_sub_bert')
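A usage sketch (UnavailableAugmenter is assumed to be an exception class defined alongside this function; the BERT variant downloads the model on first use):

aug = get_augmenter('wordnet_synonym', stopwords=['not', 'never'])
print(aug.augment('The food was not good and I will never go back'))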
Example #29
def word_substitution(text, aug_src='wordnet'):
    aug = naw.SynonymAug(aug_src=aug_src)
    augmented_text = aug.augment(text)
    return augmented_text
Example #30
def augment(dataset_path,
            factor,
            balance_aware=False,
            alpha=0.7,
            beta=1,
            verbose=False):
    """Augments a dataset in place using nlpaug.

    Args:
        dataset_path (string): Path to the training set.
        factor (int): Factor by which to augment training set size.
        balance_aware (bool): Whether to use balance-aware data augmentation (not fully tested yet).
        alpha (float): Alpha parameter in balance-aware data augmentation.
        beta (float): Beta parameter in balance-aware data augmentation.
        verbose (bool, optional): Verbose output. Defaults to False.
    """

    logging.basicConfig(level=logging.DEBUG,
                        format="[%(asctime)s:%(name)s] %(message)s")
    logger = logging.getLogger("augment")

    if not os.path.exists(dataset_path):
        if verbose:
            logger.info(
                f'Skipping training set augmentation: {dataset_path} not found.'
            )
        return

    if factor < 2:
        if verbose:
            logger.info(
                f'Skipping training set augmentation: factor is {factor} < 2.'
            )
        return

    if verbose:
        logger.info(f"Begin training set augmentation.")

    aug = naw.SynonymAug(aug_src='wordnet')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')

    golden = []
    with open(dataset_path) as fin:
        for line in fin:
            data = json.loads(line)
            golden.append(data)

    label_count = get_label_count(golden)

    if balance_aware:
        min_rareness, max_rareness, spread_factor = analyze_for_balance_awareness(
            golden, alpha, beta)
        if verbose:
            logger.info(
                f"Analyzing papers in {dataset_path} for balance-aware data augmentation."
            )
            logger.info(f"Minimum rareness score is {min_rareness}.")
            logger.info(f"Maximum rareness score is {max_rareness}.")
            logger.info(f"This allows a spread factor of {spread_factor}.")

    if verbose:
        logger.info(
            f"Augmenting dataset at {dataset_path} with {len(golden)} examples."
        )

    with open(dataset_path, 'w') as fout:
        for epoch in (tqdm(range(factor), desc='Full augmentation progress')
                      if verbose else range(factor)):
            for js in (tqdm(golden, desc='Per epoch progress', leave=False)
                       if verbose else golden):
                if epoch == 0:
                    fout.write(json.dumps(js) + '\n')
                else:
                    if balance_aware:
                        relative_rareness = floor(
                            rareness_score(js, label_count, alpha, beta) /
                            min_rareness)
                        if relative_rareness <= epoch:
                            continue
                    title = ' '.join(js['title'])
                    aug_title = aug.augment(title)
                    abstract = ' '.join(js['abstract'])
                    aug_abstract = aug.augment(abstract)

                    aug_js = copy.deepcopy(js)

                    aug_js['title'] = aug_title.split()
                    aug_js['abstract'] = aug_abstract.split()
                    fout.write(json.dumps(aug_js) + '\n')

    if verbose:
        logger.info(f"Finish training set augmentation.")