Code Example #1
    def test_reset_top_k(self):
        original_aug = naw.WordEmbsAug(
            model_type='word2vec', model_path=self.word2vec_model_path)
        original_top_k = original_aug.model.top_k

        new_aug = naw.WordEmbsAug(
            model_type='word2vec', model_path=self.word2vec_model_path,
            top_k=original_top_k+1)
        new_top_k = new_aug.model.top_k

        self.assertEqual(original_top_k+1, new_top_k)
Code Example #2
File: test_word_embs.py Project: zoeyhub/nlpaug
    def test_reset_top_k(self):
        original_aug = naw.WordEmbsAug(model_type='word2vec',
                                       model_path=os.environ.get("MODEL_DIR") +
                                       'GoogleNews-vectors-negative300.bin')
        original_top_k = original_aug.model.top_k

        new_aug = naw.WordEmbsAug(model_type='word2vec',
                                  model_path=os.environ.get("MODEL_DIR") +
                                  'GoogleNews-vectors-negative300.bin',
                                  top_k=original_top_k + 1)
        new_top_k = new_aug.model.top_k

        self.assertEqual(original_top_k + 1, new_top_k)
Code Example #3
File: test_word_embs.py Project: wshBak/nlpaug
    def test_incorrect_model_type(self):
        with self.assertRaises(ValueError) as error:
            naw.WordEmbsAug(
                model_type='test_model_type',
                model_path=os.environ.get("MODEL_DIR") + 'GoogleNews-vectors-negative300.bin')

        self.assertTrue('Model type value is unexpected.' in str(error.exception))
Code Example #4
    def test_incorrect_model_type(self):
        with self.assertRaises(ValueError) as error:
            naw.WordEmbsAug(
                model_type='test_model_type',
                model_path=self.word2vec_model_path)

        self.assertTrue('Model type value is unexpected.' in str(error.exception))
Code Example #5
def augment_dataset(csv, model_dir):
    """
    Conduct two processes of augmentation:
    1. Synonym augmentation
    2. Word embedding augmentation
    """
    original = pd.read_csv(csv)

    syn_df = original.copy()
    syn_aug = naw.SynonymAug(aug_src='wordnet')

    # synonym augmenter (simple version)
    for i, query in enumerate(syn_df.src):
        synonym = syn_aug.augment(query)
        syn_df.at[i, 'src'] = synonym

    #word embedding augmenter
    word_df = original.copy()
    embed_aug = naw.WordEmbsAug(model_type='fasttext',
                                model_path=model_dir +
                                '/wiki-news-300d-1M.vec',
                                action="insert")

    for i, query in enumerate(word_df.src):
        insertion = embed_aug.augment(query)
        word_df.at[i, 'src'] = insertion

    a1 = pd.concat([original, syn_df])
    a2 = pd.concat([a1, word_df])

    a2.to_csv(os.path.join(model_dir, 'augmented.csv'), index=False)

    return a2
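
A minimal usage sketch for augment_dataset above, assuming a CSV with a 'src' column and a local fastText vectors file; the file and directory names below are hypothetical placeholders, not taken from the original project.

# Hypothetical call: 'queries.csv' must have a 'src' column and the 'models'
# directory must contain wiki-news-300d-1M.vec for the fastText augmenter.
augmented_df = augment_dataset('queries.csv', 'models')
print(len(augmented_df))  # original rows + synonym-augmented rows + embedding-augmented rows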
Code Example #6
def embeddings(df, model, context_action):
    aug = naw.WordEmbsAug(model_type=model,
                          model_path=r"PATH TO WORD2VEC MODEL",
                          action=context_action)

    print("STARTING EMBEDDINGS: ", context_action)
    text_augmentation(aug, df)
Code Example #7
File: dataset_v2.py Project: jionie/Tweet
def augmentation(text, insert=False, substitute=False, swap=True, delete=True):
    augs = []

    if insert:
        aug = naw.WordEmbsAug(
            model_type='word2vec',
            model_path=
            '/media/jionie/my_disk/Kaggle/Tweet/model/word2vec/GoogleNews-vectors-negative300.bin',
            action="insert")
        augs.append(aug)

    if substitute:
        aug_sub = naw.SynonymAug(aug_src='wordnet')
        augs.append(aug_sub)

    if swap:
        aug_swap = naw.RandomWordAug(action="swap")
        augs.append(aug_swap)

    if delete:
        aug_del = naw.RandomWordAug()
        augs.append(aug_del)

    aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
    # print("before aug:", text)
    text = aug.augment(text, n=1)
    # print("after aug:", text)

    return text
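
A small usage sketch for the augmentation helper above; with the default flags only the swap and delete augmenters join the pipeline, while insert=True additionally requires the word2vec .bin file hard-coded in the function. The sample sentence is illustrative.

# Default flags: only the RandomWordAug swap/delete augmenters are candidates.
sample = "The quick brown fox jumps over the lazy dog"
print(augmentation(sample))
# insert=True also enables word2vec-based insertion (needs the hard-coded .bin path).
print(augmentation(sample, insert=True))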
Code Example #8
File: test_flow.py Project: xingbow/nlpaug
    def test_multi_thread(self):
        text = 'The quick brown fox jumps over the lazy dog'
        n = 3

        w2v_model_path = os.path.join(os.environ["MODEL_DIR"], 'word',
                                      'word_embs',
                                      'GoogleNews-vectors-negative300.bin')

        flows = [
            naf.Sequential([
                naf.Sequential([
                    nac.OcrAug(),
                    naw.WordEmbsAug(model_type='word2vec',
                                    model_path=w2v_model_path)
                ]),
                naf.Sequential([
                    nac.RandomCharAug(),
                ]),
                naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                          action="substitute",
                                          temperature=0.7,
                                          device='cpu')
            ]),
            naf.Sometimes([
                naf.Sequential([
                    nac.OcrAug(),
                    nac.RandomCharAug(),
                ]),
                naf.Sometimes([
                    naw.WordEmbsAug(model_type='word2vec',
                                    model_path=w2v_model_path)
                ],
                              pipeline_p=0.999),
                naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                          action="substitute",
                                          temperature=0.7,
                                          device='cpu')
            ],
                          pipeline_p=0.9999)
        ]

        for num_thread in [1, 3]:
            for flow in flows:
                augmented_data = flow.augment(text, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)
Code Example #9
File: test_word_embs.py Project: rhtrht/nlpaug
    def setUpClass(cls):
        env_config_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)

        cls.insert_augmenters = [
            naw.WordEmbsAug(model_type='word2vec',
                            model_path=os.environ.get("MODEL_DIR") +
                            'GoogleNews-vectors-negative300.bin',
                            action=Action.INSERT),
            naw.WordEmbsAug(model_type='glove',
                            model_path=os.environ.get("MODEL_DIR") +
                            'glove.6B.50d.txt',
                            action=Action.INSERT),
            naw.WordEmbsAug(model_type='fasttext',
                            model_path=os.environ.get("MODEL_DIR") +
                            'wiki-news-300d-1M.vec',
                            action=Action.INSERT),
        ]

        cls.substitute_augmenters = [
            naw.WordEmbsAug(model_type='word2vec',
                            model_path=os.environ.get("MODEL_DIR") +
                            'GoogleNews-vectors-negative300.bin',
                            action=Action.SUBSTITUTE),
            naw.WordEmbsAug(model_type='glove',
                            model_path=os.environ.get("MODEL_DIR") +
                            'glove.6B.50d.txt',
                            action=Action.SUBSTITUTE),
            naw.WordEmbsAug(model_type='fasttext',
                            model_path=os.environ.get("MODEL_DIR") +
                            'wiki-news-300d-1M.vec',
                            action=Action.SUBSTITUTE),
        ]
Code Example #10
    def test_case_insensitive(self):
        retry_cnt = 10

        text = 'Good'
        aug = naw.WordEmbsAug(
            model_type='word2vec', model_path=self.word2vec_model_path,
            top_k=2)

        for _ in range(retry_cnt):
            augmented_text = aug.augment(text)
            self.assertNotEqual(text.lower(), augmented_text.lower())

        self.assertLess(0, retry_cnt)
Code Example #11
File: test_word_embs.py Project: zoeyhub/nlpaug
    def test_case_insensitive(self):
        retry_cnt = 10

        text = 'Good'
        aug = naw.WordEmbsAug(model_type='word2vec',
                              model_path=os.environ.get("MODEL_DIR") +
                              'GoogleNews-vectors-negative300.bin',
                              top_k=2)

        for _ in range(retry_cnt):
            augmented_text = aug.augment(text)
            self.assertNotEqual(text.lower(), augmented_text.lower())

        self.assertLess(0, retry_cnt)
Code Example #12
    def test_stopwords(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']

        augs = [
            naw.RandomWordAug(action="delete", stopwords=stopwords),
            naw.ContextualWordEmbsAug(stopwords=stopwords),
            naw.WordEmbsAug(model_type='word2vec', model_path=self.word2vec_model_path, stopwords=stopwords)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)
Code Example #13
File: test_word.py Project: makcedward/nlpaug
    def setUpClass(cls):
        env_config_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)

        cls.word2vec_model_path = os.path.join(
            os.environ.get("MODEL_DIR"), 'word', 'word_embs',
            'GoogleNews-vectors-negative300.bin')
        cls.word2vec_model = naw.WordEmbsAug(
            model_type='word2vec', model_path=cls.word2vec_model_path)
        cls.context_word_embs_model = naw.ContextualWordEmbsAug()

        cls.tfidf_model_path = os.path.join(os.environ.get("MODEL_DIR"),
                                            'word', 'tfidf')

        cls._train_tfidf(cls)
Code Example #14
    def test_stopwords_regex(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords_regex = "( [a-zA-Z]{1}ox | [a-z]{1}og|(brown)|[a-zA-z]{1}he)|[a-z]{2}mps "

        augs = [
            naw.RandomWordAug(action="delete", stopwords_regex=stopwords_regex),
            naw.ContextualWordEmbsAug(stopwords_regex=stopwords_regex),
            naw.WordEmbsAug(model_type='word2vec', model_path=self.word2vec_model_path,
                            stopwords_regex=stopwords_regex)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)
Code Example #15
File: test_word.py Project: wangjksjtu/nlpaug
    def test_stopwords(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']

        augs = [
            naw.RandomWordAug(action="delete", stopwords=stopwords),
            naw.ContextualWordEmbsAug(stopwords=stopwords),
            naw.WordEmbsAug(model_type='word2vec',
                            model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin',
                            stopwords=stopwords)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)
Code Example #16
    def test_multi_thread(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        augs = [
            naw.RandomWordAug(),
            naw.WordEmbsAug(model_type='word2vec', model_path=self.word2vec_model_path),
            naw.ContextualWordEmbsAug(
                model_path='xlnet-base-cased', action="substitute", device='cpu')
        ]

        for num_thread in [1, 3]:
            for aug in augs:
                augmented_data = aug.augment(text, n=num_thread, num_thread=num_thread)
                if num_thread == 1:
                    # return string
                    self.assertTrue(isinstance(augmented_data, str))
                else:
                    self.assertEqual(len(augmented_data), num_thread)
Code Example #17
    def augmentation(self,
                     text,
                     insert=False,
                     substitute=False,
                     swap=True,
                     delete=True):

        augs = []

        if insert:
            # aug = naw.ContextualWordEmbsAug(
            #     model_path=self.model_type, action="insert", device='cuda')
            # wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
            aug = naw.WordEmbsAug(
                model_type='word2vec',
                model_path=
                '/C:/Users/admin/Documents/Nitin/mycodes/kaggle_google_quest_qna/data/helpers/word2vec/GoogleNews-vectors-negative300.bin',
                action="insert")
            augs.append(aug)

        if substitute:
            # aug = naw.ContextualWordEmbsAug(
            #     model_path=self.model_type, action="substitute", device='cuda')
            # aug = naw.WordEmbsAug(
            #     model_type='word2vec', model_path='/media/jionie/my_disk/Kaggle/Google_Quest_Answer/model/word2vec/GoogleNews-vectors-negative300.bin',
            #     action="substitute")
            aug_sub = naw.SynonymAug(aug_src='wordnet')
            augs.append(aug_sub)
            # text = aug.augment(text)

        if swap:
            aug_swap = naw.RandomWordAug(action="swap")
            augs.append(aug_swap)
            # text = aug.augment(text)

        if delete:
            aug_del = naw.RandomWordAug()
            augs.append(aug_del)
            # text = aug.augment(text)

        aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
        # print("before aug:", text)
        text = aug.augment(text, n=1)
        # print("after aug:", text)

        return text
Code Example #18
    def fn_word_emb(self):
        product_choices = list(
            product(self.database_choice, self.action_choices,
                    self.aug_p_choices))
        for item in product_choices:
            aug = naw.WordEmbsAug(model_type=item[0][1],
                                  model_path=item[0][2],
                                  action=item[1],
                                  aug_p=item[2],
                                  stopwords=self.stopwords)
            print("\nmodelname-action-words augmented: {}-{}-{}\n".format(
                item[0][0], item[1], item[2]))
            augmented_text = aug.augment(text, n=self.n_words)
            print(augmented_text, "\n")

            self.write_excel(augmented_text, item, item[1], item[2])
        self.workbook.close()
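
The item[0][...] indexing in fn_word_emb above implies that self.database_choice holds (display name, model_type, model_path) tuples, while self.action_choices and self.aug_p_choices are flat lists. A hypothetical __init__ fragment consistent with that indexing (names and paths are assumptions, not from the original project):

    def __init__(self):
        # item[0] -> (display_name, model_type, model_path),
        # item[1] -> action, item[2] -> aug_p in fn_word_emb above.
        self.database_choice = [
            ('word2vec', 'word2vec', 'model/GoogleNews-vectors-negative300.bin'),
            ('glove', 'glove', 'model/glove.6B.300d.txt'),
        ]
        self.action_choices = ['insert', 'substitute']
        self.aug_p_choices = [0.1, 0.3]
        self.stopwords = ['the', 'a', 'an']
        self.n_words = 3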
Code Example #19
File: test_word.py Project: PankajMehar/nlpaug
    def test_multi_thread(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        n = 3
        augs = [
            naw.RandomWordAug(),
            naw.WordEmbsAug(model_type='word2vec',
                            model_path=os.environ["MODEL_DIR"] +
                            'GoogleNews-vectors-negative300.bin'),
            naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                      action="substitute",
                                      skip_unknown_word=True,
                                      temperature=0.7,
                                      device='cpu')
        ]

        for num_thread in [1, 3]:
            for aug in augs:
                augmented_data = aug.augment(text, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)
Code Example #20
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

import swifter

df = pd.read_csv("../data/train.csv").iloc[:, 1:]

aug = naw.WordEmbsAug(model_type="word2vec",
                      model_path="../data/aug/GoogleNews-vectors-negative300",
                      action="insert")

df_copy = df.copy()
# df_copy["description"] = df_copy["description"].apply(lambda x: aug.augment(x))
df_copy["description"] = df_copy["description"].swifter.apply(aug.augment)

print(0)

df_copy.to_csv("../data/train_augmented.csv", index=False)
Code Example #21
File: test_word_embs.py Project: rhtrht/nlpaug
    def test_incorrect_model_type(self):
        with self.assertRaises(ValueError):
            naw.WordEmbsAug(model_type='test_model_type',
                            model_path=os.environ.get("MODEL_DIR") +
                            'GoogleNews-vectors-negative300.bin',
                            action=Action.INSERT)
Code Example #22
    def tokenize(self):

        # This method rewrites the TRIZ summary according to the user's choice (only one part of the contradiction in the summary, or both parts).
        # It also rewrites every part involved to add spaces between sentences and avoid problems during tokenization.
        # Finally, it tokenizes text + summary using Stanford CoreNLP and saves the results.

        if self.args.mode == 'test':
            print("\n\n\nTokenization in progress...")
            if self.args.input_files is not None:
                files = self.args.input_files
            else:
                addon = '/*' * (self.args.depth_directory + 1)
                files = sorted(
                    glob.glob(self.data_path + addon + self.args.input))

            count = 0
            pool = Pool(self.args.n_cpus)
            for _ in pool.imap_unordered(clean_text, files):
                # Print % processed files
                #########################################################
                sys.stdout.write('\r')
                # the exact output you're looking for:
                j = (count + 1) / len(files)
                sys.stdout.write("[%-20s] %d%%" % ('=' * int(20 * j), 100 * j))
                sys.stdout.flush()
                count += 1
                #########################################################

            pool.close()
            pool.join()

            extracted_patents_dir = os.path.abspath(self.data_path)
            tokenized_patents_dir = os.path.abspath(self.temp_path + '/test/' +
                                                    self.args.input)

            print("Preparing to tokenize %s to %s..." %
                  (extracted_patents_dir, tokenized_patents_dir))

            # make IO list file
            print("Making list of files to tokenize...")
            with open("mapping_for_corenlp.txt", "w") as f:
                for s in files:
                    f.write("%s\n" % (s))
            command = [
                'java', 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                '-annotators', 'tokenize,ssplit',
                '-ssplit.newlineIsSentenceBreak', 'always', '-filelist',
                'mapping_for_corenlp.txt', '-outputFormat', 'json',
                '-outputDirectory', tokenized_patents_dir
            ]
            print("Tokenizing %i files in %s and saving in %s..." %
                  (len(files), extracted_patents_dir, tokenized_patents_dir))
            subprocess.call(command)

            os.remove("mapping_for_corenlp.txt")

            print("Stanford CoreNLP Tokenizer has finished.")
            print("Successfully finished tokenizing %s to %s.\n" %
                  (extracted_patents_dir, tokenized_patents_dir))

        else:
            pool = Pool(self.args.n_cpus)
            for path in self.data_path:
                corpus_type = path.split('/')[-1]

                patents_directories = sorted(glob.glob(path + '/*'))
                for patent_directory in patents_directories:
                    found_summary = False
                    patent_files = sorted(glob.glob(patent_directory + '/*'))
                    for patent_file in patent_files:
                        if patent_file.find('.SUM') >= 0 and patent_file.find(
                                'SUMMARY') < 0:
                            found_summary = True
                    if not found_summary:
                        with open(
                                patent_directory + '/' +
                                os.path.basename(patent_directory) + '.SUM',
                                'w') as f:
                            f.write('')

                # Data augmentation with double translation
                # Add spaces between sentences in all used texts including summaries to avoid problems during tokenization
                #########################################################################################################################

                to_augment = []
                count_augment = 0

                for tipe in ["SUM"] + self.parts_of_interest:
                    files = sorted(glob.glob(path + '/*/*.' + tipe))
                    count = 0

                    if tipe != "SUM":
                        print()
                        logging.info("Cleaning data...")
                        count = 0

                        for _ in pool.imap_unordered(clean_text, files):
                            # Print % processed files
                            #########################################################
                            sys.stdout.write('\r')
                            # the exact output you're looking for:
                            j = (count + 1) / len(files)
                            sys.stdout.write("[%-20s] %d%%" %
                                             ('=' * int(20 * j), 100 * j))
                            sys.stdout.flush()
                            count += 1
                            #########################################################

                    if self.args.data_augmentation != "None" and corpus_type == 'train':
                        print("\n" + corpus_type +
                              " set: Data augmentation in progress for " +
                              tipe + " parts...")
                    else:
                        print("\n" + corpus_type +
                              " set: Sentences verification for SUM parts...")

                    for num_file, file in enumerate(files):

                        if tipe == "SUM":
                            with open(file, "r", encoding='utf-8') as f:
                                # print(file)
                                data = ''
                                for sentence in f:
                                    if sentence.find("STATE_OF_THE_ART") >= 0:
                                        continue
                                    else:
                                        data += sentence

                                param_sents = data.split("///")
                                if param_sents[0] != 'empty':
                                    to_augment.append(num_file)
                                first_param_sents = param_sents[0].replace(
                                    "\n", "").split("//")
                                if len(param_sents) > 1:
                                    second_param_sents = param_sents[
                                        1].replace("\n", "").split("//")
                                else:
                                    second_param_sents = []

                        # Print % processed files
                        #########################################################
                        sys.stdout.write('\r')
                        # the exact output you're looking for:
                        j = (count + 1) / len(files)
                        sys.stdout.write("[%-20s] %d%%" %
                                         ('=' * int(20 * j), 100 * j))
                        sys.stdout.flush()
                        count += 1
                        #########################################################

                        # Data augmentation with double translation
                        if self.args.data_augmentation != "None" and corpus_type == 'train' and num_file in to_augment:

                            count_augment += 1
                            if self.args.data_augmentation == "transformation" and self.args.transformation_type == "bert_embeddings":
                                aug = naw.ContextualWordEmbsAug(
                                    model_path='bert-base-uncased',
                                    action="substitute")
                            elif self.args.data_augmentation == "transformation" and self.args.transformation_type == "word2vec_embeddings":
                                aug = naw.WordEmbsAug(
                                    model_type='word2vec',
                                    model_path=
                                    './word2vec/GoogleNews-vectors-negative300.bin'
                                )
                            elif self.args.data_augmentation == "transformation" and self.args.transformation_type == "synonyms":
                                aug = naw.SynonymAug()

                            path_augmented_text = file.split('/')
                            path_augmented_text[-2] += 'b'
                            path_augmented_text[-1] = '.'.join([
                                path_augmented_text[-2],
                                path_augmented_text[-1].split('.')[-1]
                            ])
                            path_new_directory = '/'.join(
                                path_augmented_text[:-1])
                            path_augmented_text = '/'.join(path_augmented_text)

                            if os.path.isfile(
                                    path_augmented_text) or file.find('b') > 0:
                                continue

                            if self.args.data_augmentation == "translation":
                                # Not to exceed google translations quotas
                                time.sleep(1.25)

                            augmented_text = ''
                            if tipe != "SUM":
                                for sentence in data.split('.'):
                                    if self.args.data_augmentation == "translation":
                                        augmented_text += translate_client.translate(
                                            translate_client.translate(
                                                sentence + '.',
                                                target_language=self.args.
                                                translation_language)
                                            ['translatedText'].replace(
                                                "&#39;", "'").replace(
                                                    ".", " ") + '.',
                                            target_language='en'
                                        )['translatedText'].replace(
                                            "&#39;", "'").replace(".",
                                                                  " ") + '.'
                                    elif self.args.data_augmentation == "transformation":
                                        augmented_text += aug.augment(
                                            sentence + '.').replace(".",
                                                                    " ") + '.'
                                        # print("ok2")
                            elif first_param_sents[0] != 'empty':
                                for sentence in first_param_sents:
                                    if self.args.data_augmentation == "translation":
                                        augmented_text += translate_client.translate(
                                            translate_client.translate(
                                                sentence + '.',
                                                target_language=self.args.
                                                translation_language)
                                            ['translatedText'].replace(
                                                "&#39;", "'").replace(
                                                    ".", " ") + '.',
                                            target_language='en'
                                        )['translatedText'].replace(
                                            "&#39;", "'").replace(".",
                                                                  " ") + '. //'
                                    elif self.args.data_augmentation == "transformation":
                                        augmented_text += aug.augment(
                                            sentence + '.').replace(
                                                ".", " ") + '. //'
                                        # print("ok3")
                                augmented_text += '/'
                                for sentence in second_param_sents:
                                    if self.args.data_augmentation == "translation":
                                        augmented_text += translate_client.translate(
                                            translate_client.translate(
                                                sentence + '.',
                                                target_language=self.args.
                                                translation_language)
                                            ['translatedText'].replace(
                                                "&#39;", "'").replace(
                                                    ".", " ") + '.',
                                            target_language='en'
                                        )['translatedText'].replace(
                                            "&#39;", "'").replace(".",
                                                                  " ") + '. //'
                                    elif self.args.data_augmentation == "transformation":
                                        augmented_text += aug.augment(
                                            sentence + '.').replace(
                                                ".", " ") + '. //'
                                        # print("ok4")
                                augmented_text = augmented_text[:-3]

                            augmented_text = augmented_text.replace(".", ". ")
                            augmented_text = ' '.join(augmented_text.split())

                            # Write translation
                            try:
                                os.mkdir(path_new_directory)
                            except:
                                pass

                            with open(path_augmented_text, 'w') as f:
                                f.write(augmented_text[:-1])

                #########################################################################################################################
                # Rewriting of summaries with chosen sentences/parameters (one side of the contradiction or both)
                #########################################################################################################################""
                data_analyzer = summary_preparation(path + '/')

                (path_state_of_the_art,
                 summary) = data_analyzer.get_data('both')
                for num in range(0, len(summary[0])):
                    summary_patent_first = ''
                    summary_patent_second = ''
                    for x in range(0, len(summary[0][num])):
                        summary_patent_first += (summary[0][num][x] + ' ')
                    with open(path_state_of_the_art[num][0:-16] + 'SUMTRIZ',
                              "w") as file:
                        file.write(summary_patent_first)

                    for x in range(0, len(summary[1][num])):
                        summary_patent_second += (summary[1][num][x] + ' ')
                    with open(path_state_of_the_art[num][0:-16] + 'SUMTRIZ2',
                              "w") as file:
                        file.write(summary_patent_second)

                # except:
                #     print("No summaries provided for "+corpus_type+' files.')
                #     time.sleep(1)
                #########################################################################################################################""

                # Tokenization using Stanford CoreNLP
                #########################################################################################################################

                add_on = ["SUMTRIZ", "SUMTRIZ2"]

                for tipe in self.parts_of_interest + add_on:
                    print("\n\n\nTokenization in progress...")
                    files = sorted(glob.glob(path + '/*/*.' + tipe))
                    print(
                        str(len(files)) + " " + tipe + " found for " +
                        corpus_type + " set.")

                    extracted_patents_dir = os.path.abspath(path)
                    tokenized_patents_dir = os.path.abspath(self.temp_path +
                                                            '/' + corpus_type +
                                                            '/' + tipe)

                    print("Preparing to tokenize %s to %s..." %
                          (extracted_patents_dir, tokenized_patents_dir))
                    stories = os.listdir(extracted_patents_dir)
                    # make IO list file
                    print("Making list of files to tokenize...")
                    with open("mapping_for_corenlp.txt", "w") as f:
                        for s in files:
                            f.write("%s\n" % (s))
                    command = [
                        'java', 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                        '-annotators', 'tokenize,ssplit',
                        '-ssplit.newlineIsSentenceBreak', 'always',
                        '-filelist', 'mapping_for_corenlp.txt',
                        '-outputFormat', 'json', '-outputDirectory',
                        tokenized_patents_dir
                    ]
                    print("Tokenizing %i files in %s and saving in %s..." %
                          (len(stories), extracted_patents_dir,
                           tokenized_patents_dir))
                    subprocess.call(command)

                    os.remove("mapping_for_corenlp.txt")

                #########################################################################################################################

            print("Stanford CoreNLP Tokenizer has finished.")
            print("Successfully finished tokenizing %s to %s.\n" %
                  (extracted_patents_dir, tokenized_patents_dir))

            pool.close()
            pool.join()
Code Example #23
def word2vec_aug(text):
  aug = naw.WordEmbsAug(
    model_type='word2vec', model_path='/spell/leftout/GoogleNews-vectors-negative300.bin',
    action="substitute")
  augmented_text = aug.augment(text)
  return augmented_text
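
A one-line usage sketch for word2vec_aug above; the sentence is illustrative, and the hard-coded /spell/leftout/ model path must exist for the call to succeed.

# Substitute a few words with their word2vec nearest neighbours.
print(word2vec_aug("The quick brown fox jumps over the lazy dog"))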
Code Example #24
    if answers.get('choice') == 'Plot embedding space - TensorboardX':
        writer = SummaryWriter('tensorboard/embeddings')
        writer.add_embedding(pretrained_embeddings, metadata=dataset.TEXT.vocab.itos, tag='Embedding')
        writer.close()
        print('Remember to run Tensorboard thru: tensorboard --logdir=tensorboard')

    if answers.get('choice') == 'More info about dataset':
        dataset.dataset_info()
        dataset.print_dataset_details()

    if answers.get('choice') == 'Evaluate and plot PR curves - TensorboardX':
        model.load_state_dict(torch.load('ezmath-model_83.pt'))
        test_loss, test_acc = train.evaluate_with_pr_plotting(model, test_iterator, criterion, dataset.LABEL.vocab.itos)
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.3f}%')
        print('Remember to run Tensorboard thru: tensorboard --logdir=tensorboard')

    if answers.get('choice') == 'Test Textual Augmenter by word2vec similarity':
        print('Loading Word2Vec model...')
        aug = naw.WordEmbsAug(
            model_type='word2vec', model_path='vector_cache/word2vec_CoNLL17/model.bin',
            action="substitute")
        text = input("Please insert the exercise text to augment: ")
        augmented_text = aug.augment(text)
        print("Original:")
        print(text)
        print("Augmented Text:")
        print(augmented_text)

    answers = inquirer.prompt(questions)
Code Example #25
File: test_word_embs.py Project: wshBak/nlpaug
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)

        model_dir = os.environ.get("MODEL_DIR")

        full_test_case = False

        cls.augmenters = [
            naw.WordEmbsAug(model_type='word2vec', model_path=model_dir+'GoogleNews-vectors-negative300.bin'),
            naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.6B.50d.txt'),
            naw.WordEmbsAug(model_type='fasttext', model_path=model_dir + 'wiki-news-300d-1M.vec')
        ]

        if full_test_case:
            cls.augmenters.extend([
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.42B.300d.txt'),
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.840B.300d.txt'),
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.twitter.27B.25d.txt'),
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.twitter.27B.50d.txt'),
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.twitter.27B.100d.txt'),
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.twitter.27B.200d.txt'),
                naw.WordEmbsAug(model_type='fasttext', model_path=model_dir+'wiki-news-300d-1M-subword.vec'),
                naw.WordEmbsAug(model_type='fasttext', model_path=model_dir+'crawl-300d-2M.vec'),
                naw.WordEmbsAug(model_type='fasttext', model_path=model_dir+'crawl-300d-2M-subword.vec'),
            ])
Code Example #26
from tqdm import tqdm

SAMPLE_NUM = 1000
model_dir = 'model/'
des_dir = 'sentence/'
data_need_dir = des_dir + 'need_sentence.csv'
data_novel_dir = des_dir + 'novel_sentence.xlsx'
data_need_aug_dir = des_dir + 'need_aug_sentence.csv'
data_need_all_dir = des_dir + 'need_all_sentence.csv'

data_need = pd.read_csv(data_need_dir, index_col=0)

augs = [
    # Substitute word by word2vec similarity
    naw.WordEmbsAug(model_type='word2vec',
                    model_path=model_dir +
                    'GoogleNews-vectors-negative300.bin',
                    action="substitute"),
    # Substitute word by contextual word embeddings (BERT)
    naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                              action="substitute"),
    # Substitute word by WordNet's synonym
    naw.SynonymAug(aug_src='wordnet'),
    # Substitute word by PPDB's synonym
    naw.SynonymAug(aug_src='ppdb', model_path=model_dir + 'ppdb-2.0-s-all')
]

trans = BackTranslation(
    url=[
        'translate.google.com',
        #'translate.google.co.kr',
        'translate.google.cn',