Example #1
def fetch_test(args):
    exclusive_fetch_test = {
        'aihub_conversation_translation', 'aihub_decree_translation', 'aihub_government_website_translation',
        'aihub_korean_culture_translation', 'aihub_news_translation', 'aihub_spoken_translation', 'aihub_translation',
        'modu_messenger', 'modu_mp', 'modu_ne', 'modu_news', 'modu_spoken', 'modu_web', 'modu_written'
    }
    for corpus_name in corpus_list:
        if corpus_name in exclusive_fetch_test:
            continue
        Korpora.fetch(corpus_name, root_dir=args.root_dir)
        time.sleep(0.5)
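fetch_test depends on two names defined elsewhere in the test script: the module-level corpus_list and an argparse namespace args with a root_dir attribute (Example #25 below shows the original setup). A minimal sketch of that assumed context:

import argparse
import time

from Korpora import Korpora

corpus_list = Korpora.corpus_list()  # {corpus_name: description} for every registered corpus
args = argparse.Namespace(root_dir='~/Korpora')  # hypothetical stand-in for the parsed CLI arguments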
Example #2
def load_modu_test(args):
    for corpus_name in ['modu_messenger', 'modu_mp', 'modu_ne', 'modu_news', 'modu_spoken', 'modu_web', 'modu_written']:
        with suppress_stdout():
            corpus = Korpora.load(corpus_name, root_dir=args.root_dir)
        bar = '=' * 80
        print(corpus, end=f'\n\n{bar}\n\n', flush=True)
        time.sleep(0.5)
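The Modu corpora get their own test because NIKL distributes them only after a manual application, so Korpora.fetch cannot download them (which is also why Example #1 skips them). Loading assumes the files are already placed under root_dir, e.g.:

corpus = Korpora.load('modu_news', root_dir=args.root_dir)  # reads the locally placed files; no download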
Example #3
def test_usage():
    ner = Korpora.load('naver_changwon_ner')
    assert len(ner.train) == len(NaverChangwonNERKorpus().train)
    ner.train[0]
    assert len(ner.train) == 90000
    for example in ner.train:
        continue
Example #4
File: cli.py Project: lovit/KoBERTScore
def best_layer(args):
    print(
        f'Finding best performance BERT layer with {args.model_name_or_path}')

    if not -1 < args.rescale_base < 1:
        raise ValueError("`rescale_base` must be in (-1, 1)")
    if args.draw_plot and args.output_dir is None:
        raise ValueError('Set `output_dir` when using `draw_plot`')

    device = args.device
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load pretrained BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    encoder = BertModel.from_pretrained(args.model_name_or_path).to(device)

    # Load STS corpus
    corpus = Korpora.load('korsts')
    references = []
    candidates = []
    qualities = []
    for data in (corpus.train, corpus.dev):
        for example in data:
            references.append(example.text)
            candidates.append(example.pair)
            qualities.append(example.label)

    # Train IDF
    idf = train_idf(tokenizer, references)
    idf_embed = idf_numpy_to_embed(idf)

    # Find best layer
    model_name = args.model_name_or_path.split(os.path.sep)[-1]
    best_layer, informations = find_best_layer(tokenizer,
                                               encoder,
                                               references,
                                               candidates,
                                               qualities,
                                               idf=idf_embed,
                                               rescale_base=args.rescale_base,
                                               model_name=model_name,
                                               batch_size=args.batch_size)

    print(f'  - Best performance layer : {best_layer}')

    # Save figures
    if args.draw_plot:
        output_dir = args.output_dir
        if output_dir is None:
            output_dir = os.getcwd()
        dirname = os.path.abspath(output_dir)
        print(f'Saving figures at {dirname}')
        os.makedirs(dirname, exist_ok=True)
        warnings.filterwarnings("ignore")
        save(informations['figures']['R'], f'{dirname}/R.html')
        save(informations['figures']['P'], f'{dirname}/P.html')
        save(informations['figures']['F'], f'{dirname}/F.html')
        save(informations['figures']['RPF'], f'{dirname}/RPF.html')
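best_layer above treats every KorSTS example as a (text, pair, label) triple. For reference, a minimal standalone sketch of that access pattern (assuming the corpus downloads to the default root):

from Korpora import Korpora

korsts = Korpora.load('korsts')
example = korsts.train[0]
print(example.text, example.pair, example.label)  # sentence 1, sentence 2, similarity score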
Example #5
def test_usage():
    ner = Korpora.load('naver_changwon_ner')
    assert len(ner.train) == len(NaverChangwonNERKorpus().train)
    ner.train[0]
    assert len(ner.train) == 90000
    print(f'str(korpus)\n{str(ner)}')
    print(f'str(korpus.train)\n{str(ner.train)}')
    for example in ner.train:
        continue
Example #6
def test_usage():
    kornli = Korpora.load('kornli')
    assert len(kornli.snli_train) == len(KorNLIKorpus().snli_train)
    kornli.snli_train[0]
    assert len(kornli.snli_train) == 550152
    assert len(kornli.xnli_dev) == 2490
    assert len(kornli.xnli_test) == 5010
    assert len(kornli.multinli_train) == 392702
    for example in kornli.snli_train:
        continue
Example #7
def test_usage():
    korean_hate_speech = Korpora.load('korean_hate_speech')
    korean_hate_speech_ = KoreanHateSpeechKorpus()
    assert len(korean_hate_speech.unlabeled) == len(korean_hate_speech_.unlabeled) == 2033893
    assert len(korean_hate_speech.test) == 974
    assert len(korean_hate_speech.dev) == 471
    assert len(korean_hate_speech.train) == 7896
    for data in [korean_hate_speech.train, korean_hate_speech.dev, korean_hate_speech.test, korean_hate_speech.unlabeled]:
        for _ in data:
            continue
Example #8
def test_usage():
    koen_news = Korpora.load('korean_parallel_koen_news')
    koen_news_ = KoreanParallelKOENNewsKorpus()
    assert len(koen_news.train) == len(koen_news_.train) == 94123
    assert len(koen_news.dev) == 1000
    assert len(koen_news.test) == 2000
    print(f'str(korpus)\n{str(koen_news)}')
    print(f'str(korpus.train)\n{str(koen_news.train)}')
    for data in [koen_news.train, koen_news.dev, koen_news.test]:
        for _ in data:
            continue
Example #9
def test_usage():
    petitions = Korpora.load('korean_petitions')
    assert len(petitions.train) == len(KoreanPetitionsKorpus().train)
    assert len(petitions.train) == 433631
    assert len(petitions.train[0].text) == 1491
    assert petitions.train[0].begin == '2017-08-25'
    assert petitions.train[0].end == '2017-09-24'
    assert petitions.train.titles[0] == petitions.train[0].title
    assert '청와대 국민청원' in petitions.description
    assert 'CC0 1.0 Universal' in petitions.license
    for petition in petitions.train:
        continue
Example #10
def test_usage():
    nsmc = Korpora.load('nsmc')
    assert len(nsmc.train.texts) == 150000
    assert len(nsmc.train.labels) == 150000
    assert len(nsmc.test.texts) == 50000
    assert len(nsmc.test.labels) == 50000
    assert 'Naver sentiment movie corpus v1.0' in nsmc.description
    assert 'CC0 1.0 Universal' in nsmc.license
    assert len(NSMCKorpus().train) == len(nsmc.train)
    nsmc.train[0]
    for row in nsmc.train:
        continue
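Besides the column-wise texts/labels views asserted above, NSMC rows can be indexed individually; by analogy with the other labeled corpora on this page, each row presumably exposes text and label fields (an assumption, since this test only checks lengths and iterates):

nsmc = Korpora.load('nsmc')
row = nsmc.train[0]
print(row.text, row.label)  # assumed field names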
Example #11
def load_small_test(args):
    exclusive_load_test = {
        'kcbert', 'kowikitext', 'namuwikitext', 'modu_messenger', 'modu_mp',
        'modu_ne', 'modu_news', 'modu_spoken', 'modu_web', 'modu_written'
    }
    for corpus_name in corpus_list:
        if corpus_name in exclusive_load_test:
            continue
        with suppress_stdout():
            corpus = Korpora.load(corpus_name, root_dir=args.root_dir)
        bar = '=' * 80
        print(corpus, end=f'\n\n{bar}\n\n', flush=True)
        time.sleep(0.5)
Example #12
def test_usage():
    pair = Korpora.load('question_pair')
    assert len(pair.train) == len(QuestionPairKorpus().train)
    pair.train[0]
    pair.test[0]
    assert len(pair.train) == 6888
    assert len(pair.test) == 688
    assert len(pair.get_all_pairs()) == 7576
    assert len(pair.get_all_labels()) == 7576
    for example in pair.train:
        continue
    for example in pair.test:
        continue
Example #13
def test_usage():
    kornli = Korpora.load('kornli')
    assert kornli.exists()
    assert len(kornli.snli_train) == len(KorNLIKorpus().snli_train)
    kornli.snli_train[0]
    assert len(kornli.snli_train) == 550152
    assert len(kornli.xnli_dev) == 2490
    assert len(kornli.xnli_test) == 5010
    assert len(kornli.multinli_train) == 392702
    print(f'str(korpus)\n{str(kornli)}')
    print(f'str(korpus.multinli_train)\n{str(kornli.multinli_train)}')
    for example in kornli.snli_train:
        continue
Example #14
def test_usage():
    korean_hate_speech = Korpora.load('korean_hate_speech')
    korean_hate_speech_ = KoreanHateSpeechKorpus()
    assert korean_hate_speech.exists()
    assert korean_hate_speech_.exists()
    assert len(korean_hate_speech.unlabeled) == len(korean_hate_speech_.unlabeled) == 2033893
    assert len(korean_hate_speech.test) == 974
    assert len(korean_hate_speech.dev) == 471
    assert len(korean_hate_speech.train) == 7896
    print(f'str(korpus)\n{str(korean_hate_speech)}')
    print(f'str(korpus.train)\n{str(korean_hate_speech.train)}')
    for data in [korean_hate_speech.train, korean_hate_speech.dev, korean_hate_speech.test, korean_hate_speech.unlabeled]:
        for _ in data:
            continue
Example #15
def test_usage():
    chatbot_data = Korpora.load('korean_chatbot_data')
    assert len(chatbot_data.train) == len(KoreanChatbotKorpus().train)
    assert len(chatbot_data.train.texts) == 11823
    assert len(chatbot_data.train.pairs) == 11823
    assert len(chatbot_data.train.labels) == 11823
    assert len(chatbot_data.get_all_texts()) == 11823
    assert len(chatbot_data.get_all_pairs()) == 11823
    assert len(chatbot_data.get_all_labels()) == 11823
    assert 'Chatbot_data_for_Korean v1.0' in chatbot_data.description
    assert 'CC0 1.0 Universal' in chatbot_data.license
    chatbot_data.train[0]
    for example in chatbot_data.train:
        continue
Example #16
def test_usage():
    pair = Korpora.load('question_pair')
    assert pair.exists()
    assert len(pair.train) == len(QuestionPairKorpus().train)
    pair.train[0]
    pair.test[0]
    assert len(pair.train) == 6888
    assert len(pair.test) == 688
    assert len(pair.get_all_pairs()) == 7576
    assert len(pair.get_all_labels()) == 7576
    print(f'str(korpus)\n{str(pair)}')
    print(f'str(korpus.train)\n{str(pair.train)}')
    for example in pair.train:
        continue
    for example in pair.test:
        continue
Example #17
def test_usage():
    korsts = Korpora.load('korsts')
    assert len(korsts.train) == len(KorSTSKorpus().train)
    korsts.train[0]
    korsts.dev[0]
    korsts.test[0]
    assert len(korsts.train) == len(korsts.train.get_all_pairs()) == \
           len(korsts.train.get_all_labels()) == len(korsts.train.get_all_genres()) == \
           len(korsts.train.get_all_filenames()) == len(korsts.train.get_all_years()) == 5749
    assert len(korsts.dev) == len(korsts.dev.get_all_pairs()) == \
           len(korsts.dev.get_all_labels()) == len(korsts.dev.get_all_genres()) == \
           len(korsts.dev.get_all_filenames()) == len(korsts.dev.get_all_years()) == 1500
    assert len(korsts.test) == len(korsts.test.get_all_pairs()) == \
           len(korsts.test.get_all_labels()) == len(korsts.test.get_all_genres()) == \
           len(korsts.test.get_all_filenames()) == len(korsts.test.get_all_years()) == 1379
    for example in korsts.train:
        continue
    for example in korsts.dev:
        continue
    for example in korsts.test:
        continue

Example #18
# %% set random seed
from ratsnlp import nlpbook
nlpbook.set_seed(args)

# %% set logger

nlpbook.set_logger(args)

# %% download corpus
from Korpora import Korpora

Korpora.fetch(
    args.downstream_corpus_name,
    root_dir=args.downstream_corpus_root_dir,
    force_download=args.force_download,
    )

# %% prepare tokenizer
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained(
    args.pretrained_model_name,
    eos_token="</s>",
)

# %% create train dataset
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from ratsnlp.nlpbook.generation import GenerationDataset, NsmcCorpus
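The generation example breaks off right after these imports. Judging from the classification flow in Example #26 below, it presumably continues by wrapping the corpus in a GenerationDataset and a DataLoader; a sketch under that assumption (the keyword arguments mirror the ClassificationDataset call below and are not a verified API):

corpus = NsmcCorpus()
train_dataset = GenerationDataset(
    args=args,
    corpus=corpus,
    tokenizer=tokenizer,
)
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset, replacement=False),
    batch_size=args.batch_size,
)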
Example #19
#!/usr/bin/env python3

from Korpora import Korpora, QuestionPairKorpus
from konlpy.tag import Mecab

question_pair = Korpora.load('question_pair')
mecab = Mecab()

train_fn = 'qqp.train'
test_fn = 'qqp.test'


def get_morph_tag(text):
    morphs, tags = [], []
    for morph, tag in mecab.pos(text):
        morphs.append(morph)
        tags.append(tag)
    return morphs, tags


def analyze_and_save(fn, examples):
    # examples: a Korpora data split (an iterable of question-pair examples)
    with open(fn, 'w') as fp:
        for qpair in examples:
            text_morphs, text_tags = get_morph_tag(qpair.text)
            pair_morphs, pair_tags = get_morph_tag(qpair.pair)
            text_morphs, text_tags = ' '.join(text_morphs), ' '.join(text_tags)
            pair_morphs, pair_tags = ' '.join(pair_morphs), ' '.join(pair_tags)
            fp.write('{}\n'.format('\t'.join([
                qpair.text, qpair.pair, qpair.label, text_morphs, text_tags,
                pair_morphs, pair_tags
            ])))
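The snippet ends before the functions are invoked; presumably the script continues by writing both splits to the filenames defined above:

analyze_and_save(train_fn, question_pair.train)
analyze_and_save(test_fn, question_pair.test)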
Example #20
def load_large_test(args):
    for corpus_name in ['kcbert', 'kowikitext', 'namuwikitext']:
        corpus = Korpora.load(corpus_name, root_dir=args.root_dir)
        bar = '=' * 80
        print(corpus, end=f'\n\n{bar}\n\n', flush=True)
        time.sleep(0.5)
Example #21
def exists_test(args):
    for corpus_name in corpus_list:
        result = Korpora.exists(corpus_name, root_dir=args.root_dir)
        print(corpus_name, result)
        time.sleep(0.1)
Example #22
    def __init__(self) -> None:
        self.nsmc = Korpora.load("nsmc", force_download=True)
        os.makedirs("nlpbook/bbpe", exist_ok=True)
        os.makedirs("nlpbook/wordpiece", exist_ok=True)
Example #23
from Korpora import Korpora
import os

rawDataList = Korpora.corpus_list()
for key in rawDataList:
    print(key, ": ", rawDataList[key])

target = input("Choose a corpus to save from the list above: ")
Korpora.fetch(target)
print(
    os.system("mv ~/Korpora/" + target +
              "/*.txt ~/PycharmProjects/ginger/localdata/raw/ko/raw" + target +
              ".txt"))
print(os.system("rm -rf ~/Korpora/*"))
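Shelling out to mv and rm (and printing their integer exit codes) works on Unix, but the same file moves can be written portably with the standard library. A rough equivalent, keeping the script's own hard-coded paths and its assumption of a single .txt file per corpus:

import glob
import shutil

src_dir = os.path.expanduser('~/Korpora/' + target)
dst = os.path.expanduser('~/PycharmProjects/ginger/localdata/raw/ko/raw' + target + '.txt')
for path in glob.glob(os.path.join(src_dir, '*.txt')):
    shutil.move(path, dst)  # with several source files, the last one wins
shutil.rmtree(os.path.expanduser('~/Korpora'), ignore_errors=True)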
Example #24
    def get_corpus_specifications(self):
        from Korpora import Korpora
        for name, desc in Korpora.corpus_list().items():
            print("{:<40}  {:<}".format(name, desc))
Example #25
import argparse
import time
import sys, os
from contextlib import contextmanager
from Korpora import (Korpora, ModuNewsKorpus, ModuMessengerKorpus,
                     ModuMorphemeKorpus, ModuNEKorpus, ModuSpokenKorpus,
                     ModuWebKorpus, ModuWrittenKorpus)

corpus_list = Korpora.corpus_list()


@contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout


def fetch_test(args):
    exclusive_fetch_test = {
        'aihub_conversation_translation', 'aihub_decree_translation',
        'aihub_government_website_translation',
        'aihub_korean_culture_translation', 'aihub_news_translation',
        'aihub_spoken_translation', 'aihub_translation', 'modu_messenger',
        'modu_mp', 'modu_ne', 'modu_news', 'modu_spoken', 'modu_web',
        'modu_written'
    }
    for corpus_name in corpus_list:
        if corpus_name in exclusive_fetch_test:
            continue
        Korpora.fetch(corpus_name, root_dir=args.root_dir)
        time.sleep(0.5)
Example #26
    epochs=3,
    #tpu_cores=0 if torch.cuda.is_available() else 8,
    tpu_cores=0,
    seed=7,
)

# %% set random seed
nlpbook.set_seed(args)

# %% set logger
nlpbook.set_logger(args)

# %% download corpus
Korpora.fetch(
    corpus_name=args.downstream_corpus_name,
    root_dir=args.downstream_corpus_root_dir,
    force_download=True,
)

# %% Tokenizer
tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_name,
    do_lower_case=False,
)

# %% create train dataset
corpus = NsmcCorpus()
train_dataset = ClassificationDataset(
    args=args,
    corpus=corpus,
    tokenizer=tokenizer,
Example #27
    def build_corpus(self, corpus_name):
        from Korpora import Korpora
        return Korpora.load(corpus_name)
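A hypothetical call site for the method above (the owning class is not shown in the snippet):

loader = SomeCorpusLoader()           # hypothetical owner of build_corpus
nsmc = loader.build_corpus('nsmc')    # any name from Korpora.corpus_list() works
print(len(nsmc.train))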