def fetch_test(args):
    """Fetch every corpus in ``corpus_list`` except the excluded ones.

    Args:
        args: Options object; only ``args.root_dir`` is read and passed to
            ``Korpora.fetch``.
    """
    # Corpora that this test does not try to fetch.
    skipped = {
        'aihub_conversation_translation',
        'aihub_decree_translation',
        'aihub_government_website_translation',
        'aihub_korean_culture_translation',
        'aihub_news_translation',
        'aihub_spoken_translation',
        'aihub_translation',
        'modu_messenger',
        'modu_mp',
        'modu_ne',
        'modu_news',
        'modu_spoken',
        'modu_web',
        'modu_written',
    }
    for name in corpus_list:
        if name not in skipped:
            Korpora.fetch(name, root_dir=args.root_dir)
            # Short pause between consecutive fetches.
            time.sleep(0.5)
def load_modu_test(args):
    """Load each ``modu_*`` corpus with stdout suppressed, then print it.

    Args:
        args: Options object; only ``args.root_dir`` is read and passed to
            ``Korpora.load``.
    """
    modu_names = (
        'modu_messenger',
        'modu_mp',
        'modu_ne',
        'modu_news',
        'modu_spoken',
        'modu_web',
        'modu_written',
    )
    separator = '=' * 80
    for name in modu_names:
        # Only the load call runs with stdout suppressed.
        with suppress_stdout():
            loaded = Korpora.load(name, root_dir=args.root_dir)
        print(loaded, end=f'\n\n{separator}\n\n', flush=True)
        time.sleep(0.5)
def test_usage():
    """Check the 'naver_changwon_ner' train split size, indexing and iteration."""
    ner = Korpora.load('naver_changwon_ner')
    direct = NaverChangwonNERKorpus()
    assert len(ner.train) == len(direct.train)
    # Indexing must work; the value itself is not inspected.
    _ = ner.train[0]
    assert len(ner.train) == 90000
    # Walk the whole split to make sure iteration completes.
    for _ in ner.train:
        pass
def best_layer(args):
    """Find the best-performing BERT layer on KorSTS and optionally save plots.

    The train and dev splits of the 'korsts' corpus supply the reference
    sentences, candidate sentences and quality labels that are handed to
    ``find_best_layer``.

    Args:
        args: Options object. Attributes read here: ``model_name_or_path``,
            ``rescale_base``, ``draw_plot``, ``output_dir``, ``device`` and
            ``batch_size``.

    Raises:
        ValueError: If ``rescale_base`` is not strictly between -1 and 1, or
            if ``draw_plot`` is set while ``output_dir`` is ``None``.
    """
    print(
        f'Finding best performance BERT layer with {args.model_name_or_path}')

    # The bounds are exclusive, so the message states an open interval.
    if not -1 < args.rescale_base < 1:
        raise ValueError("`rescale_base` must be in (-1, 1)")
    if args.draw_plot and args.output_dir is None:
        raise ValueError('Set `output_dir` when use `draw_plot`')

    device = args.device
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load pretrained BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    encoder = BertModel.from_pretrained(args.model_name_or_path).to(device)

    # Load STS corpus
    corpus = Korpora.load('korsts')
    references = []
    candidates = []
    qualities = []
    for data in (corpus.train, corpus.dev):
        for example in data:
            references.append(example.text)
            candidates.append(example.pair)
            qualities.append(example.label)

    # Train IDF
    idf = train_idf(tokenizer, references)
    idf_embed = idf_numpy_to_embed(idf)

    # Find best layer
    model_name = args.model_name_or_path.split(os.path.sep)[-1]
    # Named differently from the function to avoid shadowing it locally.
    best_layer_index, informations = find_best_layer(
        tokenizer, encoder, references, candidates, qualities,
        idf=idf_embed, rescale_base=args.rescale_base,
        model_name=model_name, batch_size=args.batch_size)
    print(f' - Best performance layer : {best_layer_index}')

    # Save figures
    if args.draw_plot:
        # `output_dir` cannot be None here: that case was rejected above.
        dirname = os.path.abspath(args.output_dir)
        print(f'Saving figures at {dirname}')
        os.makedirs(dirname, exist_ok=True)
        # Installs a process-wide "ignore" filter before saving the figures.
        warnings.filterwarnings("ignore")
        for figure_key in ('R', 'P', 'F', 'RPF'):
            save(informations['figures'][figure_key],
                 f'{dirname}/{figure_key}.html')
def test_usage():
    """Check the 'naver_changwon_ner' train split and print its string forms."""
    ner = Korpora.load('naver_changwon_ner')
    direct = NaverChangwonNERKorpus()
    assert len(ner.train) == len(direct.train)
    # Indexing must work; the value itself is not inspected.
    _ = ner.train[0]
    assert len(ner.train) == 90000
    print(f'str(korpus)\n{str(ner)}')
    print(f'str(korpus.train)\n{str(ner.train)}')
    # Walk the whole split to make sure iteration completes.
    for _ in ner.train:
        pass
def test_usage():
    """Check the expected split sizes of the 'kornli' corpus."""
    kornli = Korpora.load('kornli')
    assert len(kornli.snli_train) == len(KorNLIKorpus().snli_train)
    # Indexing must work; the value itself is not inspected.
    _ = kornli.snli_train[0]
    expected_sizes = (
        ('snli_train', 550152),
        ('xnli_dev', 2490),
        ('xnli_test', 5010),
        ('multinli_train', 392702),
    )
    for split_name, size in expected_sizes:
        assert len(getattr(kornli, split_name)) == size
    # Walk the whole split to make sure iteration completes.
    for _ in kornli.snli_train:
        pass
def test_usage():
    """Check split sizes of 'korean_hate_speech' and iterate every split."""
    loaded = Korpora.load('korean_hate_speech')
    direct = KoreanHateSpeechKorpus()
    assert len(loaded.unlabeled) == len(direct.unlabeled) == 2033893
    for split_name, size in (('test', 974), ('dev', 471), ('train', 7896)):
        assert len(getattr(loaded, split_name)) == size
    # Walk each split to make sure iteration completes.
    splits = (loaded.train, loaded.dev, loaded.test, loaded.unlabeled)
    for split in splits:
        for _ in split:
            pass
def test_usage():
    """Check split sizes of 'korean_parallel_koen_news' and print its string forms."""
    loaded = Korpora.load('korean_parallel_koen_news')
    direct = KoreanParallelKOENNewsKorpus()
    assert len(loaded.train) == len(direct.train) == 94123
    for split_name, size in (('dev', 1000), ('test', 2000)):
        assert len(getattr(loaded, split_name)) == size
    print(f'str(korpus)\n{str(loaded)}')
    print(f'str(korpus.train)\n{str(loaded.train)}')
    # Walk each split to make sure iteration completes.
    splits = (loaded.train, loaded.dev, loaded.test)
    for split in splits:
        for _ in split:
            pass
def test_usage():
    """Check size, first-record fields and metadata of 'korean_petitions'."""
    petitions = Korpora.load('korean_petitions')
    direct = KoreanPetitionsKorpus()
    assert len(petitions.train) == len(direct.train)
    assert len(petitions.train) == 433631

    # Field checks on the first record of the train split.
    assert len(petitions.train[0].text) == 1491
    expected_dates = (('begin', '2017-08-25'), ('end', '2017-09-24'))
    for field, value in expected_dates:
        assert getattr(petitions.train[0], field) == value
    assert petitions.train.titles[0] == petitions.train[0].title

    # Corpus-level metadata.
    assert '청와대 국민청원' in petitions.description
    assert 'CC0 1.0 Universal' in petitions.license

    # Walk the whole split to make sure iteration completes.
    for _ in petitions.train:
        pass
def test_usage():
    """Check sizes and metadata of the 'nsmc' corpus, then iterate train."""
    nsmc = Korpora.load('nsmc')
    expected_sizes = (('train', 150000), ('test', 50000))
    for split_name, size in expected_sizes:
        assert len(getattr(nsmc, split_name).texts) == size
        assert len(getattr(nsmc, split_name).labels) == size

    # Corpus-level metadata.
    assert 'Naver sentiment movie corpus v1.0' in nsmc.description
    assert 'CC0 1.0 Universal' in nsmc.license

    assert len(NSMCKorpus().train) == len(nsmc.train)
    # Indexing must work; the value itself is not inspected.
    _ = nsmc.train[0]
    # Walk the whole split to make sure iteration completes.
    for _ in nsmc.train:
        pass
def load_small_test(args):
    """Load and print every corpus in ``corpus_list`` except the excluded ones.

    Args:
        args: Options object; only ``args.root_dir`` is read and passed to
            ``Korpora.load``.
    """
    # Corpora that this test does not load.
    skipped = {
        'kcbert',
        'kowikitext',
        'namuwikitext',
        'modu_messenger',
        'modu_mp',
        'modu_ne',
        'modu_news',
        'modu_spoken',
        'modu_web',
        'modu_written',
    }
    separator = '=' * 80
    for name in corpus_list:
        if name not in skipped:
            # Only the load call runs with stdout suppressed.
            with suppress_stdout():
                loaded = Korpora.load(name, root_dir=args.root_dir)
            print(loaded, end=f'\n\n{separator}\n\n', flush=True)
            time.sleep(0.5)
def test_usage():
    """Check split sizes and aggregate getters of the 'question_pair' corpus."""
    pair = Korpora.load('question_pair')
    assert len(pair.train) == len(QuestionPairKorpus().train)
    # Indexing must work on both splits; the values are not inspected.
    _ = pair.train[0]
    _ = pair.test[0]
    for split_name, size in (('train', 6888), ('test', 688)):
        assert len(getattr(pair, split_name)) == size
    for getter_name in ('get_all_pairs', 'get_all_labels'):
        assert len(getattr(pair, getter_name)()) == 7576
    # Walk both splits to make sure iteration completes.
    for _ in pair.train:
        pass
    for _ in pair.test:
        pass
def test_usage():
    """Check existence, split sizes and string forms of the 'kornli' corpus."""
    kornli = Korpora.load('kornli')
    assert kornli.exists()
    assert len(kornli.snli_train) == len(KorNLIKorpus().snli_train)
    # Indexing must work; the value itself is not inspected.
    _ = kornli.snli_train[0]
    expected_sizes = (
        ('snli_train', 550152),
        ('xnli_dev', 2490),
        ('xnli_test', 5010),
        ('multinli_train', 392702),
    )
    for split_name, size in expected_sizes:
        assert len(getattr(kornli, split_name)) == size
    print(f'str(korpus)\n{str(kornli)}')
    print(f'str(korpus.multinli_train)\n{str(kornli.multinli_train)}')
    # Walk the whole split to make sure iteration completes.
    for _ in kornli.snli_train:
        pass
def test_usage():
    """Check existence, split sizes and string forms of 'korean_hate_speech'."""
    loaded = Korpora.load('korean_hate_speech')
    direct = KoreanHateSpeechKorpus()
    assert loaded.exists()
    assert direct.exists()
    assert len(loaded.unlabeled) == len(direct.unlabeled) == 2033893
    for split_name, size in (('test', 974), ('dev', 471), ('train', 7896)):
        assert len(getattr(loaded, split_name)) == size
    print(f'str(korpus)\n{str(loaded)}')
    print(f'str(korpus.train)\n{str(loaded.train)}')
    # Walk each split to make sure iteration completes.
    splits = (loaded.train, loaded.dev, loaded.test, loaded.unlabeled)
    for split in splits:
        for _ in split:
            pass
def test_usage():
    """Check sizes, aggregate getters and metadata of 'korean_chatbot_data'."""
    chatbot_data = Korpora.load('korean_chatbot_data')
    assert len(chatbot_data.train) == len(KoreanChatbotKorpus().train)

    # Every per-split column and every aggregate getter has the same length.
    expected = 11823
    for column in ('texts', 'pairs', 'labels'):
        assert len(getattr(chatbot_data.train, column)) == expected
    for getter_name in ('get_all_texts', 'get_all_pairs', 'get_all_labels'):
        assert len(getattr(chatbot_data, getter_name)()) == expected

    # Corpus-level metadata.
    assert 'Chatbot_data_for_Korean v1.0' in chatbot_data.description
    assert 'CC0 1.0 Universal' in chatbot_data.license

    # Indexing must work; the value itself is not inspected.
    _ = chatbot_data.train[0]
    # Walk the whole split to make sure iteration completes.
    for _ in chatbot_data.train:
        pass
def test_usage():
    """Check existence, sizes and string forms of the 'question_pair' corpus."""
    pair = Korpora.load('question_pair')
    assert pair.exists()
    assert len(pair.train) == len(QuestionPairKorpus().train)
    # Indexing must work on both splits; the values are not inspected.
    _ = pair.train[0]
    _ = pair.test[0]
    for split_name, size in (('train', 6888), ('test', 688)):
        assert len(getattr(pair, split_name)) == size
    for getter_name in ('get_all_pairs', 'get_all_labels'):
        assert len(getattr(pair, getter_name)()) == 7576
    print(f'str(korpus)\n{str(pair)}')
    print(f'str(korpus.train)\n{str(pair.train)}')
    # Walk both splits to make sure iteration completes.
    for _ in pair.train:
        pass
    for _ in pair.test:
        pass
def test_usage():
    """Check that every 'korsts' split and its getters agree on the split size."""
    korsts = Korpora.load('korsts')
    assert len(korsts.train) == len(KorSTSKorpus().train)
    # Indexing must work on every split; the values are not inspected.
    _ = korsts.train[0]
    _ = korsts.dev[0]
    _ = korsts.test[0]

    expected_sizes = (
        (korsts.train, 5749),
        (korsts.dev, 1500),
        (korsts.test, 1379),
    )
    for split, size in expected_sizes:
        assert (
            len(split)
            == len(split.get_all_pairs())
            == len(split.get_all_labels())
            == len(split.get_all_genres())
            == len(split.get_all_filenames())
            == len(split.get_all_years())
            == size
        )

    # Walk each split to make sure iteration completes.
    for split, _ in expected_sizes:
        for _ in split:
            pass
# %% set random seed from ratsnlp import nlpbook nlpbook.set_seed(args) # %% set logger nlpbook.set_logger(args) # %% download corpus from Korpora import Korpora Korpora.fetch( args.downstream_corpus_name, root_dir=args.downstream_corpus_root_dir, force_download=args.force_download, ) # %% prepare tokenizer from transformers import PreTrainedTokenizerFast tokenizer = PreTrainedTokenizerFast.from_pretrained( args.pretrained_model_name, eos_token="</s>", ) # %% create train dataset from torch.utils.data import DataLoader, SequentialSampler, RandomSampler from ratsnlp.nlpbook.generation import GenerationDataset, NsmcCorpus
#!/usr/bin/env python3
"""Morphologically analyse the 'question_pair' corpus with Mecab."""
from Korpora import Korpora, QuestionPairKorpus
from konlpy.tag import Mecab

question_pair = Korpora.load('question_pair')
mecab = Mecab()
train_fn = 'qqp.train'
test_fn = 'qqp.test'


def get_morph_tag(text):
    """Split ``text`` into parallel lists of morphemes and POS tags.

    Args:
        text: Sentence passed to ``mecab.pos``.

    Returns:
        A ``(morphs, tags)`` tuple of two lists, aligned by position.
    """
    analyzed = mecab.pos(text)
    morphs = [morph for morph, _ in analyzed]
    tags = [tag for _, tag in analyzed]
    return morphs, tags


def analyze_and_save(fn, df):
    """Write one tab-separated line per question pair to ``fn``.

    Each line holds, in order: the text, the pair, the label, and the
    space-joined morphemes and tags of the text and of the pair.

    Args:
        fn: Path of the output file; it is opened in write mode.
        df: Iterable of records exposing ``text``, ``pair`` and ``label``.
    """
    # The context manager closes the file even if analysis of a record fails;
    # the previous version never closed the handle.
    with open(fn, 'w') as fp:
        for qpair in df:
            text_morphs, text_tags = get_morph_tag(qpair.text)
            pair_morphs, pair_tags = get_morph_tag(qpair.pair)
            text_morphs, text_tags = ' '.join(text_morphs), ' '.join(text_tags)
            pair_morphs, pair_tags = ' '.join(pair_morphs), ' '.join(pair_tags)
            fp.write('{}\n'.format('\t'.join([
                qpair.text, qpair.pair, qpair.label,
                text_morphs, text_tags,
                pair_morphs, pair_tags,
            ])))
def load_large_test(args):
    """Load and print the 'kcbert', 'kowikitext' and 'namuwikitext' corpora.

    Args:
        args: Options object; only ``args.root_dir`` is read and passed to
            ``Korpora.load``.
    """
    large_names = ('kcbert', 'kowikitext', 'namuwikitext')
    separator = '=' * 80
    for name in large_names:
        loaded = Korpora.load(name, root_dir=args.root_dir)
        print(loaded, end=f'\n\n{separator}\n\n', flush=True)
        time.sleep(0.5)
def exists_test(args):
    """Print, for each corpus in ``corpus_list``, the result of ``Korpora.exists``.

    Args:
        args: Options object; only ``args.root_dir`` is read and passed to
            ``Korpora.exists``.
    """
    for name in corpus_list:
        print(name, Korpora.exists(name, root_dir=args.root_dir))
        time.sleep(0.1)
def __init__(self) -> None:
    """Load the 'nsmc' corpus with a forced download and create output dirs."""
    self.nsmc = Korpora.load("nsmc", force_download=True)
    # Create both output directories; existing ones are left untouched.
    for directory in ("nlpbook/bbpe", "nlpbook/wordpiece"):
        os.makedirs(directory, exist_ok=True)
import os
import shlex

from Korpora import Korpora

# List every available corpus with its description, then ask which to store.
rawDataList = Korpora.corpus_list()
for key in rawDataList:
    print(key, ": ", rawDataList[key])
target = input("저장할 말 뭉치를 위에서 고르세요: ")
Korpora.fetch(target)

# `target` is raw user input that ends up inside a shell command line, so it
# is quoted first. For names without shell metacharacters, shlex.quote returns
# the text unchanged and the command is identical to the unquoted form. The
# `~` and `*.txt` parts stay outside the quoted text so the shell still
# expands them.
quoted_target = shlex.quote(target)
print(
    os.system("mv ~/Korpora/" + quoted_target
              + "/*.txt ~/PycharmProjects/ginger/localdata/raw/ko/raw"
              + quoted_target + ".txt"))
# Remove everything under ~/Korpora afterwards.
print(os.system("rm -rf ~/Korpora/*"))
def get_corpus_specifications(self):
    """Print one line per Korpora corpus: its name followed by its description."""
    from Korpora import Korpora

    # Name is left-aligned in a 40-character column.
    row_format = "{:<40} {:<}"
    catalogue = Korpora.corpus_list()
    for corpus_name, description in catalogue.items():
        print(row_format.format(corpus_name, description))
import argparse
import time
import sys, os
from contextlib import contextmanager
from Korpora import (Korpora, ModuNewsKorpus, ModuMessengerKorpus, ModuMorphemeKorpus, ModuNEKorpus, ModuSpokenKorpus, ModuWebKorpus, ModuWrittenKorpus)

# Mapping returned by Korpora.corpus_list(), evaluated once at import time.
corpus_list = Korpora.corpus_list()


@contextmanager
def suppress_stdout():
    """Redirect ``sys.stdout`` to ``os.devnull`` for the duration of the block.

    The previous ``sys.stdout`` object is restored on exit, including when
    the wrapped block raises.
    """
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:
            yield
        finally:
            # Always put the original stream back.
            sys.stdout = old_stdout


def fetch_test(args):
    # NOTE(review): only the exclusion set is visible here; the rest of this
    # function appears to lie outside this fragment.
    exclusive_fetch_test = { 'aihub_conversation_translation', 'aihub_decree_translation', 'aihub_government_website_translation', 'aihub_korean_culture_translation', 'aihub_news_translation', 'aihub_spoken_translation', 'aihub_translation', 'modu_messenger', 'modu_mp', 'modu_ne', 'modu_news', 'modu_spoken', 'modu_web', 'modu_written' }
epochs=3, #tpu_cores=0 if torch.cuda.is_available() else 8, tpu_cores=0, seed=7, ) # %% set random seed nlpbook.set_seed(args) # %% set logger nlpbook.set_logger(args) # %% download corpus Korpora.fetch( corpus_name=args.downstream_corpus_name, root_dir=args.downstream_corpus_root_dir, force_download=True, ) # %% Tokenizer tokenizer = BertTokenizer.from_pretrained( args.pretrained_model_name, do_lower_case=False, ) # %% create train dataset corpus = NsmcCorpus() train_dataset = ClassificationDataset( args=args, corpus=corpus, tokenizer=tokenizer,
def build_corpus(self, corpus_name):
    """Load the corpus called ``corpus_name`` with ``Korpora.load`` and return it."""
    from Korpora import Korpora

    corpus = Korpora.load(corpus_name)
    return corpus