def test_add_pre_analyzed_word():
    """`add_pre_analyzed_word` must reject malformed analyses and honor span offsets.

    Checks that a mixed tuple/str morpheme list raises ValueError, then registers
    a valid pre-analyzed form and verifies each token's form/tag/start/end.
    """
    kiwi = Kiwi()
    # Exercise tokenization once before registering custom entries.
    kiwi.tokenize("팅겼어")
    # A morph list mixing (form, tag) tuples with "form/TAG" strings is invalid.
    # NOTE: the original used a bare `except:` here, which swallowed the
    # AssertionError raised on the no-exception path (and any other exception),
    # masking the real failure cause. try/except/else keeps the catch narrow.
    try:
        kiwi.add_pre_analyzed_word("팅겼어", [("팅기", "VV"), "었/EP", "어/EF"])
    except ValueError:
        pass
    else:
        raise AssertionError("expected to raise `ValueError`")
    kiwi.add_user_word("팅기", "VV", orig_word="튕기")
    kiwi.add_pre_analyzed_word(
        "팅겼어",
        [("팅기", "VV", 0, 2), ("었", "EP", 1, 2), ("어", "EF", 2, 3)],
    )
    res = kiwi.tokenize("팅겼어...")
    # Compare (form, tag, start, end) per token against the registered analysis.
    assert (res[0].form, res[0].tag, res[0].start, res[0].end) == ("팅기", "VV", 0, 2)
    assert (res[1].form, res[1].tag, res[1].start, res[1].end) == ("었", "EP", 1, 2)
    assert (res[2].form, res[2].tag, res[2].start, res[2].end) == ("어", "EF", 2, 3)
    assert (res[3].form, res[3].tag, res[3].start, res[3].end) == ("...", "SF", 3, 6)
def test_tokenize():
    """Smoke-test `tokenize` both as a flat list and split by sentence."""
    kiwi = Kiwi()
    text = "다녀온 후기\n\n<강남 토끼정에 다녀왔습니다.> 음식도 맛있었어요 다만 역시 토끼정 본점 답죠?ㅎㅅㅎ 그 맛이 크으.. 아주 맛있었음...! ^^"
    # Flat tokenization with coda normalization enabled.
    print(kiwi.tokenize(text, normalize_coda=True))
    # Same input, but grouped per detected sentence.
    sentence_tokens = kiwi.tokenize(text, normalize_coda=True, split_sents=True)
    for tokens in sentence_tokens:
        print(tokens)
def test_tokenize_with_stopwords():
    """Tokenizing with a Stopwords filter should leave only content morphemes."""
    kiwi = Kiwi()
    result = kiwi.tokenize("[^^ 우리는 강아지를 좋아한다.]", stopwords=Stopwords())
    # With stopwords removed, the first two surviving tokens are the content words.
    assert result[0].form == '강아지'
    assert result[1].form == '좋아하'
class KiwiModel(Model):
    """Adapter exposing kiwipiepy's Kiwi analyzer through the Model interface."""

    def __init__(self):
        # Import lazily so the dependency is only required when this model is used.
        import kiwipiepy
        from kiwipiepy import Kiwi
        print("Initialize kiwipiepy ({})".format(kiwipiepy.__version__), file=sys.stderr)
        self._mdl = Kiwi()

    def _convert(self, morph):
        """Map a Kiwi morpheme to (form, coarse_tag).

        Verb-family tags ('V...') keep two letters; all other tags are
        truncated to their first letter.
        """
        tag = morph.tag
        keep = 2 if tag.startswith('V') else 1
        return morph.form, tag[:keep]

    def _tokenize(self, text):
        # Delegate directly to the underlying analyzer.
        return self._mdl.tokenize(text)
def baseline_splitter(text):
    """Naive baseline: split on any whitespace that follows '.', '!' or '?'."""
    import re
    return re.split(r'(?<=[.!?])\s', text)


if __name__ == '__main__':
    import argparse

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('datasets', nargs='+')
    arg_parser.add_argument('--write_result')
    arg_parser.add_argument('--write_err')
    opts = arg_parser.parse_args()

    # Evaluate the regex baseline on every dataset first.
    print('======== Baseline Splitter ========')
    for ds in opts.datasets:
        run_evaluate(ds, baseline_splitter)

    # Then evaluate Kiwi's built-in sentence splitter on the same datasets.
    print('======== Kiwi.split_into_sents ========')
    from kiwipiepy import Kiwi
    kiwi = Kiwi()
    kiwi.tokenize("foo-bar")  # warm-up
    for ds in opts.datasets:
        run_evaluate(
            ds,
            lambda text: [
                s.text for s in kiwi.split_into_sents(text, normalize_coda=True)
            ],
            opts.write_result,
            opts.write_err,
        )