# Older variant: builds its vocab from the provided nlp object or Language
# class. try_mecab_import and check_spaces are module-level helpers
# (sketched below).
from ...tokens import Doc
from ...util import DummyTokenizer


class KoreanTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        MeCab = try_mecab_import()
        # -F%f[0],%f[7] makes mecab-ko emit only the POS tag and the
        # expression field for each node (see detailed_tokens below).
        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

    def __del__(self):
        self.mecab_tokenizer.__del__()

    def __call__(self, text):
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
        for token, dtoken in zip(doc, dtokens):
            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
            token.tag_ = first_tag  # stem or pre-final ending
            token.lemma_ = dtoken["lemma"]
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text):
        # mecab-ko feature fields: POS tag[0], semantic class[1],
        # jongseong (final consonant) flag[2], reading[3], type[4],
        # start POS[5], end POS[6], expression[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
            if node.is_eos():
                break
            surface = node.surface
            feature = node.feature
            tag, _, expr = feature.partition(",")
            lemma, _, remainder = expr.partition("/")
            if lemma == "*":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}
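# Both variants call two module-level helpers that are not shown above. The
# sketches below are assumptions reconstructed from the call sites, not the
# verbatim originals: try_mecab_import() must return natto-py's MeCab class,
# and check_spaces() must yield one bool per token ("is this token followed
# by whitespace?") so Doc(words=..., spaces=...) can round-trip the text.
from typing import Iterator, List


def try_mecab_import():
    # Import lazily so the dependency error only surfaces when Korean is used.
    try:
        from natto import MeCab
        return MeCab
    except ImportError:
        raise ImportError(
            "Korean support requires natto-py plus the mecab-ko binary "
            "and the mecab-ko-dic dictionary."
        )


def check_spaces(text: str, tokens: List[str]) -> Iterator[bool]:
    prev_end = -1
    start = 0
    for token in tokens:
        # Locate each surface form in the raw text, scanning left to right.
        idx = text.find(token, start)
        if prev_end > 0:
            # A gap before this token means the previous one was space-followed.
            yield prev_end != idx
        prev_end = idx + len(token)
        start = prev_end
    if start > 0:
        yield False  # the final token is never marked as space-followed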
# Newer variant: takes a Vocab directly, is picklable via __reduce__, maps
# mecab-ko tags to coarse POS through TAG_MAP, and can score tokenization.
from typing import Any, Dict, Iterator

from ...scorer import Scorer
from ...symbols import POS, X
from ...tokens import Doc
from ...training import validate_examples
from ...util import DummyTokenizer
from ...vocab import Vocab
from .tag_map import TAG_MAP


class KoreanTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab):
        self.vocab = vocab
        MeCab = try_mecab_import()  # type: ignore[func-returns-value]
        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

    def __reduce__(self):
        # Pickling recreates the tokenizer from the vocab alone, since the
        # underlying MeCab handle cannot be serialized.
        return KoreanTokenizer, (self.vocab,)

    def __del__(self):
        self.mecab_tokenizer.__del__()

    def __call__(self, text: str) -> Doc:
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
        for token, dtoken in zip(doc, dtokens):
            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
            token.tag_ = first_tag  # stem or pre-final ending
            # Fall back to X (unknown) for tags missing from TAG_MAP rather
            # than raising a KeyError.
            token.pos = TAG_MAP[token.tag_][POS] if token.tag_ in TAG_MAP else X
            token.lemma_ = dtoken["lemma"]
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
        # mecab-ko feature fields: POS tag[0], semantic class[1],
        # jongseong (final consonant) flag[2], reading[3], type[4],
        # start POS[5], end POS[6], expression[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
            if node.is_eos():
                break
            surface = node.surface
            feature = node.feature
            tag, _, expr = feature.partition(",")
            lemma, _, remainder = expr.partition("/")
            if lemma == "*":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}

    def score(self, examples):
        validate_examples(examples, "KoreanTokenizer.score")
        return Scorer.score_tokenization(examples)
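# Minimal usage sketch for the newer variant. It assumes natto-py, the
# mecab-ko binary, and the mecab-ko-dic dictionary are installed; the sample
# sentence is illustrative.
from spacy.vocab import Vocab

tokenizer = KoreanTokenizer(Vocab())
doc = tokenizer("서울에서 살고 있어요.")
for token in doc:
    print(token.text, token.tag_, token.pos_, token.lemma_)
# The full, unsplit mecab-ko tags (e.g. "VV+EC") are kept per token:
print(doc.user_data["full_tags"])

# __reduce__ makes the tokenizer picklable even though the MeCab handle is not:
import pickle

tokenizer2 = pickle.loads(pickle.dumps(tokenizer))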