def __init__(
    self,
    knp_kwargs: Optional[Dict[str, str]] = None,
    preprocessor: Optional[Callable[[str], str]] = None,
):
    """Create the wrapped `pyknp.KNP` analyzer.

    Args:
        knp_kwargs: keyword arguments passed to `pyknp.KNP.__init__`.
            When the "jumancommand" key is absent, it is filled in with
            the result of `get_juman_command()`. The caller's dict is
            copied and never modified.
        preprocessor: accepted for interface compatibility; this
            constructor neither calls nor stores it.

    Raises:
        RuntimeError: if "jumancommand" is not supplied and
            `get_juman_command()` returns a falsy value.
    """
    # Imported lazily so that pyknp is only required when this class is used.
    import pyknp

    # Work on a copy: filling in the default must not leak into the
    # dict object owned by the caller.
    knp_kwargs = dict(knp_kwargs or {})

    # Only look up the default when the caller did not choose a command,
    # so an explicit "jumancommand" keeps working even if no default exists.
    if "jumancommand" not in knp_kwargs:
        cmd = get_juman_command()
        if not cmd:
            # Explicit raise instead of `assert`, which is removed under -O.
            raise RuntimeError(
                "No juman command available: 'jumancommand' was not given in "
                "knp_kwargs and get_juman_command() returned nothing."
            )
        knp_kwargs["jumancommand"] = cmd

    self.knp = pyknp.KNP(**knp_kwargs)
    self.knp_kwargs = knp_kwargs
def __init__(
    self,
    cls: Type["Defaults"],
    nlp: Optional[Language] = None,
    juman_kwargs: Optional[Dict[str, str]] = None,
    preprocessor: Optional[Callable[[str], str]] = han_to_zen_normalize,
):
    """Set up the vocab and the wrapped `pyknp.Juman` tokenizer.

    Args:
        cls: defaults class; `cls.create_vocab(nlp)` builds the vocab
            when `nlp` is None.
        nlp: when given, its `vocab` is reused.
        juman_kwargs: passed to `pyknp.Juman.__init__`. When the
            "command" key is absent, it is filled in with the result of
            `get_juman_command()`. The caller's dict is copied and never
            modified.
        preprocessor: applied to text before tokenizing.
            `mojimoji.han_to_zen` is often used. Stored as
            `self.preprocessor`.

    Raises:
        RuntimeError: if "command" is not supplied and
            `get_juman_command()` returns a falsy value.
    """
    # Imported lazily so that pyknp is only required when this class is used.
    from pyknp import Juman

    # Work on a copy: filling in the default must not leak into the
    # dict object owned by the caller.
    juman_kwargs = dict(juman_kwargs or {})

    # Only look up the default when the caller did not choose a command,
    # so an explicit "command" keeps working even if no default exists.
    if "command" not in juman_kwargs:
        default_command = get_juman_command()
        if not default_command:
            # Explicit raise instead of `assert`, which is removed under -O.
            raise RuntimeError(
                "No juman command available: 'command' was not given in "
                "juman_kwargs and get_juman_command() returned nothing."
            )
        juman_kwargs["command"] = default_command

    self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
    # juman_kwargs always contains "command" at this point, so there is no
    # empty-kwargs case to special-case.
    self.tokenizer = Juman(**juman_kwargs)
    self.juman_kwargs = juman_kwargs
    self.preprocessor = preprocessor