Ejemplo n.º 1
0
    def __init__(
        self,
        knp_kwargs: Optional[Dict[str, str]] = None,
        preprocessor: Optional[Callable[[str], str]] = None,
    ):
        """Create the underlying `pyknp.KNP` analyzer.

        Args:
            knp_kwargs: keyword arguments passed to `pyknp.KNP.__init__`.
                The caller's dict is copied and never modified. If the
                "jumancommand" key is absent, it is filled in with the
                value returned by `get_juman_command()`.
            preprocessor: accepted by the signature, but this constructor
                neither uses nor stores it.

        Raises:
            AssertionError: if `get_juman_command()` returns a falsy value.
        """
        # Function-scope import: pyknp is only imported when an instance is created.
        import pyknp

        cmd = get_juman_command()
        if not cmd:
            # Raised explicitly (not via `assert`) so the check is not stripped
            # under `python -O`; the exception type is kept for compatibility.
            raise AssertionError("get_juman_command() did not return a command")

        # Work on a copy so that `setdefault` below cannot leak the default
        # command into a dict owned by the caller.
        knp_kwargs = dict(knp_kwargs) if knp_kwargs else {}
        knp_kwargs.setdefault("jumancommand", cmd)

        self.knp = pyknp.KNP(**knp_kwargs)
        # The effective kwargs (including the defaulted command) are kept on the instance.
        self.knp_kwargs = knp_kwargs
Ejemplo n.º 2
0
    def __init__(
        self,
        cls: Type["Defaults"],
        nlp: Optional[Language] = None,
        juman_kwargs: Optional[Dict[str, str]] = None,
        preprocessor: Optional[Callable[[str], str]] = han_to_zen_normalize,
    ):
        """Set up the vocabulary and the `pyknp.Juman` tokenizer.

        Args:
            cls: class whose `create_vocab` is called to build the vocab
                when `nlp` is None.
            nlp: if given, its `vocab` is reused as this tokenizer's vocab.
            juman_kwargs: passed to `pyknp.Juman.__init__`. The caller's dict
                is copied and never modified. If the "command" key is absent,
                it is filled in with the value returned by
                `get_juman_command()`.
            preprocessor: applied to text before tokenizing.
                `mojimoji.han_to_zen` is often used.

        Raises:
            AssertionError: if `get_juman_command()` returns a falsy value.
        """
        # Function-scope import: pyknp is only imported when an instance is created.
        from pyknp import Juman

        default_command = get_juman_command()
        if not default_command:
            # Raised explicitly (not via `assert`) so the check is not stripped
            # under `python -O`; the exception type is kept for compatibility.
            raise AssertionError("get_juman_command() did not return a command")

        # Work on a copy so that `setdefault` below cannot leak the default
        # command into a dict owned by the caller.
        juman_kwargs = dict(juman_kwargs) if juman_kwargs else {}
        juman_kwargs.setdefault("command", default_command)

        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        # `juman_kwargs` always holds at least "command" at this point, so the
        # former `else Juman()` fallback could never run.
        self.tokenizer = Juman(**juman_kwargs)
        self.juman_kwargs = juman_kwargs
        self.preprocessor = preprocessor