class KNP(object):
    """Module that performs syntactic parsing using KNP.

    The input sentence is first analysed by JUMAN (or JUMAN++), and the
    morphological output is then sent to KNP, either through a socket (when
    ``server`` is given) or through a local subprocess.

    Args:
        command (str): KNP executable, used only in subprocess mode.
        option (str): KNP options; must contain ``-tab`` because only the
            tab format is parsed.
        rcfile (str): Path to a KNP rc file; appended as ``-r`` in
            subprocess mode.
        server (str): KNP/JUMAN server host; ``None`` selects subprocess mode.
        port (int): KNP server port; must differ from ``juman_port``.
        timeout (int): Timeout handed to the KNP socket.
        pattern (str): Regular expression marking the end of KNP output;
            must contain ``EOS``.
        jumanrcfile (str): rc file handed to the morphological analyser.
        juman_option (str): Options handed to the morphological analyser.
        juman_port (int): Port handed to the morphological analyser.
        juman_command (str): Only compared against ``"jumanpp"`` to select
            JUMAN++; it is not forwarded to the analyser.
        jumanpp (bool): Force the use of JUMAN++.
    """

    def __init__(self, command='knp', option='-tab', rcfile='', server=None,
                 port=31000, timeout=30, pattern=r'(?:^|\n)EOS($|\n)',
                 jumanrcfile='', juman_option='-e2 -B', juman_port=32000,
                 juman_command='juman', jumanpp=False):
        self.use_jumanpp = (juman_command == "jumanpp") or jumanpp
        assert 'EOS' in pattern
        self.pattern = pattern
        self.EOS = 'EOS'
        # Only the tab format can be parsed.
        assert '-tab' in option

        # Expand "~" once so that the path we validate is the path KNP gets.
        rcfile_path = os.path.expanduser(rcfile) if rcfile else ''
        if rcfile and not os.path.isfile(rcfile_path):
            sys.stderr.write("Can't read rcfile (%s)!\n" % rcfile)
            # sys.exit raises the same SystemExit as the site-provided
            # quit(), but is always available and does not close stdin.
            sys.exit(1)

        # Setup Juman(++)
        assert port != juman_port
        juman_args = {'option': juman_option, 'rcfile': jumanrcfile,
                      'server': server, 'port': juman_port}
        if self.use_jumanpp:
            self.juman = Jumanpp(**juman_args)
        else:
            self.juman = Juman(**juman_args)

        # Setup KNP
        if server is not None:
            self.socket = Socket(server, port, option=option, timeout=timeout)
            self.query = partial(self.socket.query, pattern=pattern)
        else:
            if rcfile:
                option += " -r {}".format(rcfile_path)
            self.subprocess = Subprocess(command, option=option)
            self.query = partial(self.subprocess.query, pattern=pattern)

    def parse_sentence(self, sentence):
        """Run JUMAN then KNP on ``sentence`` and return KNP's raw output.

        The ``EOS`` terminator is appended to the JUMAN output unless one of
        its lines already is exactly ``EOS``.
        """
        assert isinstance(sentence, str)
        juman_lines = self.juman.juman_lines(sentence)
        # Compare whole lines: a plain substring test would be fooled by a
        # sentence that itself contains the text "EOS".
        if self.EOS not in juman_lines.splitlines():
            if juman_lines and not juman_lines.endswith('\n'):
                # Keep the terminator on its own line.
                juman_lines += '\n'
            juman_lines += self.EOS
        return self.query(juman_lines)

    def knp(self, sentence):
        """Parse ``sentence`` and return the result wrapped in a ``BList``."""
        assert isinstance(sentence, str)
        result = BList(self.parse_sentence(sentence))
        return result

    def parse(self, sentence):
        """Parse the string ``sentence`` and return the parse result object."""
        return self.knp(sentence)

    def result(self, input_str):
        """Wrap already-produced KNP output ``input_str`` in a ``BList``."""
        return BList(input_str, self.EOS)
class KNP(object):
    """Module that parses text with KNP and reads KNP analysis results.

    Args:
        command (str): KNP command.
        option (str): KNP analysis options (``-tab``, which outputs the
            detailed analysis result, is required; e.g. ``-anaphora`` for
            ellipsis/anaphora resolution, ``-dpnd`` for dependency parsing
            only, without case analysis).
        rcfile (str): Path to the KNP configuration file.
        pattern (str): Terminator symbol of KNP output.
        jumancommand (str): JUMAN command.
        jumanrcfile (str): Path to the JUMAN configuration file.
        jumanpp (bool): Whether to use JUMAN++ or JUMAN.

    Raises:
        Exception: If ``rcfile`` is given but is not a file, or if
            ``command`` cannot be found.
    """

    def __init__(self, command='knp', server=None, port=31000, timeout=60,
                 option='-tab', rcfile='', pattern=r'EOS',
                 jumancommand='jumanpp', jumanrcfile='', jumanoption='',
                 jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.options = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        # The connection to KNP is created lazily on first use.
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile,
                           option=jumanoption, jumanpp=self.jumanpp)

    def knp(self, sentence):
        """Same as :meth:`parse`.

        Returns:
            BList: Bunsetsu list object.
        """
        # The result must be returned; otherwise this alias yields None.
        return self.parse(sentence)

    def parse(self, sentence, juman_format=JUMAN_FORMAT.DEFAULT):
        """Run morphological analysis and parsing on the input string.

        Args:
            sentence (str): String representing a sentence.
            juman_format (JUMAN_FORMAT): Lattice output format of Juman.

        Returns:
            BList: Bunsetsu list object.
        """
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)
        return self.parse_juman_result(juman_str, juman_format)

    def parse_juman_result(self, juman_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Parse JUMAN output with KNP and return a bunsetsu list object.

        Args:
            juman_str (str): JUMAN output for one sentence.
            juman_format (JUMAN_FORMAT): Lattice output format of Juman.

        Returns:
            BList: Bunsetsu list object.
        """
        if not self.socket and not self.subprocess:
            if self.server is not None:
                # NOTE: in server mode the request line is fixed; the
                # configured options and timeout are not forwarded here.
                self.socket = Socket(self.server, self.port,
                                     "RUN -tab -normal\n")
            else:
                command = [self.command] + self.options
                if self.rcfile:
                    # Pass the expanded path: it is the one validated in
                    # __init__, and "~" is not expanded inside an argv list.
                    command.extend(['-r', os.path.expanduser(self.rcfile)])
                self.subprocess = Subprocess(command)

        end_of_output = r'^%s$' % self.pattern
        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=end_of_output)
        else:
            knp_lines = self.subprocess.query(juman_str,
                                              pattern=end_of_output)
        return BList(knp_lines, self.pattern, juman_format)

    def reparse_knp_result(self, knp_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Run KNP once more on KNP output and return a bunsetsu list object.

        Used, for example, to re-assign KNP features. The implementation is
        the same as :meth:`parse_juman_result`.

        Args:
            knp_str (str): KNP output for one sentence.
            juman_format (JUMAN_FORMAT): Lattice output format of Juman.

        Returns:
            BList: Bunsetsu list object.
        """
        return self.parse_juman_result(knp_str, juman_format=juman_format)

    def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Convert the KNP analysis result of one sentence into a BList.

        Args:
            input_str (str): KNP output for one sentence.
            juman_format (JUMAN_FORMAT): Lattice output format of Juman.

        Returns:
            BList: Bunsetsu list object.
        """
        return BList(input_str, self.pattern, juman_format)
class Tokenizer(SerializationMixin):
    """Juman tokenizer

    Note:
        `spacy.Token._.fstring` is set. The Juman's output is stored into it during tokenizing.
    """

    serialization_fields = ["preprocessor", "juman_kwargs"]
    key_fstring = KEY_FSTRING

    @classmethod
    def install_extensions(cls):
        """See https://github.com/explosion/spacy-pytorch-transformers#extension-attributes."""
        Token.set_extension(cls.key_fstring, default=None, force=True)

    def __init__(
        self,
        cls: Type["Defaults"],
        nlp: Optional[Language] = None,
        juman_kwargs: Optional[Dict[str, str]] = None,
        preprocessor: Optional[Callable[[str], str]] = han_to_zen_normalize,
    ):
        """

        Args:
            cls: provides `create_vocab` when `nlp` is not given.
            nlp: if given, its vocab is used.
            juman_kwargs: passed to `pyknp.Juman.__init__`
            preprocessor: applied to text before tokenizing. `mojimoji.han_to_zen` is often used.
        """
        # Work on a copy so the caller's dict is not modified by setdefault.
        juman_kwargs = dict(juman_kwargs or {})
        default_command = get_juman_command()
        assert default_command
        juman_kwargs.setdefault("command", default_command)

        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.juman_kwargs = juman_kwargs
        self.preprocessor = preprocessor
        self.reset_tokenizer()

    def reset_tokenizer(self):
        """(Re)create the underlying `pyknp.Juman` instance from `juman_kwargs`."""
        from pyknp import Juman

        self.tokenizer = Juman(**(self.juman_kwargs or {}))

    def __call__(self, text: str) -> Doc:
        """Make doc from text. Juman's `fstring` is stored in `Token._.fstring`"""
        if self.preprocessor:
            text = self.preprocessor(text)
        juman_lines = self._juman_string(text)
        dtokens = self._detailed_tokens(juman_lines)
        doc = self._dtokens_to_doc(dtokens)
        doc.user_data[JUMAN_LINES] = juman_lines
        return doc

    def _juman_string(self, text: str) -> str:
        """Return the concatenated Juman output for `text`, analysed chunk by chunk."""
        # Materialize the chunks so that a retry sees all of them again.
        texts = list(_split_text_for_juman(text))
        try:
            return self._analyze_chunks(texts)
        except BrokenPipeError:
            # Juman is sometimes broken due to its subprocess management.
            # Restart it and retry once on the *split* chunks, not on the
            # whole text, which the splitting exists to avoid.
            self.reset_tokenizer()
            return self._analyze_chunks(texts)

    def _analyze_chunks(self, texts: List[str]) -> str:
        """Run Juman on every chunk and join the outputs into one string."""
        return "".join(self.tokenizer.juman_lines(chunk) for chunk in texts)

    def _dtokens_to_doc(self, dtokens: List[ShortUnitWord]) -> Doc:
        """Build a `Doc` from the detailed tokens and copy lemma, tag and fstring."""
        words = [x.surface for x in dtokens]
        spaces = [x.space for x in dtokens]
        doc = Doc(self.vocab, words=words, spaces=spaces)
        for token, dtoken in zip(doc, dtokens):
            token.lemma_ = dtoken.lemma
            token.tag_ = dtoken.pos
            token._.set(self.key_fstring, dtoken.fstring)
        doc.is_tagged = True
        return doc

    def _detailed_tokens(self, juman_lines: str) -> List[ShortUnitWord]:
        """Tokenize text with Juman and format the outputs for further processing"""
        from pyknp import MList

        ml = MList(juman_lines).mrph_list()
        words: List[ShortUnitWord] = []
        for m in ml:
            surface = m.midasi
            pos = m.hinsi + "," + m.bunrui
            # Fall back to the surface form when Juman gives no base form.
            lemma = m.genkei or surface
            words.append(ShortUnitWord(surface, lemma, pos, m.fstring, False))
        return words
from pyknp import KNP, Juman

# Demo: analyse a short two-sentence text and list its basic phrases.
line = "猫が好き。犬も好き"
jumanpp = Juman()
knp = KNP()
juman_line = jumanpp.juman_lines(line)
result = knp.parse(line)

print("基本句")
# Visit each basic phrase (tag) of the parse result.
for tag in result.tag_list():
    midasi_parts = [mrph.midasi for mrph in tag.mrph_list()]
    heading = "".join(midasi_parts)
    print("\t見出し:%s, 素性:%s" % (heading, tag.fstring))
class KNP(object):
    """Module that performs syntactic parsing using KNP.

    The sentence is analysed by JUMAN (or JUMAN++) first; the result is then
    sent to KNP through a socket (when ``server`` is set) or a subprocess.
    The connection to KNP is created lazily on the first call to
    :meth:`parse`.
    """

    def __init__(self, command='knp', server=None, port=31000, timeout=60,
                 option='-tab', rcfile='', pattern=r'EOS',
                 jumancommand='juman', jumanrcfile='', jumanpp=False):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option
        self.rcfile = rcfile
        self.pattern = pattern
        self.socket = None
        self.subprocess = None
        self.jumanpp = (jumancommand == "jumanpp") or jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            sys.stderr.write("Can't read rcfile (%s)!\n" % self.rcfile)
            # sys.exit raises the same SystemExit as the site-provided
            # quit(), but is always available and does not close stdin.
            sys.exit(1)

        if self.jumanpp:
            self.juman = Jumanpp()
        else:
            self.juman = Juman(command=jumancommand, rcfile=jumanrcfile)

    def knp(self, sentence):
        """Alias of :meth:`parse`; returns the same parse result object."""
        # The result must be returned; otherwise this alias yields None.
        return self.parse(sentence)

    def parse(self, sentence):
        """Parse the string ``sentence`` and return the parse result object."""
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)

        if not self.socket and not self.subprocess:
            if self.server is not None:
                # NOTE: in server mode the request line is fixed; the
                # configured option string is not forwarded here.
                self.socket = Socket(self.server, self.port,
                                     "RUN -tab -normal\n")
            else:
                command = "%s %s" % (self.command, self.option)
                if self.rcfile:
                    # Pass the expanded path, i.e. the one that was
                    # validated in __init__.
                    command += " -r %s" % os.path.expanduser(self.rcfile)
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=self.pattern)
        else:
            knp_lines = self.subprocess.query(juman_str, pattern=self.pattern)
        return BList(knp_lines, self.pattern)

    def result(self, input_str):
        """Wrap already-produced KNP output ``input_str`` in a ``BList``."""
        return BList(input_str, self.pattern)
class KNP(object):
    """Module that parses text with KNP and reads KNP analysis results.

    Args:
        command (str): KNP command.
        option (str): KNP analysis options (``-tab``, which outputs the
            detailed analysis result, is required; e.g. ``-anaphora`` for
            ellipsis/anaphora resolution, ``-dpnd`` for dependency parsing
            only, without case analysis).
        rcfile (str): Path to the KNP configuration file.
        pattern (str): Terminator symbol of KNP output.
        jumancommand (str): JUMAN command.
        jumanrcfile (str): Path to the JUMAN configuration file.
        jumanpp (bool): Whether to use JUMAN++ or JUMAN.

    Raises:
        Exception: If ``rcfile`` is given but is not a file, or if
            ``command`` cannot be found.
    """

    def __init__(self, command='knp', server=None, port=31000, timeout=60,
                 option='-tab', rcfile='', pattern=r'EOS',
                 jumancommand='jumanpp', jumanrcfile='', jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option
        self.rcfile = rcfile
        self.pattern = pattern
        # The connection to KNP is created lazily on first use.
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(command) is None:
            raise Exception("Can't find KNP command: %s" % command)

        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile,
                           jumanpp=self.jumanpp)

    def knp(self, sentence):
        """Same as :meth:`parse`.

        Returns:
            BList: Bunsetsu list object.
        """
        # The result must be returned; otherwise this alias yields None.
        return self.parse(sentence)

    def parse(self, sentence):
        """Parse the input string and return a bunsetsu list object.

        Args:
            sentence (str): String representing a sentence.

        Returns:
            BList: Bunsetsu list object.
        """
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)

        if not self.socket and not self.subprocess:
            if self.server is not None:
                # NOTE: in server mode the request line is fixed; the
                # configured option string is not forwarded here.
                self.socket = Socket(self.server, self.port,
                                     "RUN -tab -normal\n")
            else:
                command = "%s %s" % (self.command, self.option)
                if self.rcfile:
                    # Pass the expanded path, i.e. the one that was
                    # validated in __init__.
                    command += " -r %s" % os.path.expanduser(self.rcfile)
                self.subprocess = Subprocess(command)

        end_of_output = r'^%s$' % (self.pattern)
        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=end_of_output)
        else:
            knp_lines = self.subprocess.query(juman_str,
                                              pattern=end_of_output)
        return BList(knp_lines, self.pattern)

    def result(self, input_str):
        """Convert the KNP analysis result of one sentence into a BList.

        Args:
            input_str (str): KNP output for one sentence.

        Returns:
            BList: Bunsetsu list object.
        """
        return BList(input_str, self.pattern)
class KNP(object):
    """Module that parses text with KNP and reads KNP analysis results.

    Args:
        command (str): KNP command.
        option (str): KNP analysis options (``-tab``, which outputs the
            detailed analysis result, is required; e.g. ``-anaphora`` for
            ellipsis/anaphora resolution, ``-dpnd`` for dependency parsing
            only, without case analysis).
        rcfile (str): Path to the KNP configuration file.
        pattern (str): Terminator symbol of KNP output.
        jumancommand (str): JUMAN command.
        jumanrcfile (str): Path to the JUMAN configuration file.
        jumanpp (bool): Whether to use JUMAN++ or JUMAN.

    Raises:
        Exception: If ``rcfile`` is given but is not a file, or if
            ``command`` cannot be found.
    """

    def __init__(self, command='knp', server=None, port=31000, timeout=60,
                 option='-tab', rcfile='', pattern=r'EOS',
                 jumancommand='jumanpp', jumanrcfile='', jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        # The connection to KNP is created lazily on first use.
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile,
                           jumanpp=self.jumanpp)

    def knp(self, sentence):
        """Same as :meth:`parse`.

        Returns:
            BList: Bunsetsu list object.
        """
        # The result must be returned; otherwise this alias yields None.
        return self.parse(sentence)

    def parse(self, sentence, juman_format=JUMAN_FORMAT.DEFAULT):
        """Parse the input string and return a bunsetsu list object.

        Args:
            sentence (str): String representing a sentence.
            juman_format (JUMAN_FORMAT): Lattice output format of Juman.

        Returns:
            BList: Bunsetsu list object.
        """
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)

        if not self.socket and not self.subprocess:
            if self.server is not None:
                # NOTE: in server mode the request line is fixed; the
                # configured options are not forwarded here.
                self.socket = Socket(
                    self.server, self.port, "RUN -tab -normal\n")
            else:
                command = [self.command] + self.option
                if self.rcfile:
                    # Pass the expanded path: it is the one validated in
                    # __init__, and "~" is not expanded inside an argv list.
                    command.extend(['-r', os.path.expanduser(self.rcfile)])
                self.subprocess = Subprocess(command)

        end_of_output = r'^%s$' % (self.pattern)
        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=end_of_output)
        else:
            knp_lines = self.subprocess.query(juman_str,
                                              pattern=end_of_output)
        return BList(knp_lines, self.pattern, juman_format)

    def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Convert the KNP analysis result of one sentence into a BList.

        Args:
            input_str (str): KNP output for one sentence.
            juman_format (JUMAN_FORMAT): Lattice output format of Juman.

        Returns:
            BList: Bunsetsu list object.
        """
        return BList(input_str, self.pattern, juman_format)
class Tokenizer(SerializationMixin):
    """Juman tokenizer

    Note:
        Juman's raw output is stored in the doc's `user_data`, and each
        morpheme's `fstring` in the corresponding token's `user_data`, under
        `KEY_FSTRING`. Read them back with `get_juman_fstring`.
    """

    serialization_fields = ["preprocessor", "juman_kwargs"]
    KEY_FSTRING = "juman_fstring"
    # How many times Juman is restarted after a BrokenPipeError before the
    # error is propagated to the caller.
    max_juman_restarts = 3

    @classmethod
    def get_juman_fstring(cls, e: UserDataProto) -> str:
        """Return the Juman string stored on `e`.

        Raises:
            ValueError: if nothing has been stored on `e`.
        """
        if cls.KEY_FSTRING not in e.user_data:
            raise ValueError(f"{cls.KEY_FSTRING} is not set in {e}")
        return e.user_data[cls.KEY_FSTRING]

    @classmethod
    def set_juman_fstring(cls, e: UserDataProto, fstring: str):
        """Store `fstring` in `e.user_data` under `KEY_FSTRING`."""
        e.user_data[cls.KEY_FSTRING] = fstring

    def __init__(
        self,
        juman_kwargs: Optional[Dict[str, Any]] = None,
        preprocessor: Optional[Callable[[str], str]] = han_to_zen_normalize,
    ):
        """

        Args:
            juman_kwargs: passed to `pyknp.Juman.__init__`
            preprocessor: applied to text before tokenizing. `mojimoji.han_to_zen` is often used.
        """
        # Work on a copy so the caller's dict is not modified by setdefault.
        juman_kwargs = dict(juman_kwargs or {})
        default_command = get_juman_command()
        assert default_command
        juman_kwargs.setdefault("command", default_command)

        self.juman_kwargs = juman_kwargs
        self.preprocessor = preprocessor
        self.set_tokenizer()

    def set_tokenizer(self):
        """(Re)create the underlying `pyknp.Juman` instance from `juman_kwargs`."""
        from pyknp import Juman

        self.tokenizer = Juman(**(self.juman_kwargs or {}))

    def __call__(self, text: str) -> Doc:
        """Make doc from text. Juman's raw output is stored in the doc's `user_data`."""
        if self.preprocessor:
            text = self.preprocessor(text)
        juman_lines = self._juman_parse(text)
        dtokens = self._detailed_tokens(juman_lines)
        doc = self._dtokens_to_doc(dtokens)
        self.set_juman_fstring(doc, juman_lines)
        return doc

    def _juman_parse(self, text: str) -> str:
        """Return the concatenated Juman output for `text`, analysed chunk by chunk.

        Raises:
            BrokenPipeError: if Juman still fails after `max_juman_restarts`
                restarts.
        """
        # Materialize the chunks so that every retry sees all of them.
        texts = list(_split_text_for_juman(text))  # type: ignore
        restarts_left = self.max_juman_restarts
        while True:
            try:
                return "".join(
                    self.tokenizer.juman_lines(chunk) for chunk in texts
                )
            except BrokenPipeError:
                # Juman is sometimes broken due to its subprocess management.
                # Restart it, but give up eventually instead of looping
                # forever when it cannot be brought back.
                if restarts_left <= 0:
                    raise
                restarts_left -= 1
                self.set_tokenizer()

    def _dtokens_to_doc(self, dtokens: List[ShortUnitWord]) -> Doc:
        """Build a `Doc` from the detailed tokens and copy tag, lemma and fstring."""
        words = [x.surface + x.space for x in dtokens]
        doc = Doc.from_words(words)
        for token, dtoken in zip(doc, dtokens):
            token.tag_ = dtoken.pos
            token.lemma_ = dtoken.lemma
            self.set_juman_fstring(token, dtoken.fstring)
        return doc

    def _detailed_tokens(self, juman_lines: str) -> List[ShortUnitWord]:
        """Tokenize text with Juman and format the outputs for further processing"""
        from pyknp import MList, Morpheme  # type: ignore

        ml: List[Morpheme] = MList(juman_lines).mrph_list()
        words: List[ShortUnitWord] = []
        for m in ml:
            surface: str = m.midasi  # type: ignore
            pos: str = m.hinsi + "," + m.bunrui  # type: ignore
            # Fall back to the surface form when Juman gives no base form.
            lemma: str = m.genkei or surface  # type: ignore
            words.append(ShortUnitWord(surface, lemma, pos, m.fstring, ""))  # type: ignore
        return words