def test(self):
    """Push tags and morphemes into a TList one at a time and check its sizes.

    Covers three steps: pushing a tag that already holds a morpheme, pushing
    a tag without morphemes, and pushing a morpheme through the list itself
    (which is expected to end up in the most recently pushed tag).
    """
    tags = TList()
    head_tag = Tag(u"+ 1D <BGH:構文/こうぶん><文節内><係:文節内><文頭>"
                   u"<体言><名詞項候補><先行詞候補><正規化代表表記:構文/こうぶん>")
    head_mrph = Morpheme(u"構文 こうぶん 構文 名詞 6 普通名詞 1 * 0 * 0 \""
                         u"代表表記:構文/こうぶん カテゴリ:抽象物\" "
                         u"<代表表記:構文/こうぶん>")
    tail_tag = Tag(u"+ -1D <BGH:解析/かいせき><文末><体言><用言:判>"
                   u"<体言止><レベル:C>")
    tail_mrph = Morpheme(u"解析 かいせき 解析 名詞 6 サ変名詞 2 * 0 * 0 \""
                         u"代表表記:解析/かいせき カテゴリ:抽象物 ドメイン:教育・学習;"
                         u"科学・技術\" <代表表記:解析/かいせき>")

    def mrph_count(index):
        # Number of morphemes currently held by the tag at ``index``.
        return len(tags[index].mrph_list())

    # Step 1: a tag that already contains its morpheme.
    head_tag.push_mrph(head_mrph)
    tags.push_tag(head_tag)
    self.assertEqual(len(tags), 1)
    self.assertEqual(mrph_count(0), 1)

    # Step 2: a tag that has no morpheme yet.
    tags.push_tag(tail_tag)
    self.assertEqual(len(tags), 2)
    self.assertEqual(mrph_count(1), 0)

    # Step 3: a morpheme pushed via the list lands in the second tag.
    tags.push_mrph(tail_mrph)
    self.assertEqual(len(tags), 2)
    self.assertEqual(mrph_count(0), 1)
    self.assertEqual(mrph_count(1), 1)
def test(self):
    """Verify TList sizes while tags and morphemes are added step by step.

    The list length and the per-tag morpheme counts are checked after each
    of: adding a tag that already has a morpheme, adding an empty tag, and
    adding a morpheme through the list.
    """
    tag_list = TList()
    first_tag = Tag("+ 1D <BGH:構文/こうぶん><文節内><係:文節内><文頭>"
                    "<体言><名詞項候補><先行詞候補><正規化代表表記:構文/こうぶん>")
    first_mrph = Morpheme("構文 こうぶん 構文 名詞 6 普通名詞 1 * 0 * 0 \""
                          "代表表記:構文/こうぶん カテゴリ:抽象物\" "
                          "<代表表記:構文/こうぶん>")
    second_tag = Tag("+ -1D <BGH:解析/かいせき><文末><体言><用言:判>"
                     "<体言止><レベル:C>")
    second_mrph = Morpheme("解析 かいせき 解析 名詞 6 サ変名詞 2 * 0 * 0 \""
                           "代表表記:解析/かいせき カテゴリ:抽象物 ドメイン:教育・学習;"
                           "科学・技術\" <代表表記:解析/かいせき>")

    def morphemes_in(position):
        # Morpheme count of the tag stored at ``position`` in the list.
        return len(tag_list[position].mrph_list())

    # The first tag carries its morpheme before being added.
    first_tag.push_mrph(first_mrph)
    tag_list.push_tag(first_tag)
    self.assertEqual(len(tag_list), 1)
    self.assertEqual(morphemes_in(0), 1)

    # The second tag is added while still empty.
    tag_list.push_tag(second_tag)
    self.assertEqual(len(tag_list), 2)
    self.assertEqual(morphemes_in(1), 0)

    # A morpheme pushed through the list must go to the second tag only.
    tag_list.push_mrph(second_mrph)
    self.assertEqual(len(tag_list), 2)
    self.assertEqual(morphemes_in(0), 1)
    self.assertEqual(morphemes_in(1), 1)
def parse_keigo(tag: Tag) -> str:
    """Summarise the honorific ("敬語") information attached to ``tag``.

    Args:
        tag: Object exposing a ``features`` mapping and ``mrph_list()``; each
            morpheme must expose an ``fstring`` with ``<...>`` features.

    Returns:
        ``""`` when ``tag.features`` has no "敬語" entry. Otherwise
        ``"<keigo>:<sym>"`` where ``keigo`` is the feature value and ``sym``
        is the second colon-separated field of the last matching verb
        feature found in the morphemes (empty when none is found or when the
        honorific type is neither "尊敬表現" nor "謙譲表現").
    """
    if "敬語" not in tag.features:
        return ""

    keigo = tag.features["敬語"]
    # Feature-name prefix to search for, kept exactly as the two original
    # branches spelled it (note the humble form has no trailing colon).
    if keigo == "尊敬表現":
        prefix = "尊敬動詞:"
    elif keigo == "謙譲表現":
        prefix = "謙譲動詞"
    else:
        prefix = None

    sym = ""
    if prefix is not None:
        for mrph in tag.mrph_list():
            for feature in re.findall("<([^<>]+)>", mrph.fstring):
                if not feature.startswith(prefix):
                    continue
                fields = feature.split(":")
                # A feature without a colon has no value part; skip it
                # instead of failing with IndexError.
                if len(fields) > 1:
                    sym = fields[1]
    return f"{keigo}:{sym}"
def __init__(self,
             tag: Tag,
             dmid_offset: int,
             dtid: int,
             sid: str,
             doc_id: str,
             parent: Optional['BasePhrase'] = None,
             children: Optional[List['BasePhrase']] = None,
             ):
    """Wrap a pyknp ``Tag`` together with its document-level identifiers.

    Args:
        tag (Tag): Tag object in pyknp.
        dmid_offset (int): Document-wide morpheme ID assigned to the first
            morpheme of ``tag``; following morphemes get consecutive IDs.
        dtid (int): Document-wide tag ID.
        sid (str): Sentence ID.
        doc_id (str): Document ID.
        parent (Optional[BasePhrase]): Dependency parent.
        children (List[BasePhrase]): Dependency children; a fresh empty list
            is used when omitted.
    """
    self.tag: Tag = tag
    self.dtid: int = dtid
    self.sid: str = sid
    self.doc_id: str = doc_id
    # Map every morpheme of the tag to its document-wide morpheme ID.
    self._mrph2dmid: Dict[Morpheme, int] = {
        mrph: dmid_offset + position
        for position, mrph in enumerate(tag.mrph_list())
    }
    self.content: Morpheme = self._get_content_word()
    self.content_dmid: int = self._mrph2dmid[self.content]
    self.parent: Optional['BasePhrase'] = parent
    if children is None:
        children = []
    self.children: List['BasePhrase'] = children
def test_spec(self):
    """Rebuild a bunsetsu from two tag/morpheme pairs and compare its spec.

    Each tag receives its morpheme before being pushed into the bunsetsu;
    the resulting ``spec()`` output must equal the ``self.spec`` fixture.
    """
    bunsetsu = Bunsetsu(self.bunsetsu_str)
    fixtures = (
        (self.tag1_str, self.mrph1_str),
        (self.tag2_str, self.mrph2_str),
    )
    for tag_str, mrph_str in fixtures:
        tag = Tag(tag_str)
        tag.push_mrph(Morpheme(mrph_str))
        bunsetsu.push_tag(tag)
    self.assertEqual(bunsetsu.spec(), self.spec)
def _find_mrph_span(name: str, mrph_list: List[Morpheme], tag: Tag) -> Optional[range]:
    """Return the range of morphemes whose joined surface forms equal ``name``.

    Candidate spans end at position ``len(mrph_list) - i`` for every offset
    ``i`` smaller than the number of morphemes in ``tag`` and are grown
    leftwards one morpheme at a time.

    Args:
        name: Surface string to look for.
        mrph_list: Morphemes to search; span ends are counted from its tail.
        tag: Object whose ``mrph_list()`` length bounds how far the span end
            may move back from the tail of ``mrph_list``.

    Returns:
        ``range(start, end)`` for the first matching span, where ``start`` is
        the ``mrph_id`` of its leftmost morpheme and ``end`` is the exclusive
        end position within ``mrph_list``; ``None`` when nothing matches.
        NOTE(review): this presumes ``mrph_id`` equals the morpheme's
        position in ``mrph_list`` - confirm against callers.
    """
    total = len(mrph_list)
    name_len = len(name)
    # Offsets >= total would give an empty slice or a wrapped-around negative
    # slice that only revisits prefixes already rejected, so stop at total.
    for offset in range(min(len(tag.mrph_list()), total)):
        end_mid = total - offset
        mrph_span = ''
        for mrph in reversed(mrph_list[:end_mid]):
            mrph_span = mrph.midasi + mrph_span
            if mrph_span == name:
                return range(mrph.mrph_id, end_mid)
            if len(mrph_span) >= name_len:
                # Prepending more text can no longer produce ``name``.
                break
    return None
def parse(self, spec):
    """Read KNP output and fill in this object's state.

    Each non-blank line is dispatched on its leading marker: ``#<TAB>``
    (PAS info), ``#`` (comment / sentence ID), ``*`` (bunsetsu), ``+``
    (tag), ``!!`` / ``!`` (synonym nodes), ``EOS`` (ignored); any other
    line is treated as a morpheme. A line matching ``self.pattern`` stops
    parsing.

    Args:
        spec (str): KNP output.

    Raises:
        Exception: If a line starts with ``;;``.
    """
    for line in spec.split('\n'):
        if not line.strip():
            continue

        if line.startswith('#\t'):
            fields = line.split("\t")
            if len(fields) >= 3 and fields[1] == "PAS":
                self._pinfos.append(fields[2])
            continue

        if line.startswith('#'):
            # Comment lines are accumulated, separated by newlines.
            separator = "\n" if self.comment else ""
            self.comment += separator + line
            # The pattern is applied to the whole accumulated comment.
            sid_match = re.match(r'# S-ID: ?(\S*)( .+)?$', self.comment)
            if sid_match:
                self.sid = sid_match.group(1)
            if 'KNP++' in line and 'output:KNP' not in line:
                self.juman_format = JUMAN_FORMAT.LATTICE_TOP_ONE  # TODO
            continue

        if re.match(self.pattern, line):
            break
        if line.startswith(';;'):
            raise Exception("Error: %s" % line)

        if line.startswith('*'):
            self._bnst.append(Bunsetsu(line, len(self._bnst)))
        elif line.startswith('+'):
            # In non-default formats every tag line also opens a bunsetsu.
            if self.juman_format != JUMAN_FORMAT.DEFAULT:  # TODO
                self._bnst.append(
                    Bunsetsu(line, len(self._bnst), self.juman_format))
            self._bnst[-1].push_tag(
                Tag(line, len(self.tag_list()), self.juman_format))
        elif line.startswith('!!'):
            syn_group = SynNodes(line)
            self._bnst[-1].tag_list().push_synnodes(syn_group)
        elif line.startswith('!') and not line.startswith('! ! !'):
            syn_single = SynNode(line)
            self._bnst[-1].tag_list().push_synnode(syn_single)
        elif not line.startswith('EOS'):
            # Anything else is a morpheme line.
            mrph = Morpheme(line, len(self.mrph_list()), self.juman_format)
            if not self._bnst:
                # No bunsetsu seen yet: create a placeholder to hold it.
                self._bnst.append(Bunsetsu("*", len(self._bnst)))
            self._bnst[-1].push_mrph(mrph)
def _parse_spec(self, spec):
    """Parse KNP output text and populate this object's state.

    Each non-blank line is dispatched on its leading marker: ``#<TAB>``
    (PAS info), ``#`` (comment / sentence ID), ``*`` (bunsetsu), ``+``
    (tag), ``!!`` / ``!`` (synonym nodes), ``EOS`` (ignored); any other
    line is treated as a morpheme. A line matching ``self.pattern`` stops
    parsing.

    Args:
        spec (str): KNP output.

    Raises:
        SystemExit: With status 1, after writing the offending line to
            stderr, when a line starts with ``;;``.
    """
    for string in spec.split('\n'):
        if not string.strip():
            continue
        if string.startswith('#\t'):
            items = string.split("\t")
            if len(items) >= 3 and items[1] == "PAS":
                self._pinfos.append(items[2])
        elif string.startswith('#'):
            # Comment lines are accumulated, each terminated by a newline.
            self.comment += string
            self.comment += "\n"
            # The pattern is applied to the whole accumulated comment; the
            # appended newline is what terminates an ID with no trailing
            # space ('$' inside the character class is a literal dollar).
            match = re.match(r'# S-ID:(.*?)[ $\n]', self.comment)
            if match:
                self.sid = match.group(1)
            if 'KNP++' in string:
                self.newstyle = True
        elif re.match(self.pattern, string):
            break
        elif string.startswith(';;'):
            sys.stderr.write("Error: %s\n" % string)
            # sys.exit raises the same SystemExit(1) as quit(1) did, but
            # does not depend on the site module having installed quit().
            sys.exit(1)
        elif string.startswith('*'):
            bnst = Bunsetsu(string, len(self._bnst))
            self._bnst.append(bnst)
        elif string.startswith('+'):
            # In the new-style format every tag line also opens a bunsetsu.
            if self.newstyle:
                bnst = Bunsetsu(string, len(self._bnst), self.newstyle)
                self._bnst.append(bnst)
            self._bnst[-1].push_tag(
                Tag(string, len(self.tag_list()), self.newstyle))
        elif string.startswith('!!'):
            synnodes = SynNodes(string)
            self._bnst[-1].tag_list().push_synnodes(synnodes)
        elif string.startswith('!') and not string.startswith('! ! !'):
            synnode = SynNode(string)
            self._bnst[-1].tag_list().push_synnode(synnode)
        elif string.startswith('EOS'):
            pass
        else:
            # Anything else is a morpheme line.
            mrph = Morpheme(string, len(self.mrph_list()), self.newstyle)
            if not self._bnst:
                # No bunsetsu seen yet: create a placeholder to hold it.
                bnst = Bunsetsu("*", len(self._bnst))
                self._bnst.append(bnst)
            self._bnst[-1].push_mrph(mrph)
def __init__(
    self,
    tag: Tag,
    dmid_offset: int,
    dtid: int,
    sid: str,
    doc_id: str,
    parent: Optional['BasePhrase'] = None,
    children: Optional[List['BasePhrase']] = None,
):
    """Create a base phrase from a KNP tag and its document-level IDs.

    Args:
        tag (Tag): KNP base-phrase (tag) object.
        dmid_offset (int): Document-level morpheme ID given to the first
            morpheme of this base phrase; later morphemes are numbered
            consecutively from it.
        dtid (int): Document-level base-phrase ID.
        sid (str): ID of the sentence containing this base phrase.
        doc_id (str): ID of the document containing this base phrase.
        parent (Optional[BasePhrase]): Dependency head.
        children (List[BasePhrase]): Dependents; a new empty list is used
            when omitted.
    """
    self.tag: Tag = tag
    self.dtid: int = dtid
    self.sid: str = sid
    self.doc_id: str = doc_id

    # Number the morphemes of this tag consecutively from dmid_offset.
    mrph2dmid: Dict[Morpheme, int] = {}
    for shift, mrph in enumerate(tag.mrph_list()):
        mrph2dmid[mrph] = dmid_offset + shift
    self._mrph2dmid: Dict[Morpheme, int] = mrph2dmid

    self.content: Morpheme = self._get_content_word()
    self.content_dmid: int = self._mrph2dmid[self.content]
    self.parent: Optional['BasePhrase'] = parent
    if children is not None:
        self.children: List['BasePhrase'] = children
    else:
        self.children = []