Ejemplo n.º 1
0
 def test(self):
     """Check TList length bookkeeping as tags and morphemes are pushed."""
     tags = TList()
     first_tag = Tag(u"+ 1D <BGH:構文/こうぶん><文節内><係:文節内><文頭>"
                     u"<体言><名詞項候補><先行詞候補><正規化代表表記:構文/こうぶん>")
     first_mrph = Morpheme(u"構文 こうぶん 構文 名詞 6 普通名詞 1 * 0 * 0 \""
                           u"代表表記:構文/こうぶん カテゴリ:抽象物\" "
                           u"<代表表記:構文/こうぶん>")
     second_tag = Tag(u"+ -1D <BGH:解析/かいせき><文末><体言><用言:判>" u"<体言止><レベル:C>")
     second_mrph = Morpheme(u"解析 かいせき 解析 名詞 6 サ変名詞 2 * 0 * 0 \""
                            u"代表表記:解析/かいせき カテゴリ:抽象物 ドメイン:教育・学習;"
                            u"科学・技術\" <代表表記:解析/かいせき>")

     # Step 1: a tag that already owns a morpheme is appended to the list.
     first_tag.push_mrph(first_mrph)
     tags.push_tag(first_tag)
     self.assertEqual(len(tags), 1)
     self.assertEqual(len(tags[0].mrph_list()), 1)

     # Step 2: a tag with no morphemes is appended and stays empty.
     tags.push_tag(second_tag)
     self.assertEqual(len(tags), 2)
     self.assertEqual(len(tags[1].mrph_list()), 0)

     # Step 3: a morpheme pushed onto the list ends up in the second tag
     # and leaves the first tag untouched.
     tags.push_mrph(second_mrph)
     self.assertEqual(len(tags), 2)
     self.assertEqual(len(tags[0].mrph_list()), 1)
     self.assertEqual(len(tags[1].mrph_list()), 1)
Ejemplo n.º 2
0
 def test(self):
     """Verify tag and morpheme counts of a TList after each push."""
     tag_list = TList()
     head_tag = Tag("+ 1D <BGH:構文/こうぶん><文節内><係:文節内><文頭>"
                    "<体言><名詞項候補><先行詞候補><正規化代表表記:構文/こうぶん>")
     head_mrph = Morpheme("構文 こうぶん 構文 名詞 6 普通名詞 1 * 0 * 0 \""
                          "代表表記:構文/こうぶん カテゴリ:抽象物\" "
                          "<代表表記:構文/こうぶん>")
     tail_tag = Tag("+ -1D <BGH:解析/かいせき><文末><体言><用言:判>"
                    "<体言止><レベル:C>")
     tail_mrph = Morpheme("解析 かいせき 解析 名詞 6 サ変名詞 2 * 0 * 0 \""
                          "代表表記:解析/かいせき カテゴリ:抽象物 ドメイン:教育・学習;"
                          "科学・技術\" <代表表記:解析/かいせき>")

     # Push a tag that already holds one morpheme.
     head_tag.push_mrph(head_mrph)
     tag_list.push_tag(head_tag)
     self.assertEqual(len(tag_list), 1)
     self.assertEqual(len(tag_list[0].mrph_list()), 1)

     # Push a tag that holds no morpheme yet.
     tag_list.push_tag(tail_tag)
     self.assertEqual(len(tag_list), 2)
     self.assertEqual(len(tag_list[1].mrph_list()), 0)

     # Push a morpheme through the list; only the second tag should grow.
     tag_list.push_mrph(tail_mrph)
     self.assertEqual(len(tag_list), 2)
     self.assertEqual(len(tag_list[0].mrph_list()), 1)
     self.assertEqual(len(tag_list[1].mrph_list()), 1)
Ejemplo n.º 3
0
 def parse_keigo(tag: Tag) -> str:
     """Summarise the honorific information attached to a tag.

     Args:
         tag: Object exposing a ``features`` mapping and ``mrph_list()``;
             each morpheme must expose an ``fstring`` of ``<...>`` features.

     Returns:
         ``""`` when the tag has no ``敬語`` feature. Otherwise
         ``"<keigo>:<sym>"`` where ``keigo`` is the value of the ``敬語``
         feature and ``sym`` is the text following the first colon of the
         last matching ``尊敬動詞:`` / ``謙譲動詞:`` morpheme feature
         (empty when there is none, or when ``keigo`` is another type).
     """
     if "敬語" not in tag.features:
         return ""
     keigo = tag.features["敬語"]

     # Morpheme-level feature prefix that carries the verb for each keigo
     # type. Both prefixes include the colon so that a bare or merely
     # similarly-named feature can neither match nor break the split below.
     verb_prefixes = {
         "尊敬表現": "尊敬動詞:",
         "謙譲表現": "謙譲動詞:",
     }
     prefix = verb_prefixes.get(keigo)

     sym = ""
     if prefix is not None:
         for m in tag.mrph_list():
             for r in re.findall(r"<([^<>]+)>", m.fstring):
                 if r.startswith(prefix):
                     # Later matches overwrite earlier ones (last one wins).
                     sym = r.split(":")[1]
     return f"{keigo}:{sym}"
Ejemplo n.º 4
0
    def __init__(self,
                 tag: Tag,
                 dmid_offset: int,
                 dtid: int,
                 sid: str,
                 doc_id: str,
                 parent: Optional['BasePhrase'] = None,
                 children: Optional[List['BasePhrase']] = None,
                 ):
        """Wrap a tag and number its morphemes with document-wide IDs.

        Args:
            tag (Tag): Tag object in pyknp.
            dmid_offset (int): Document-wide morpheme ID given to the first
                morpheme of ``tag``; following morphemes count up from it.
            dtid (int): Document-wide tag ID.
            sid (str): Sentence ID.
            doc_id (str): Document ID.
            parent (Optional[BasePhrase]): Dependency parent.
            children (Optional[List[BasePhrase]]): Dependency children; a new
                empty list is used when omitted.
        """
        self.tag: Tag = tag
        self.dtid: int = dtid
        self.sid: str = sid
        self.doc_id: str = doc_id

        # Consecutive document-wide IDs, in the order mrph_list() yields them.
        self._mrph2dmid: Dict[Morpheme, int] = {
            mrph: dmid
            for dmid, mrph in enumerate(tag.mrph_list(), start=dmid_offset)
        }

        self.content: Morpheme = self._get_content_word()
        self.content_dmid: int = self._mrph2dmid[self.content]
        self.parent: Optional['BasePhrase'] = parent
        if children is None:
            # Fresh list per instance so phrases never share a children list.
            children = []
        self.children: List['BasePhrase'] = children
Ejemplo n.º 5
0
 def test_spec(self):
     """Build a two-tag Bunsetsu from the fixture strings and compare spec()."""
     bnst = Bunsetsu(self.bunsetsu_str)
     fixture_pairs = (
         (self.tag1_str, self.mrph1_str),
         (self.tag2_str, self.mrph2_str),
     )
     # Each tag receives exactly one morpheme before being attached.
     for tag_str, mrph_str in fixture_pairs:
         tag = Tag(tag_str)
         mrph = Morpheme(mrph_str)
         tag.push_mrph(mrph)
         bnst.push_tag(tag)
     self.assertEqual(bnst.spec(), self.spec)
Ejemplo n.º 6
0
 def _find_mrph_span(name: str, mrph_list: List[Morpheme],
                     tag: Tag) -> Optional[range]:
     """Return the range of morphemes whose joined surface forms equal ``name``.

     One candidate end position is tried per morpheme in ``tag``, starting at
     ``len(mrph_list)`` and moving one step left each time. For each end
     position the surface forms (``midasi``) are accumulated right-to-left
     until they spell ``name``.

     Returns:
         ``range(start_mrph.mrph_id, end)`` for the first match, or ``None``
         when no candidate span spells ``name``.
     """
     total = len(mrph_list)
     n_candidates = len(tag.mrph_list())
     for end_mid in range(total, total - n_candidates, -1):
         surface = ''
         for start_mrph in reversed(mrph_list[:end_mid]):
             surface = start_mrph.midasi + surface
             if surface == name:
                 return range(start_mrph.mrph_id, end_mid)
     return None
Ejemplo n.º 7
0
    def parse(self, spec):
        """Read KNP output and fill this object from it.

        Lines are dispatched on their prefix, in this order: blank lines are
        skipped; ``#\\t`` lines contribute PAS info; other ``#`` lines are
        accumulated into ``self.comment``; a line matching ``self.pattern``
        stops parsing; ``;;`` lines raise; ``*`` opens a bunsetsu; ``+`` adds
        a tag; ``!!`` / ``!`` add syn-nodes; ``EOS`` is ignored; anything else
        is treated as a morpheme line.

        Args:
            spec (str): KNP output.

        Raises:
            Exception: When a line starts with ``;;``.
        """
        for line in spec.split('\n'):
            if not line.strip():
                continue

            if line.startswith('#\t'):
                fields = line.split("\t")
                if len(fields) >= 3 and fields[1] == "PAS":
                    self._pinfos.append(fields[2])
                continue

            if line.startswith('#'):
                # Comment lines are joined with newlines, without a trailing one.
                if self.comment:
                    self.comment += "\n"
                self.comment += line
                # The S-ID pattern is applied to the accumulated comment text.
                sid_match = re.match(r'# S-ID: ?(\S*)( .+)?$', self.comment)
                if sid_match:
                    self.sid = sid_match.group(1)
                if 'KNP++' in line and 'output:KNP' not in line:
                    self.juman_format = JUMAN_FORMAT.LATTICE_TOP_ONE  # TODO
                continue

            if re.match(self.pattern, line):
                break

            if line.startswith(';;'):
                raise Exception("Error: %s" % line)

            if line.startswith('*'):
                new_bnst = Bunsetsu(line, len(self._bnst))
                self._bnst.append(new_bnst)
                continue

            if line.startswith('+'):
                if self.juman_format != JUMAN_FORMAT.DEFAULT:  # TODO
                    # In non-default formats every tag line also opens a bunsetsu.
                    new_bnst = Bunsetsu(line, len(self._bnst), self.juman_format)
                    self._bnst.append(new_bnst)
                self._bnst[-1].push_tag(
                    Tag(line, len(self.tag_list()), self.juman_format))
                continue

            if line.startswith('!!'):
                syn_group = SynNodes(line)
                self._bnst[-1].tag_list().push_synnodes(syn_group)
                continue

            if line.startswith('!') and not line.startswith('! ! !'):
                syn_item = SynNode(line)
                self._bnst[-1].tag_list().push_synnode(syn_item)
                continue

            if line.startswith('EOS'):
                continue

            # Fallback: a morpheme line.
            mrph = Morpheme(line, len(self.mrph_list()),
                            self.juman_format)
            if not self._bnst:
                # Morpheme before any bunsetsu header: open a placeholder one.
                self._bnst.append(Bunsetsu("*", len(self._bnst)))
            self._bnst[-1].push_mrph(mrph)
Ejemplo n.º 8
0
 def test_spec(self):
     """Assemble a Bunsetsu from two single-morpheme tags and check spec()."""
     bunsetsu = Bunsetsu(self.bunsetsu_str)

     # First tag with its morpheme.
     first_tag = Tag(self.tag1_str)
     first_mrph = Morpheme(self.mrph1_str)
     first_tag.push_mrph(first_mrph)
     bunsetsu.push_tag(first_tag)

     # Second tag with its morpheme.
     second_tag = Tag(self.tag2_str)
     second_mrph = Morpheme(self.mrph2_str)
     second_tag.push_mrph(second_mrph)
     bunsetsu.push_tag(second_tag)

     actual = bunsetsu.spec()
     self.assertEqual(actual, self.spec)
Ejemplo n.º 9
0
 def _parse_spec(self, spec):
     """Parse KNP output text and populate this object from it.

     Lines are dispatched on their prefix; the order of the checks matters
     (``#\\t`` before ``#``, ``!!`` before ``!``).

     Args:
         spec (str): KNP output, one record per line.

     Raises:
         SystemExit: With exit status 1 when a line starts with ``;;``;
             the offending line is written to stderr first.
     """
     for string in spec.split('\n'):
         if string.strip() == "":
             continue
         if string.startswith('#\t'):
             # Tab-separated comment; only "PAS" records are kept.
             items = string.split("\t")
             if len(items) >= 3 and items[1] == "PAS":
                 self._pinfos.append(items[2])
         elif string.startswith('#'):
             # Ordinary comment lines are accumulated, each newline-terminated.
             self.comment += string
             self.comment += "\n"
             # NOTE: inside the character class `$` is a literal dollar sign;
             # an S-ID that ends the line is terminated by the "\n" appended
             # above.
             match = re.match(r'# S-ID:(.*?)[ $\n]', self.comment)
             if match:
                 self.sid = match.group(1)
             if 'KNP++' in string:
                 self.newstyle = True
         elif re.match(self.pattern, string):
             # End-of-record marker: stop reading.
             break
         elif string.startswith(';;'):
             sys.stderr.write("Error: %s\n" % string)
             # sys.exit() rather than quit(): quit() is injected by the
             # `site` module for interactive use and is not always defined.
             sys.exit(1)
         elif string.startswith('*'):
             bnst = Bunsetsu(string, len(self._bnst))
             self._bnst.append(bnst)
         elif string.startswith('+'):
             if self.newstyle:
                 # In the new style every tag line also opens a bunsetsu.
                 bnst = Bunsetsu(string, len(self._bnst), self.newstyle)
                 self._bnst.append(bnst)
             self._bnst[-1].push_tag(
                 Tag(string, len(self.tag_list()), self.newstyle))
         elif string.startswith('!!'):
             synnodes = SynNodes(string)
             self._bnst[-1].tag_list().push_synnodes(synnodes)
         elif string.startswith('!') and not string.startswith('! ! !'):
             synnode = SynNode(string)
             self._bnst[-1].tag_list().push_synnode(synnode)
         elif string.startswith('EOS'):
             pass
         else:
             # Anything else is a morpheme line.
             mrph = Morpheme(string, len(self.mrph_list()), self.newstyle)
             if not self._bnst:
                 # Morpheme before any bunsetsu header: open a placeholder one.
                 bnst = Bunsetsu("*", len(self._bnst))
                 self._bnst.append(bnst)
             self._bnst[-1].push_mrph(mrph)
Ejemplo n.º 10
0
    def __init__(
        self,
        tag: Tag,
        dmid_offset: int,
        dtid: int,
        sid: str,
        doc_id: str,
        parent: Optional['BasePhrase'] = None,
        children: Optional[List['BasePhrase']] = None,
    ):
        """Create a base phrase around a KNP tag.

        Args:
            tag (Tag): KNP base-phrase (tag) object.
            dmid_offset (int): Document-level morpheme ID assigned to the
                first morpheme of this base phrase; later morphemes are
                numbered consecutively from it.
            dtid (int): Document-level base-phrase ID.
            sid (str): ID of the sentence containing this base phrase.
            doc_id (str): ID of the document containing this base phrase.
            parent (Optional[BasePhrase]): Dependency head.
            children (Optional[List[BasePhrase]]): Dependents; defaults to a
                new empty list.
        """
        self.tag: Tag = tag
        self.dtid: int = dtid
        self.sid: str = sid
        self.doc_id: str = doc_id

        # Map every morpheme of the tag to its document-level ID.
        self._mrph2dmid: Dict[Morpheme, int] = {
            mrph: dmid_offset + position
            for position, mrph in enumerate(tag.mrph_list())
        }

        self.content: Morpheme = self._get_content_word()
        self.content_dmid: int = self._mrph2dmid[self.content]
        self.parent: Optional['BasePhrase'] = parent
        self.children: List['BasePhrase'] = (
            [] if children is None else children)