Example #1
0
 def __call__(self, document: "Document", blist: BList) -> Sentence:
     sentence = Sentence(document, blist.sid, Builder.ssid, blist)
     start: Optional[Tag] = None
     end: Optional[Tag] = None
     head: Optional[Tag] = None
     for tag in blist.tag_list():
         if not start:
             start = tag
         if not head and "節-主辞" in tag.features:
             head = tag
         if not end and "節-区切" in tag.features:
             end = tag
             if head:
                 EventBuilder()(sentence, start, head, end)
             start, end, head = None, None, None
     document.sentences.append(sentence)
     Builder.ssid += 1
     for bid, bnst in enumerate(blist.bnst_list()):
         for tag in bnst.tag_list():
             Builder.stid_bid_map[(sentence.ssid, tag.tag_id)] = bid
             Builder.stid_tag_map[(sentence.ssid, tag.tag_id)] = tag
     return sentence
Example #2
0
class Sentence:
    """ KWDLC(または Kyoto Corpus)の1文書を扱うクラス

    Attributes:
        blist (BList): KNPのBListオブジェクト
        doc_id (str): 文書ID
        bps (List[BasePhrase]): 含まれる基本句のリスト
    """
    def __init__(
        self,
        knp_string: str,
        dtid_offset: int,
        dmid_offset: int,
        doc_id: str,
    ) -> None:
        """

        Args:
            knp_string(str): 1文についてのKNPのtab出力
            dtid_offset (int): 文書中でこの文が始まるまでの文書レベル基本句ID
            dmid_offset (int): 文書中でこの文が始まるまでの文書レベル形態素ID
            doc_id(str): 文書ID
        """

        self.blist = BList(knp_string)
        self.doc_id: str = doc_id

        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)

        self._mrph2dmid: Dict[Morpheme, int] = dict(
            ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])

    @property
    def sid(self) -> str:
        """文ID"""
        return self.blist.sid

    @property
    def dtids(self) -> List[int]:
        return [bp.dtid for bp in self.bps]

    @property
    def mrph2dmid(self) -> Dict[Morpheme, int]:
        """形態素とその文書レベルIDを紐付ける辞書"""
        return self._mrph2dmid

    @property
    def surf(self) -> str:
        """表層表現"""
        return ''.join(bp.surf for bp in self.bps)

    def bnst_list(self):
        return self.blist.bnst_list()

    def tag_list(self):
        return self.blist.tag_list()

    def mrph_list(self):
        return self.blist.mrph_list()

    def __len__(self) -> int:
        """含まれる基本句の数"""
        return len(self.bps)

    def __getitem__(self, tid: int) -> Optional[BasePhrase]:
        if 0 <= tid < len(self):
            return self.bps[tid]
        else:
            logger.error(f'base phrase: {tid} out of range')
            return None

    def __iter__(self) -> Iterator[BasePhrase]:
        return iter(self.bps)

    def __eq__(self, other: 'Sentence') -> bool:
        return self.sid == other.sid

    def __str__(self) -> str:
        return self.surf

    def __repr__(self) -> str:
        return f'Sentence(\'{self.surf}\', sid: {self.sid})'
Example #3
0
class Sentence:
    """A class to represent a single sentence.

    Attributes:
        blist (BList): BList object of pyknp.
        doc_id (str): The document ID of this sentence.
        bps (List[BasePhrase]): Base phrases in this sentence.
    """
    def __init__(
        self,
        knp_string: str,
        dtid_offset: int,
        dmid_offset: int,
        doc_id: str,
    ) -> None:
        """

        Args:
            knp_string(str): KNP format string of this sentence.
            dtid_offset (int): The document-wide tag ID of the previous base phrase.
            dmid_offset (int): The document-wide morpheme ID of the previous morpheme.
            doc_id(str): The document ID of this sentence.
        """

        self.blist = BList(knp_string)
        self.doc_id: str = doc_id

        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)

        self._mrph2dmid: Dict[Morpheme, int] = dict(
            ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])

    @property
    def sid(self) -> str:
        """A sentence ID."""
        return self.blist.sid

    @property
    def dtids(self) -> List[int]:
        """A document-wide tag ID."""
        return [bp.dtid for bp in self.bps]

    @property
    def mrph2dmid(self) -> Dict[Morpheme, int]:
        """A mapping from morpheme to its document-wide ID."""
        return self._mrph2dmid

    @property
    def surf(self) -> str:
        """A surface expression"""
        return ''.join(bp.surf for bp in self.bps)

    def bnst_list(self):
        """Return list of Bunsetsu object in pyknp."""
        return self.blist.bnst_list()

    def tag_list(self):
        """Return list of Tag object in pyknp."""
        return self.blist.tag_list()

    def mrph_list(self):
        """Return list of Morpheme object in pyknp."""
        return self.blist.mrph_list()

    def __len__(self) -> int:
        """Number of base phrases in this sentence"""
        return len(self.bps)

    def __getitem__(self, tid: int) -> Optional[BasePhrase]:
        if 0 <= tid < len(self):
            return self.bps[tid]
        else:
            logger.error(f'base phrase: {tid} out of range')
            return None

    def __iter__(self) -> Iterator[BasePhrase]:
        return iter(self.bps)

    def __eq__(self, other: 'Sentence') -> bool:
        return self.sid == other.sid

    def __str__(self) -> str:
        return self.surf

    def __repr__(self) -> str:
        return f'Sentence(\'{self.surf}\', sid: {self.sid})'