Example #1
0
    def __init__(self, spec, bnst_id=0, newstyle=False):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            items = spec.split(u"\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        elif re.match(r'\* (-?\d+)([DPIA])(.*)$', spec):
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        else:
            sys.stderr.write("Illegal bunsetsu spec: %s\n" % spec)
            quit(1)

        # Extract 正規化代表表記
        if not newstyle:
            self.repname = ''
            match = re.search(r"<正規化代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.repname = match.group(1)
Example #2
0
    def __init__(self, spec, bnst_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.prev = None
        self.next = None
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        spec = spec.strip()
        if spec == '*':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT:  # TODO
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        elif re.match(r'\* (-?\d+)([DPIA])(.*)$', spec):
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        else:
            raise Exception("Illegal bunsetsu spec: %s" % spec)
        self.features = Features(self.fstring)

        # Extract 正規化代表表記
        if juman_format == JUMAN_FORMAT.DEFAULT:
            self.repname = ''
            self.normalized_repname = ''
            self.head_repname = ''
            self.head_prime_repname = ''

            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname
Example #3
0
    def __init__(self, spec, bnst_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        spec = spec.strip()
        if spec == '*':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT: # TODO
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        elif re.match(r'\* (-?\d+)([DPIA])(.*)$', spec):
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        else:
            raise Exception("Illegal bunsetsu spec: %s" % spec)
        self.features = Features(self.fstring)

        # Extract 正規化代表表記
        if juman_format == JUMAN_FORMAT.DEFAULT:
            self.repname = ''
            self.normalized_repname = ''
            self.head_repname = ''
            self.head_prime_repname = ''

            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname
Example #4
0
 def draw_tag_tree(self, fh=None):
     """ タグ列の依存関係を木構造として表現して出力する. """
     tlist = TList()
     for tag in self.tag_list():
         tlist.push_tag(tag)
     tlist.draw_tree(fh=fh)
Example #5
0
class Bunsetsu(object):
    """
    KNP による係り受け解析の単位である文節の各種情報を保持するオブジェクト.

    Args:
        spec (str): KNP出力のうち文節に該当する箇所の文字列
        bnst_id (int): 文節ID
        newstyle (bool): KNPフォーマットの種類 (公開版KNPの場合はFalse)

    Attributes:
        bnst_id (int): 文節ID
        midasi (str): 見出し
        parent (Bunsetsu): 親の文節オブジェクト
        parent_id (int): 親の文節ID
        children (list): 子の文節オブジェクトのリスト
        repname (str): 正規化代表表記 (normalized_repnameに同じ)
        normalized_repname (str): 正規化代表表記
        head_repname (str): 主辞代表表記
        head_prime_repname (str): 主辞’代表表記
        fstring (str): feature情報
    """
    def __init__(self, spec, bnst_id=0, newstyle=False):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        elif re.match(r'\* (-?\d+)([DPIA])(.*)$', spec):
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        else:
            raise Exception("Illegal bunsetsu spec: %s" % spec)
        self.features = Features(self.fstring)

        # Extract 正規化代表表記
        if not newstyle:
            self.repname = ''
            self.normalized_repname = ''
            self.head_repname = ''
            self.head_prime_repname = ''

            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname

    def push_mrph(self, mrph):
        """ 新しい形態素オブジェクトをセットする """
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """ 新しい基本句オブジェクトをセットする """
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            raise Exception("Unsafe addition of tags!")
        self._tag_list.push_tag(tag)

    def set_midasi(self):
        """ midasiをセットする """
        for i in range(len(self._tag_list)):
            self._tag_list[i].set_midasi()
        self.midasi = ''.join(mrph.midasi for mrph in self.mrph_list())

    def spec(self):
        """ 文節に対応するKNP出力 """
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring,
                                  self._tag_list.spec())

    def mrph_list(self):
        """ 文節を構成する全形態素オブジェクトを返す

        Returns:
            list: 形態素オブジェクトMorphemeのリスト
        """
        return self._mrph_list

    def tag_list(self):
        """ 文節を構成する全基本句オブジェクトを返す

        Returns:
            list: 基本句オブジェクトTagのリスト
        """
        return self._tag_list

    def pstring(self, string=None):
        """ draw_treeしたときに右側に出力する文字列を返す """
        if string:
            self._pstring = string
        else:
            return self._pstring
Example #6
0
class Bunsetsu(object):
    """
    KNP による係り受け解析の単位である文節の各種情報を保持するオブジェクト.
    """
    def __init__(self, spec, bnst_id=0, newstyle=False):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        elif re.match(r'\* (-?\d+)([DPIA])(.*)$', spec):
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        else:
            sys.stderr.write("Illegal bunsetsu spec: %s\n" % spec)
            quit(1)

        # Extract 正規化代表表記
        if not newstyle:
            self.repname = ''
            match = re.search(r"<正規化代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.repname = match.group(1)
            self.hrepname = ''
            match = re.search(r"<主辞代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.hrepname = match.group(1)
            self.hprepname = ''
            match = re.search(r"<主辞’代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.hprepname = match.group(1)

    def push_mrph(self, mrph):
        if self._tag_list:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        if not self._tag_list and self._mrph_list:
            sys.stderr.write("Unsafe addition of tags!\n")
            quit(1)
        self._tag_list.push_tag(tag)

    def spec(self):
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype, self.fstring,
                                  self._tag_list.spec())

    def mrph_list(self):
        return self._mrph_list

    def tag_list(self):
        return self._tag_list

    def pstring(self, string=None):
        if string:
            self._pstring = string
        else:
            return self._pstring

    def bnst_head(self):
        if len(self.tag_list()) == 1:
            return self.tag_list()[0]
        for tag in self.tag_list():
            if '文節内' not in tag.features:
                return tag

    def recursive_children(self):
        def __recursive_children(bnst, bs):
            assert bnst not in bnst.children
            children = bnst.children
            if not children:
                return []
            for c in children:
                __recursive_children(c, bs)
                bs.append(c)
            return bs

        return __recursive_children(self, [])

    def recursive_adnominals(self):
        modifiers = []
        for c in self.children:
            if '<連体修飾>' in c.fstring:
                m_children = c.recursive_children()
                m_children.append(c)
                modifiers.extend(m_children)
        return modifiers
Example #7
0
 def draw_tag_tree(self, fh=None):
     """ タグ列の依存関係を木構造として表現して出力する. """
     tlist = TList()
     for tag in self.tag_list():
         tlist.push_tag(tag)
     tlist.draw_tree(fh=fh)
Example #8
0
class Bunsetsu(object):
    """
    KNP による係り受け解析の単位である文節の各種情報を保持するオブジェクト.

    Args:
        spec (str): KNP出力のうち文節に該当する箇所の文字列
        bnst_id (int): 文節ID
        juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

    Attributes:
        bnst_id (int): 文節ID
        midasi (str): 見出し
        parent (Bunsetsu): 親の文節オブジェクト
        parent_id (int): 親の文節ID
        children (list): 子の文節オブジェクトのリスト
        repname (str): 正規化代表表記 (normalized_repnameに同じ)
        normalized_repname (str): 正規化代表表記
        head_repname (str): 主辞代表表記
        head_prime_repname (str): 主辞’代表表記
        fstring (str): feature情報
    """

    def __init__(self, spec, bnst_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        spec = spec.strip()
        if spec == '*':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT: # TODO
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        elif re.match(r'\* (-?\d+)([DPIA])(.*)$', spec):
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        else:
            raise Exception("Illegal bunsetsu spec: %s" % spec)
        self.features = Features(self.fstring)

        # Extract 正規化代表表記
        if juman_format == JUMAN_FORMAT.DEFAULT:
            self.repname = ''
            self.normalized_repname = ''
            self.head_repname = ''
            self.head_prime_repname = ''

            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname


    def push_mrph(self, mrph):
        """ 新しい形態素オブジェクトをセットする """
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """ 新しい基本句オブジェクトをセットする """
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            raise Exception("Unsafe addition of tags!")
        self._tag_list.push_tag(tag)

    def set_midasi(self):
        """ midasiをセットする """
        for i in range(len(self._tag_list)):
            self._tag_list[i].set_midasi()
        self.midasi = ''.join(mrph.midasi for mrph in self.mrph_list())

    def spec(self):
        """ 文節に対応するKNP出力 """
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype,
                                  self.fstring, self._tag_list.spec())

    def mrph_list(self):
        """ 文節を構成する全形態素オブジェクトを返す

        Returns:
            list: 形態素オブジェクトMorphemeのリスト
        """
        return self._mrph_list

    def tag_list(self):
        """ 文節を構成する全基本句オブジェクトを返す

        Returns:
            list: 基本句オブジェクトTagのリスト
        """
        return self._tag_list

    def pstring(self, string=None):
        """ draw_treeしたときに右側に出力する文字列を返す """
        if string:
            self._pstring = string
        else:
            return self._pstring
Example #9
0
class Bunsetsu(object):
    """
    KNP による係り受け解析の単位である文節の各種情報を保持するオブジェクト.
    """

    def __init__(self, spec, bnst_id=0, newstyle=False):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            items = spec.split(u"\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        elif re.match(r'\* (-?\d+)([DPIA])(.*)$', spec):
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        else:
            sys.stderr.write("Illegal bunsetsu spec: %s\n" % spec)
            quit(1)

        # Extract 正規化代表表記
        if not newstyle:
            self.repname = ''
            match = re.search(r"<正規化代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.repname = match.group(1)

    def push_mrph(self, mrph):
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            sys.stderr.write("Unsafe addition of tags!\n")
            quit(1)
        self._tag_list.push_tag(tag)

    def spec(self):
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype,
                                  self.fstring, self._tag_list.spec())

    def mrph_list(self):
        return self._mrph_list

    def tag_list(self):
        return self._tag_list

    def pstring(self, string=None):
        if string:
            self._pstring = string
        else:
            return self._pstring
Example #10
0
def sprint_tag_tree(knp_result):
    tlist = TList()
    for tag in knp_result.tag_list():
        tlist.push_tag(tag)
    return tlist.sprint_tree()