class Tag(object):
    """Hold the information of one tag (basic phrase), the unit of case analysis.

    Args:
        spec: The tag line of KNP output. Either a bare ``+``, a line of the
            form ``+ <parent_id><dpndtype> <features>``, or (when ``newstyle``
            is true) a tab-separated record.
        tag_id: Identifier stored on the instance as ``tag_id``.
        newstyle: When true, ``spec`` is parsed as tab-separated fields.

    Attributes:
        tag_id: Value passed as ``tag_id``.
        parent_id: Parent tag id parsed from ``spec`` (-1 when not given).
        parent: Initialised to None; not assigned anywhere in this class.
        children: Initialised to an empty list.
        dpndtype: Dependency type parsed from ``spec``.
        fstring: Feature string parsed from ``spec``.
        features: ``Features`` built from ``fstring``; stays None for a bare
            ``+`` spec combined with ``newstyle``.
        repname: Normalized representative notation, '' when unavailable.
        synnodes: Initialised to an empty list.
    """

    def __init__(self, spec, tag_id=0, newstyle=False):
        self._mrph_list = MList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self.features = None
        self._pstring = ''
        self.tag_id = tag_id
        self.synnodes = []
        # Give repname a value on every path: a bare '+' spec combined with
        # newstyle used to leave the attribute undefined.
        self.repname = ''

        spec = spec.strip()
        if spec == '+':
            pass
        elif newstyle:
            # Tab-separated record; only fields 2, 3, 6 and 17 are consumed.
            items = spec.split(u"\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
            self.features = Features(self.fstring, u"|", False)
        else:
            # Evaluate the pattern once instead of matching twice.
            match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
            if match is None:
                sys.stderr.write("Illegal tag spec: %s\n" % spec)
                # sys.exit raises the same SystemExit as quit(), but does not
                # depend on the site module having installed the builtin.
                sys.exit(1)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Extract the normalized representative notation from the features.
        if not newstyle:
            self.features = Features(self.fstring)
            rep = self.features.get(u"正規化代表表記")
            if rep is not None:
                self.repname = rep

    def push_mrph(self, mrph):
        """Append a morpheme object to this tag's morpheme list."""
        self._mrph_list.push_mrph(mrph)

    def spec(self):
        """Return the KNP-style text for this tag followed by its morphemes' spec."""
        return "+ %d%s %s\n%s" % (self.parent_id, self.dpndtype,
                                  self.fstring, self._mrph_list.spec())

    def mrph_list(self):
        """Return the morpheme list object of this tag."""
        return self._mrph_list

    def pstring(self, string=None):
        """Get or set the stored display string.

        A truthy ``string`` is stored and None is returned; otherwise the
        currently stored string is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring

    def get_surface(self):
        """Return the concatenation of the ``midasi`` of every morpheme."""
        return ''.join([mrph.midasi for mrph in self.mrph_list()])
class Bunsetsu(object):
    """Hold the information of one bunsetsu, the unit of KNP dependency analysis.

    Args:
        spec (str): The part of KNP output that corresponds to the bunsetsu.
        bnst_id (int): Bunsetsu ID.
        newstyle (bool): KNP format variant (False for the public KNP release).

    Attributes:
        bnst_id (int): Bunsetsu ID.
        midasi (str): Surface string; filled in by ``set_midasi``.
        parent (Bunsetsu): Parent bunsetsu object (initialised to None).
        parent_id (int): Parent bunsetsu ID (-1 when not given).
        children (list): Child bunsetsu objects (initialised empty).
        repname (str): Normalized representative notation.
        normalized_repname (str): Normalized representative notation.
        head_repname (str): Head representative notation.
        head_prime_repname (str): Head-prime representative notation.
        fstring (str): Feature string.
    """

    def __init__(self, spec, bnst_id=0, newstyle=False):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Define every documented repname attribute up front; the newstyle
        # path used to leave all but ``repname`` (and, for a bare '*', even
        # that one) undefined.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            # Tab-separated record; only fields 2, 3, 6 and 17 are consumed.
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Evaluate the pattern once instead of matching twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                raise Exception("Illegal bunsetsu spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        self.features = Features(self.fstring)

        # Extract the representative notations from the features.
        if not newstyle:
            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname

    def push_mrph(self, mrph):
        """Add a new morpheme object (also to the last tag, if any)."""
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """Add a new basic-phrase (tag) object.

        Raises:
            Exception: If morphemes were already added while no tag exists.
        """
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            raise Exception("Unsafe addition of tags!")
        self._tag_list.push_tag(tag)

    def set_midasi(self):
        """Set ``midasi`` on every tag and then on this bunsetsu."""
        # Index-based access mirrors how the tag list is used elsewhere in
        # this class (only len() and [] are relied upon).
        for i in range(len(self._tag_list)):
            self._tag_list[i].set_midasi()
        self.midasi = ''.join(mrph.midasi for mrph in self.mrph_list())

    def spec(self):
        """Return the KNP output text corresponding to this bunsetsu."""
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype,
                                  self.fstring, self._tag_list.spec())

    def mrph_list(self):
        """Return all morpheme objects that make up this bunsetsu.

        Returns:
            list: Morpheme objects.
        """
        return self._mrph_list

    def tag_list(self):
        """Return all basic-phrase objects that make up this bunsetsu.

        Returns:
            list: Tag objects.
        """
        return self._tag_list

    def pstring(self, string=None):
        """Get or set the string printed on the right side by draw_tree.

        A truthy ``string`` is stored and None is returned; otherwise the
        currently stored string is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring
class Tag(object):
    """Hold the information of one basic phrase (tag) parsed from KNP output.

    Args:
        spec (str): KNP output line for the tag.
        tag_id (int): Basic-phrase ID.
        juman_format (JUMAN_FORMAT): Juman lattice output format.

    Attributes:
        tag_id (int): Basic-phrase ID.
        midasi (str): Surface string; filled in by ``set_midasi``.
        parent (Tag): Parent basic-phrase object (initialised to None).
        parent_id (int): Parent basic-phrase ID (-1 when not given).
        children (list): Child basic-phrase objects (initialised empty).
        dpndtype (str): Dependency type.
        fstring (str): Feature string.
        repname (str): Normalized representative notation.
        normalized_repname (str): Normalized representative notation.
        head_repname (str): Head representative notation.
        head_prime_repname (str): Head-prime representative notation.
        pred_repname (str): Predicate representative notation.
        disambiguated_pred_repname (str): Standard predicate representative
            notation.
        features (Features): Features of the basic phrase; stays None for a
            bare ``+`` spec in a non-default format.
        pas (Pas): Initialised to None here; per the original documentation
            it holds argument information when the tag is a predicate.
    """

    def __init__(self, spec, tag_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        self._mrph_list = MList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self.features = None
        self._pstring = ''
        self.tag_id = tag_id
        self.pas = None
        self.synnodes = []
        # Define every documented repname attribute up front; the
        # non-default format path used to leave most of them undefined.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''
        self.pred_repname = ''
        self.disambiguated_pred_repname = ''

        spec = spec.strip()
        if spec == '+':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT:
            # Tab-separated record; only fields 2, 3, 6 and 17 are consumed.
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
            self.features = Features(self.fstring, "|", False)
            self.features._tag = self
        else:
            # Evaluate the pattern once instead of matching twice.
            match = re.match(r'\+ (-?\d+)(\w)(.*)$', spec)
            if match is None:
                raise Exception("Illegal tag spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Extract the representative notations from the features.
        if juman_format == JUMAN_FORMAT.DEFAULT:
            self.features = Features(self.fstring)
            self.features._tag = self
            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname is not None:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname is not None:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            # Truthiness test (not "is not None") kept as in the original.
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname
            pred_repname = self.features.get("用言代表表記")
            if pred_repname is not None:
                self.pred_repname = pred_repname
            disambiguated_pred_repname = self.features.get("標準用言代表表記")
            if disambiguated_pred_repname is not None:
                self.disambiguated_pred_repname = disambiguated_pred_repname

    def push_mrph(self, mrph):
        """Add a new morpheme object."""
        self._mrph_list.push_mrph(mrph)

    def set_midasi(self):
        """Set ``midasi`` to the surface string of this basic phrase."""
        self.midasi = self.get_surface()

    def spec(self):
        """Return the KNP output text corresponding to this basic phrase."""
        return "+ %d%s %s\n%s" % (self.parent_id, self.dpndtype,
                                  self.fstring, self._mrph_list.spec())

    def mrph_list(self):
        """Return all morpheme objects that make up this basic phrase.

        Returns:
            list: Morpheme objects.
        """
        return self._mrph_list

    def pstring(self, string=None):
        """Get or set the string printed on the right side by draw_tree.

        A truthy ``string`` is stored and None is returned; otherwise the
        currently stored string is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring

    def get_surface(self):
        """Return the surface string of this basic phrase.

        Returns:
            str: Concatenation of the ``midasi`` of every morpheme.
        """
        return ''.join(mrph.midasi for mrph in self.mrph_list())
class Bunsetsu(object):
    """Hold the information of one bunsetsu, the unit of KNP dependency analysis.

    Args:
        spec: The bunsetsu line of KNP output. Either a bare ``*``, a line of
            the form ``* <parent_id><dpndtype> <features>``, or (when
            ``newstyle`` is true) a tab-separated record.
        bnst_id: Identifier stored on the instance as ``bnst_id``.
        newstyle: When true, ``spec`` is parsed as tab-separated fields.

    Attributes:
        repname: Normalized representative notation, '' when unavailable.
        hrepname: Head representative notation, '' when unavailable.
        hprepname: Head-prime representative notation, '' when unavailable.
    """

    def __init__(self, spec, bnst_id=0, newstyle=False):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Define the repname attributes on every path; the newstyle path used
        # to leave hrepname/hprepname (and, for a bare '*', repname) undefined.
        self.repname = ''
        self.hrepname = ''
        self.hprepname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            # Tab-separated record; only fields 2, 3, 6 and 17 are consumed.
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Evaluate the pattern once instead of matching twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                sys.stderr.write("Illegal bunsetsu spec: %s\n" % spec)
                # sys.exit raises the same SystemExit as quit(), but does not
                # depend on the site module having installed the builtin.
                sys.exit(1)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Extract the representative notations from the feature string.
        if not newstyle:
            match = re.search(r"<正規化代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.repname = match.group(1)
            match = re.search(r"<主辞代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.hrepname = match.group(1)
            match = re.search(r"<主辞’代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.hprepname = match.group(1)

    def push_mrph(self, mrph):
        """Append a morpheme (also to the last tag, if any tag exists)."""
        if self._tag_list:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """Append a tag; exit with status 1 if morphemes exist but no tag does."""
        if not self._tag_list and self._mrph_list:
            sys.stderr.write("Unsafe addition of tags!\n")
            sys.exit(1)
        self._tag_list.push_tag(tag)

    def spec(self):
        """Return the KNP-style text for this bunsetsu followed by its tags' spec."""
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype,
                                  self.fstring, self._tag_list.spec())

    def mrph_list(self):
        """Return the morpheme list object of this bunsetsu."""
        return self._mrph_list

    def tag_list(self):
        """Return the tag list object of this bunsetsu."""
        return self._tag_list

    def pstring(self, string=None):
        """Get or set the stored display string.

        A truthy ``string`` is stored and None is returned; otherwise the
        currently stored string is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring

    def bnst_head(self):
        """Return the head tag of this bunsetsu.

        The only tag is returned when there is exactly one; otherwise the
        first tag whose features do not contain '文節内'. Returns None when
        no tag qualifies.
        """
        tags = self.tag_list()
        if len(tags) == 1:
            return tags[0]
        for tag in tags:
            if '文節内' not in tag.features:
                return tag
        return None

    def recursive_children(self):
        """Return all descendants, each child listed after its own descendants."""
        def collect(bnst, found):
            # Sanity check against a bunsetsu listed as its own child, which
            # would otherwise recurse forever.
            assert bnst not in bnst.children
            for child in bnst.children:
                collect(child, found)
                found.append(child)
            return found

        return collect(self, [])

    def recursive_adnominals(self):
        """Return each child whose fstring contains '<連体修飾>', preceded by its descendants."""
        modifiers = []
        for child in self.children:
            if '<連体修飾>' in child.fstring:
                modifiers.extend(child.recursive_children())
                modifiers.append(child)
        return modifiers
class Bunsetsu(object):
    """Hold the information of one bunsetsu, the unit of KNP dependency analysis.

    Args:
        spec (str): The part of KNP output that corresponds to the bunsetsu.
        bnst_id (int): Bunsetsu ID.
        juman_format (JUMAN_FORMAT): Juman lattice output format.

    Attributes:
        bnst_id (int): Bunsetsu ID.
        midasi (str): Surface string; filled in by ``set_midasi``.
        parent (Bunsetsu): Parent bunsetsu object (initialised to None).
        parent_id (int): Parent bunsetsu ID (-1 when not given).
        children (list): Child bunsetsu objects (initialised empty).
        repname (str): Normalized representative notation.
        normalized_repname (str): Normalized representative notation.
        head_repname (str): Head representative notation.
        head_prime_repname (str): Head-prime representative notation.
        fstring (str): Feature string.
    """

    def __init__(self, spec, bnst_id=0, juman_format=JUMAN_FORMAT.DEFAULT):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.midasi = ''
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Define every documented repname attribute up front; the
        # non-default format path used to leave all but ``repname`` (and,
        # for a bare '*', even that one) undefined.
        self.repname = ''
        self.normalized_repname = ''
        self.head_repname = ''
        self.head_prime_repname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif juman_format != JUMAN_FORMAT.DEFAULT:
            # TODO (carried over from the original code, no details given).
            # Tab-separated record; only fields 2, 3, 6 and 17 are consumed.
            items = spec.split("\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Evaluate the pattern once instead of matching twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                raise Exception("Illegal bunsetsu spec: %s" % spec)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()
        self.features = Features(self.fstring)

        # Extract the representative notations from the features.
        if juman_format == JUMAN_FORMAT.DEFAULT:
            normalized_repname = self.features.get("正規化代表表記")
            if normalized_repname:
                self.repname = normalized_repname
                self.normalized_repname = normalized_repname
            head_repname = self.features.get("主辞代表表記")
            if head_repname:
                self.head_repname = head_repname
            head_prime_repname = self.features.get("主辞’代表表記")
            if head_prime_repname:
                self.head_prime_repname = head_prime_repname

    def push_mrph(self, mrph):
        """Add a new morpheme object (also to the last tag, if any)."""
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """Add a new basic-phrase (tag) object.

        Raises:
            Exception: If morphemes were already added while no tag exists.
        """
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            raise Exception("Unsafe addition of tags!")
        self._tag_list.push_tag(tag)

    def set_midasi(self):
        """Set ``midasi`` on every tag and then on this bunsetsu."""
        # Index-based access mirrors how the tag list is used elsewhere in
        # this class (only len() and [] are relied upon).
        for i in range(len(self._tag_list)):
            self._tag_list[i].set_midasi()
        self.midasi = ''.join(mrph.midasi for mrph in self.mrph_list())

    def spec(self):
        """Return the KNP output text corresponding to this bunsetsu."""
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype,
                                  self.fstring, self._tag_list.spec())

    def mrph_list(self):
        """Return all morpheme objects that make up this bunsetsu.

        Returns:
            list: Morpheme objects.
        """
        return self._mrph_list

    def tag_list(self):
        """Return all basic-phrase objects that make up this bunsetsu.

        Returns:
            list: Tag objects.
        """
        return self._tag_list

    def pstring(self, string=None):
        """Get or set the string printed on the right side by draw_tree.

        A truthy ``string`` is stored and None is returned; otherwise the
        currently stored string is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring
class Bunsetsu(object):
    """Hold the information of one bunsetsu, the unit of KNP dependency analysis.

    Args:
        spec: The bunsetsu line of KNP output. Either a bare ``*``, a line of
            the form ``* <parent_id><dpndtype> <features>``, or (when
            ``newstyle`` is true) a tab-separated record.
        bnst_id: Identifier stored on the instance as ``bnst_id``.
        newstyle: When true, ``spec`` is parsed as tab-separated fields.

    Attributes:
        parent_id: Parent bunsetsu id parsed from ``spec`` (-1 when not given).
        dpndtype: Dependency type parsed from ``spec``.
        fstring: Feature string parsed from ``spec``.
        repname: Normalized representative notation, '' when unavailable.
    """

    def __init__(self, spec, bnst_id=0, newstyle=False):
        self._mrph_list = MList()
        self._tag_list = TList()
        self.parent_id = -1
        self.parent = None
        self.children = []
        self.dpndtype = ''
        self.fstring = ''
        self._pstring = ''
        self.bnst_id = bnst_id
        # Give repname a value on every path: a bare '*' spec combined with
        # newstyle used to leave the attribute undefined.
        self.repname = ''

        spec = spec.strip()
        if spec == '*':
            pass
        elif newstyle:
            # Tab-separated record; only fields 2, 3, 6 and 17 are consumed.
            items = spec.split(u"\t")
            self.parent_id = int(items[2])
            self.dpndtype = items[3]
            self.fstring = items[17]
            self.repname = items[6]
        else:
            # Evaluate the pattern once instead of matching twice.
            match = re.match(r'\* (-?\d+)([DPIA])(.*)$', spec)
            if match is None:
                sys.stderr.write("Illegal bunsetsu spec: %s\n" % spec)
                # sys.exit raises the same SystemExit as quit(), but does not
                # depend on the site module having installed the builtin.
                sys.exit(1)
            self.parent_id = int(match.group(1))
            self.dpndtype = match.group(2)
            self.fstring = match.group(3).strip()

        # Extract the normalized representative notation from the feature string.
        if not newstyle:
            match = re.search(r"<正規化代表表記:([^\"\s]+?)>", self.fstring)
            if match:
                self.repname = match.group(1)

    def push_mrph(self, mrph):
        """Append a morpheme (also to the last tag, if any tag exists)."""
        if len(self._tag_list) > 0:
            self._tag_list[-1].push_mrph(mrph)
        self._mrph_list.push_mrph(mrph)

    def push_tag(self, tag):
        """Append a tag; exit with status 1 if morphemes exist but no tag does."""
        if len(self._tag_list) == 0 and len(self._mrph_list) > 0:
            sys.stderr.write("Unsafe addition of tags!\n")
            sys.exit(1)
        self._tag_list.push_tag(tag)

    def spec(self):
        """Return the KNP-style text for this bunsetsu followed by its tags' spec."""
        return "* %d%s %s\n%s" % (self.parent_id, self.dpndtype,
                                  self.fstring, self._tag_list.spec())

    def mrph_list(self):
        """Return the morpheme list object of this bunsetsu."""
        return self._mrph_list

    def tag_list(self):
        """Return the tag list object of this bunsetsu."""
        return self._tag_list

    def pstring(self, string=None):
        """Get or set the stored display string.

        A truthy ``string`` is stored and None is returned; otherwise the
        currently stored string is returned.
        """
        if string:
            self._pstring = string
        else:
            return self._pstring