from pyknp import KNP


def select_dependency_structure(line):
    """Extract the dependency structure of a sentence."""
    print("called select_dependency_structure()")
    knp = KNP(option='-tab -anaphora')
    # Parse the sentence
    result = knp.parse(line)
    # List of bunsetsu (phrases)
    bnst_list = result.bnst_list()
    # Index the bunsetsu list by bunsetsu id
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)
    tuples = []
    for bnst in bnst_list:
        if bnst.parent_id != -1:
            # (from, to) dependency pair
            tuples.append((select_normalization_representative_notation(bnst.fstring),
                           select_normalization_representative_notation(bnst_dic[bnst.parent_id].fstring)))
    return tuples
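# select_normalization_representative_notation() is called above but not
# defined in this snippet. A minimal sketch, assuming it extracts the
# 正規化代表表記 (normalized representative notation) feature from the KNP
# feature string; the exact regex and the fallback to the raw fstring are
# assumptions, not part of the original code.
import re


def select_normalization_representative_notation(fstring):
    # KNP feature strings look like "...<正規化代表表記:学校/がっこう>..."
    match = re.search(r'<正規化代表表記:([^>]+)>', fstring)
    return match.group(1) if match else fstring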
# Python 2 code: relies on raw_input() and explicit encode()/decode() calls.
import sys

from pyknp import Juman, KNP


class Solver(object):

    def __init__(self):
        self.juman = Juman()
        self.knp = KNP()

    def Q61(self):
        u"""61. Read a sentence from standard input and segment it into words
        (insert a space between morphemes).
        """
        input_sentence = raw_input()
        result = self.juman.analysis(input_sentence.decode("utf8"))
        for mrph in result.mrph_list():
            sys.stdout.write("{} ".format(mrph.midasi.encode("utf8")))
        sys.stdout.write("\n")
        return

    def Q62(self):
        u"""62. Read morphological analysis results and print only the nouns.
        Hint: check whether mrph.hinsi equals the string u"名詞".
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # Read input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # Analyse once a full sentence has been read
                result = self.juman.result(data)
                # Print only the nouns
                s = ",".join(mrph.midasi for mrph in result.mrph_list()
                             if mrph.hinsi == u"名詞")
                if len(s) > 0:
                    print(s)
                data = u""

    def Q63(self):
        u"""63. Read morphological analysis results and print the base forms
        of the verbs.
        Hint: check whether mrph.hinsi equals the string u"動詞".
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # Read input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # Analyse once a full sentence has been read
                result = self.juman.result(data)
                # Print only the verbs
                s = ",".join(mrph.genkei for mrph in result.mrph_list()
                             if mrph.hinsi == u"動詞")
                if len(s) > 0:
                    print(s)
                data = u""

    def Q64(self):
        u"""64. Read morphological analysis results and list the base forms
        of the morphemes in order of frequency.
        Hint: use a dictionary and the sorted function.
        """
        data = u""
        hist = {}
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.juman.result(data)
                for mrph in result.mrph_list():
                    try:
                        hist[mrph.genkei] += 1
                    except KeyError:
                        hist[mrph.genkei] = 1
                data = u""
        for key, val in sorted(hist.items(), key=lambda t: t[1], reverse=True):
            print("{},{}".format(key.encode("utf8"), val))

    def Q65(self):
        u"""65. Read morphological analysis results and compute the ratio of
        predicates to the total number of morphemes. Here a predicate is a
        verb, an i-adjective (形容詞) or a na-adjective (形容動詞).
        """
        data = u""
        num = 0
        denom = 0
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.juman.result(data)
                if verbose:  # `verbose` and `logger` are assumed to be module-level
                    logger.info("denom: {}".format(denom))
                for mrph in result.mrph_list():
                    denom += 1
                    if mrph.hinsi == u"動詞":
                        num += 1
                        continue
                    # Juman tags both adjective types as 形容詞 and records the
                    # inflection class (イ形容詞/ナ形容詞) in katuyou1.
                    if mrph.hinsi == u"形容詞" and mrph.katuyou1.startswith(u"イ形容詞"):
                        num += 1
                        continue
                    if mrph.hinsi == u"形容詞" and mrph.katuyou1.startswith(u"ナ形容詞"):
                        num += 1
                        continue
                data = u""
        print("{}/{}={}".format(num, denom, float(num) / denom))

    def Q66(self):
        u"""66. Read morphological analysis results and extract every
        "sa-hen noun + する/できる" pattern.
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.juman.result(data)
                buff = None
                for mrph in result.mrph_list():
                    if mrph.genkei == u"できる" or mrph.genkei == u"する":
                        if buff is not None:
                            extract.add((buff.genkei.encode("utf8"),
                                         mrph.genkei.encode("utf8")))
                    if mrph.bunrui == u"サ変名詞":
                        buff = mrph
                    else:
                        buff = None
                data = u""
        for t in extract:
            print("{}+{}".format(t[0], t[1]))

    def Q67(self):
        u"""67. Read morphological analysis results and print every "AのB"
        expression, where A and B are single noun morphemes.
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.juman.result(data)
                buff = []
                for mrph in result.mrph_list():
                    if mrph.genkei == u"の" and len(buff) == 1:
                        buff.append(u"の")
                        continue
                    if mrph.hinsi == u"名詞":
                        if len(buff) == 2:
                            extract.add((buff[0], mrph.genkei))
                        # The current noun starts the next candidate "A"
                        buff = [mrph.genkei]
                        continue
                    # Any other morpheme breaks the pattern
                    buff = []
                data = u""
        for t in extract:
            print("{}の{}".format(t[0].encode("utf8"), t[1].encode("utf8")))

    def Q68(self):
        u"""68. Read a sentence from standard input and segment it into
        bunsetsu (insert a space between bunsetsu).
        """
        input_sentence = raw_input()
        result = self.knp.parse(input_sentence.decode("utf8"))
        for bnst in result.bnst_list():
            sys.stdout.write("{} ".format("".join(
                mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
        sys.stdout.write("\n")
        return

    def Q69(self):
        u"""69. Read parsing results and print the bunsetsu that contain a
        prefix (接頭辞).
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len(filter(lambda x: x.hinsi == u"接頭辞",
                                  bnst.mrph_list())) < 1:
                        continue
                    extract.add("{} ".format("".join(
                        mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return

    def Q70(self):
        u"""70. Read parsing results and print the bunsetsu that contain two
        or more nouns.
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len(filter(lambda x: x.hinsi == u"名詞",
                                  bnst.mrph_list())) < 2:
                        continue
                    extract.add("{} ".format("".join(
                        mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return
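# A minimal driver sketch for the Solver class above. The script name and the
# choice of method are assumptions; Q62-Q67 expect Juman-format analysis on
# stdin, which the `juman` command produces, e.g.:
#
#   $ echo "吾輩は猫である。" | juman | python solver.py
#
if __name__ == "__main__":
    solver = Solver()
    solver.Q62()  # e.g. print the nouns found in the stdin analysis stream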
# coding: utf-8
from pyknp import KNP

sent = "先生は自転車で学校に行った。"
knp = KNP()
result = knp.parse(sent)

# Bunsetsu (phrases)
for bnst in result.bnst_list():
    midasi = "".join(mrph.midasi for mrph in bnst.mrph_list())
    print(bnst.bnst_id, midasi, bnst.dpndtype, bnst.parent_id, bnst.fstring)

# Tags (basic phrases)
print("-----------------------------------")
for tag in result.tag_list():
    midasi = "".join(mrph.midasi for mrph in tag.mrph_list())
    print(tag.tag_id, midasi, tag.dpndtype, tag.parent_id, tag.fstring)

# Morphemes
print("-----------------------------------")
for mrph in result.mrph_list():
    print(
        mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui,
        mrph.katuyou1, mrph.katuyou2, mrph.imis,
    )
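# A small follow-on sketch: reconstruct the bunsetsu dependency tree from the
# parse above as (child, parent) surface pairs. Only parent_id, bnst_list()
# and mrph_list() from pyknp are used; the output format is an assumption.
bnst_by_id = {b.bnst_id: b for b in result.bnst_list()}
for bnst in result.bnst_list():
    if bnst.parent_id == -1:
        continue  # the root bunsetsu has no parent
    child = "".join(m.midasi for m in bnst.mrph_list())
    parent = "".join(m.midasi for m in bnst_by_id[bnst.parent_id].mrph_list())
    print("{} -> {}".format(child, parent))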
import argparse
import io
import sys

from pyknp import KNP


if __name__ == "__main__":
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    parser = argparse.ArgumentParser()
    parser.add_argument("--rule_file", dest='rule_file', type=str,
                        action='store', required=True)
    parser.add_argument("--sentence", dest='sentence', type=str,
                        action='store', required=True)
    parser.add_argument("--juman_command", dest='juman_command', type=str,
                        action='store',
                        default="/mnt/violet/share/tool/juman++v2/bin/jumanpp")
    args = parser.parse_args()
    # MrphSeqMatch is assumed to be defined elsewhere in this module.
    mrph_seq_match = MrphSeqMatch(args.rule_file)
    knp = KNP(jumancommand=args.juman_command, option="-tab -dpnd")
    result = knp.parse(args.sentence)
    flag = mrph_seq_match.mrph_seq_match(result)
    print(flag)
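# A hedged invocation sketch for the script above; the script name, rule file
# and sentence are placeholders, not taken from the original snippet:
#
#   $ python mrph_seq_match.py \
#       --rule_file rules.txt \
#       --sentence "先生は自転車で学校に行った。"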
import re
from collections import Counter
from typing import Dict, List

import numpy
from janome.tokenizer import Tokenizer
from pyknp import Juman, KNP

# util, DefaultOptions, Chunk, Tree and wordnet are assumed to be defined
# elsewhere in this package.


class Analyser:
    """Class for syntactic analysis."""

    def __init__(self, text: str, delimiter: str = '\n'):
        self.text = text
        self.delimiter = delimiter
        self.sentences = util.split_text(self.text, delimiter)
        self.n_sentences = len(self.sentences)
        self.knp = KNP(option=DefaultOptions.KNP, jumanpp=False)
        self.trees = self._trees()
        self.juman = Juman(jumanpp=False)
        self.rs_pos = self.calc_rs_pos()
        self.n_mrphs = self.calc_n_mrphs()
        self.n_chunks = self.calc_n_chunks()
        self.n_types = self.calc_n_types()
        self.mean_n_mrphs = None \
            if self.n_sentences == 0 \
            else self.n_mrphs / self.n_sentences
        self.rs_modality = self.calc_rs_modality()
        self.r_conditional = None \
            if self.n_sentences == 0 \
            else self.calc_n_conditionals() / self.n_sentences
        self.mean_tree_depths = self.calc_mean_tree_depths()

    def _trees(self) -> List[Tree]:
        """Analyse dependency structure using KNP

        Returns:
            list(trf.Tree)
        """
        results = []
        for sentence in self.sentences:
            chunks = []
            parse_result = self.knp.parse(sentence)
            for bnst in parse_result.bnst_list():
                chunk = Chunk(chunk_id=bnst.bnst_id,
                              link=bnst.parent_id,
                              description=bnst.fstring)
                chunks.append(chunk)
            surfaces = [m.midasi for m in parse_result.mrph_list()]
            results.append(Tree(sentence, chunks, surfaces))
        return results

    def calc_rs_pos(self) -> Dict[str, float]:
        """Calculate the ratio of each part of speech in the input text

        Returns:
            Dict[str, float]: the ratio of each part of speech
        """
        pos = []
        # TODO: this may take a long time when the number of sentences is large
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            pos += [mrph.hinsi for mrph in juman_result.mrph_list()]
        pos_counter = Counter(pos)
        total = sum(pos_counter.values())
        return {name: float(num) / total for name, num in pos_counter.items()}

    def calc_mean_tree_depths(self) -> float:
        """Calculate the mean depth of the dependency trees

        Returns:
            float: the mean depth of the trees
        """
        return numpy.mean([tree.depth for tree in self.trees])

    def calc_mean_sentence_length(self) -> float:
        """Calculate the mean length (number of morphemes) of the sentences

        Returns:
            float: the mean sentence length
        """
        result = 0
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            result += len(juman_result.mrph_list())
        return result / self.n_sentences

    def calc_n_sentences(self) -> int:
        """Calculate the number of sentences in the input text

        Returns:
            int: the number of sentences, split on the delimiter
                (default: newline)
        """
        return self.n_sentences

    def calc_n_types(self) -> int:
        """Calculate the number of word types in the input text

        Returns:
            int: the number of word types
        """
        surfaces = []
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            surfaces += [mrph.midasi for mrph in juman_result.mrph_list()]
        word_type_counter = Counter(surfaces)
        return len(word_type_counter)

    def calc_n_mrphs(self) -> int:
        """Calculate the number of morphemes in the input text

        Returns:
            int: the number of morphemes
        """
        n_mrphs = 0
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            n_mrphs += len(juman_result.mrph_list())
        return n_mrphs

    def calc_n_chunks(self) -> int:
        # TODO: share the parsing result instead of re-parsing
        return sum([len(self.knp.parse(s).bnst_list()) for s in self.sentences])

    def calc_rs_modality(self) -> Dict[str, float]:
        modality_counter = Counter()
        for s in self.sentences:
            chunks = []
            for bnst in self.knp.parse(s).bnst_list():
                chunk = Chunk(chunk_id=bnst.bnst_id,
                              link=bnst.parent_id,
                              description=bnst.fstring)
                chunks.append(chunk)
            features = "".join([chunk.description for chunk in chunks])
            # Count modality features such as <モダリティ-意志> in the
            # concatenated KNP feature strings
            ms = set(re.findall("<モダリティ-(.+?)>", features))
            modality_counter += Counter(ms)
        n = len(self.sentences)
        return dict([(k, float(c) / n) for k, c in modality_counter.items()])

    def calc_n_conditionals(self) -> int:
        """
        Returns:
            int: the number of sentences that contain one or more
                conditional clauses
        """
        result = 0
        tokenizer = Tokenizer()
        for s in self.sentences:
            for token in tokenizer.tokenize(s):
                if token.infl_form == '仮定形':
                    result += 1
                    break
        return result

    def calc_mean_thesaurus_depths(self) -> float:
        # TODO: share the parsing result instead of re-tokenizing
        surfaces = []
        tokenizer = Tokenizer()
        for s in self.sentences:
            for token in tokenizer.tokenize(s):
                pos, pos1, _, _ = token.part_of_speech.split(',')
                if pos == '名詞' and pos1 == '一般':
                    surfaces.append(token.surface)
        return wordnet.calc_mean_thesaurus_depths(surfaces)
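# A minimal usage sketch for the Analyser class above; the sample text is a
# placeholder, and the attributes read are the ones computed in __init__.
if __name__ == '__main__':
    analyser = Analyser("先生は自転車で学校に行った。\n今日は晴れだ。")
    print(analyser.n_sentences)       # number of sentences
    print(analyser.n_mrphs)           # number of morphemes
    print(analyser.rs_pos)            # part-of-speech ratios
    print(analyser.mean_tree_depths)  # mean dependency-tree depth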
from pyknp import KNP


class Parser:

    def __init__(self):
        """Parser class"""
        self.knp = KNP()

    def __call__(self, text):
        """Parse a text with KNP.

        :param text: input text
        :return: analysis result (dict), or None on failure
        """
        chunks = []
        links = []
        try:
            result = self.knp.parse(text)
        except Exception as e:
            print("text \"" + text + "\" " + str(e))
            return None

        # Build the token arrays for each chunk
        for bnst in result.bnst_list():
            chunk = {"Independent": [], "Ancillary": [], "Link": None}
            for mrph in bnst.mrph_list():
                tmp = {"surface": mrph.midasi,
                       "original": mrph.genkei,
                       "read": mrph.yomi,
                       "position": [mrph.hinsi, mrph.bunrui],
                       "conjugate": [mrph.katuyou1, mrph.katuyou2],
                       }
                # Independent word (自立語)
                if tmp["position"][0] != "助詞" and \
                   tmp["position"][0] != "助動詞" and \
                   tmp["position"][0] != "判定詞" and \
                   tmp["position"][0] != "特殊":
                    chunk["Independent"].append(tmp)
                # Ancillary word (付属語)
                else:
                    chunk["Ancillary"].append(tmp)
            # Register the chunk and its dependency link
            chunks.append(chunk)
            links.append(bnst.parent_id)

        # Attach dependency information; parent_id is -1 for the root chunk,
        # so any id >= 0 is a real dependency
        for chunk_id, link_id in enumerate(links):
            if link_id >= 0:
                chunks[chunk_id]["Link"] = chunks[link_id]["Independent"]

        return {"Body": text, "Chunks": chunks}

    @classmethod
    def display(cls, info):
        """Print the analysed information.

        :param info: analysed result from __call__
        """
        def join_tokens(tokens):
            surface = "".join(token["surface"] for token in tokens)
            read = "".join(token["read"] for token in tokens)
            original = "".join(token["original"] for token in tokens)
            return "{}/{} ({})".format(surface, read, original)

        for parse in info["Chunks"]:
            print("Chunk: ")
            print(" Independent: " + join_tokens(parse["Independent"]))
            print(" Ancillary: " + join_tokens(parse["Ancillary"]))
            if parse["Link"]:
                print(" Link: " + join_tokens(parse["Link"]))
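# A minimal usage sketch for the Parser class above; the sample sentence is a
# placeholder.
if __name__ == "__main__":
    parser = Parser()
    info = parser("先生は自転車で学校に行った。")
    if info is not None:
        Parser.display(info)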