def __init__(
    self,
    dataset: PASDataset,
    logger: Logger,
    use_knp_overt: bool = True,
) -> None:
    """Set up writer state from a parsed PAS dataset.

    Args:
        dataset: Source dataset; target cases, exophors, documents and the
            special-token index are copied from it.
        logger: Logger used for warnings during setup/analysis.
        use_knp_overt: Whether to rely on KNP's overt case analysis.
    """
    self.examples: List[PasExample] = dataset.examples
    self.cases: List[str] = dataset.target_cases
    self.bridging: bool = dataset.bridging
    self.coreference: bool = dataset.coreference
    # Relation labels: target cases, plus 'ノ' (bridging) and '='
    # (coreference) when those tasks are enabled — multiplying a
    # one-element list by a bool yields [] or the list itself.
    self.relations: List[str] = dataset.target_cases + (['ノ'] * self.bridging) + (['='] * self.coreference)
    self.exophors: List[str] = dataset.target_exophors
    # Inverse of the dataset's special-token mapping (index -> token).
    self.index_to_special: Dict[int, str] = {idx: token for token, idx in dataset.special_to_index.items()}
    self.documents: List[Document] = dataset.documents
    self.logger = logger
    self.use_knp_overt = use_knp_overt
    self.kc: bool = dataset.kc
    self.reader = dataset.reader
    # Read analyzer settings; fall back to an empty 'default' section so
    # the section.get() calls below resolve to the shutil.which() lookups.
    cfg = configparser.ConfigParser()
    cfg.read(Path(__file__).parent.parent / 'analyzer' / 'config.ini')
    if 'default' not in cfg:
        logger.warning('Analyzer config not found. Instead, use default values.')
        cfg['default'] = {}
    section = cfg['default']
    knp_command = section.get('knp_command', shutil.which('knp'))
    jumanpp_command = section.get('juman_command', shutil.which('jumanpp'))
    self.knp = KNP(command=knp_command, option='-tab -case2', jumancommand=jumanpp_command)
def generate_knowledge(sentence):
    """Parse *sentence* with KNP and group extracted values into records.

    Each record (``info``) is a dict keyed by ``info_elements``; a new
    record is started whenever a slot that is already filled would be
    overwritten.

    Returns:
        list of info dicts, in the order they were completed.
    """
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(sentence.replace(" ", ""))
    bnst_list = result.bnst_list()
    # map bnst_id -> bunsetsu for lookups inside get_gimonshi()
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)
    infos = []
    info = dict((x, None) for x in info_elements)
    for bnst in bnst_list:
        place = get_gimonshi(bnst, bnst_dic)
        if place is None:
            continue
        if info[place] is None:
            info[place] = select_normalization_representative_notation(
                bnst.fstring)
        else:
            # Slot already filled: this bunsetsu starts a new record.
            infos.append(info)
            info = dict((x, None) for x in info_elements)
            info[place] = select_normalization_representative_notation(
                bnst.fstring)
    # BUG FIX: the record still being built was silently dropped when the
    # loop ended; keep it if it holds at least one value.
    if any(value is not None for value in info.values()):
        infos.append(info)
    return infos
def select_dependency_structure(line):
    """Extract the dependency structure of *line*.

    Returns:
        list of (dependent, head) pairs, each element being the
        normalized representative notation of a bunsetsu.
    """
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(line)
    bnst_list = result.bnst_list()
    # map bnst_id -> bunsetsu for head lookup
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)
    tuples = []
    for bnst in bnst_list:
        if bnst.parent_id != -1:  # -1 marks the root (no head)
            # FIX: removed leftover debug print of bnst_id/parent_id
            # that polluted stdout on every call.
            tuples.append(
                (select_normalization_representative_notation(bnst.fstring),
                 select_normalization_representative_notation(
                     bnst_dic[bnst.parent_id].fstring)))
    return tuples
def __init__(self):
    """Create a KNP parser and preload every word/marker list used by
    the volition classifier (one resource file per attribute)."""
    self._knp = KNP()
    # Attribute `_<name>` is loaded from resource file `<name>.txt`.
    for stem in (
        'valid_nominative_strings',
        'valid_nominative_semantic_markers',
        'volition_modalities',
        'volition_voices',
        'non_volition_voices',
        'volition_adverb_repnames',
        'non_volition_adverb_repnames',
        'valid_adjective_predicate_suffix_repnames',
        'non_volition_verbal_suffix_semantic_labels',
        'non_volition_verbal_suffix_repnames',
        'non_volition_types',
        'non_volition_head_repnames',
        'non_volition_semantic_labels',
    ):
        setattr(self, '_' + stem, self._load_file(stem + '.txt'))
def __init__(
    self,
    knp: Optional[KNP] = None,
    jumanpp: bool = True,
    fallback_juman: bool = True,
):
    """Wrap a KNP parser, creating a fresh one when none is supplied."""
    if knp is None:
        knp = KNP(jumanpp=jumanpp)
    self.knp = knp
    self.juman = self.knp.juman
    # Warm-up parse so self.knp spawns its socket/subprocess right away.
    self.knp.parse("。")
    self.fallback_juman = fallback_juman
def tag(text: str) -> 'Tuple[list, list]':
    """Parse *text* with KNP and collect dependency links between tags.

    Returns:
        (tag_list, tag_ids) where tag_ids is
        [(child tag ID, parent tag ID), ...].
    """
    # FIX: the return annotation was the tuple literal ``(list, list)``,
    # which is not a valid PEP 484 type; use the (lazily-evaluated)
    # string form instead.
    knp = KNP()
    tag_list = knp.parse(text).tag_list()
    # parent_id == -1 marks the root tag, which has no head.
    tag_ids = [(t.tag_id, t.parent_id) for t in tag_list if t.parent_id != -1]
    return tag_list, tag_ids
def total_chunk2(text):
    """Count the bunsetsu (chunks) of *text*, printing each one and
    skipping chunks whose surface form is the literal string "None"."""
    from pyknp import KNP
    knp = KNP()
    parsed = knp.parse(text)
    count = 0
    for bnst in parsed.bnst_list():
        surface = "".join(m.midasi for m in bnst.mrph_list())
        if surface != "None":
            print(surface)
            count += 1
    return count
def __init__(self):
    """Initialize the Juman and KNP analyzers.

    Examples
    --------
    >>> nlp = JNLP()
    """
    self.KNP = KNP(option='-tab -anaphora')
    self.juman = Juman()
def tag(text: str) -> 'Tuple[list, list]':
    """Parse *text* with KNP and collect case-relation links.

    Returns:
        (tag_list, tag_ids) where tag_ids is
        [(child tag ID, parent tag ID), ...] read from the
        <格関係N:...> features of each tag.
    """
    knp = KNP()
    tag_list = knp.parse(text).tag_list()
    tag_ids = list()
    for tag in tag_list:  # visit every basic phrase
        if re.search('<格関係', tag.fstring):
            # BUG FIX: the pattern r'格関係\d' matched only ONE digit, so
            # tag IDs >= 10 were truncated (e.g. 格関係12 -> 1). Capture
            # the full number with a group.
            for num in re.findall(r'格関係(\d+)', tag.fstring):
                tag_ids.append((int(num), tag.tag_id))
    return tag_list, tag_ids
def get_u_gimonshi(sentence):
    """Return the last question-word slot detected in *sentence*.

    Spaces are stripped before KNP parsing; each bunsetsu is classified
    by question() and the final non-None result wins. Returns "" when
    nothing matches.
    """
    line = sentence.replace(" ", "")
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(line)
    bnst_list = result.bnst_list()
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)
    u_gimonshi = ""
    for bnst in bnst_list:
        place = question(bnst, bnst_dic)
        if place is not None:  # idiom fix: was `place != None`
            u_gimonshi = place
    return u_gimonshi
def main():
    """CLI: load word2vec matrices and answer nearest-neighbour queries.

    With --query, answer once and exit; otherwise loop reading queries
    from stdin forever.
    """
    knp = KNP(jumanpp=True, option='-tab -assignf')
    parser = argparse.ArgumentParser()
    parser.add_argument("--npyfile", "-m")
    parser.add_argument("--vocabfile", "-v")
    parser.add_argument("--topk", "-k", type=int, default=5)
    parser.add_argument("--query", "-q", type=str, default='')
    parser.add_argument("--cnpyfile", "-c", type=str, default='')
    parser.add_argument("--cvocabfile", "-u", type=str, default='')
    args = parser.parse_args()
    w2vec = load_model(args.npyfile, args.vocabfile)
    # optional context-vector model
    c2vec = {}
    cvocabs = []
    if args.cnpyfile and args.cvocabfile:
        c2vec = load_model(args.cnpyfile, args.cvocabfile)
    if args.query:
        parse_and_print(args.query, knp, w2vec, args.topk, c2vec)
        return
    while True:
        parse_and_print(input(), knp, w2vec, args.topk, c2vec)
def main():
    """Build a gzipped vocabulary file from a gzipped KNP corpus.

    argv: [1] input .knp.gz, [2] output .vocab.gz
    """
    knp = KNP(jumanpp=True, option='-tab')
    knpfile, vocabfile = sys.argv[1], sys.argv[2]
    assert 'gz' in knpfile
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp, \
            gzip.open(vocabfile, mode='wt', encoding='utf8') as ofp:
        write_vocab(knp, ifp, ofp)
class KnpService(object):
    """Thin wrapper around pyknp's KNP: parses raw text and re-loads
    previously saved KNP analysis output."""

    # Sentinel line terminating one sentence's block in KNP output.
    MARK_EOS = "EOS"

    def __init__(self):
        self.__knp = KNP()

    def parse(self, string):
        """Parse one sentence after normalizing it for Juman/KNP input."""
        formatted_string = JumanKnpUtil.format_input_string(string)
        return self.__knp.parse(formatted_string)

    def parse_all(self, strings):
        """Parse a newline-separated block of sentences one by one."""
        return [self.parse(string) for string in strings.split("\n")]

    def result(self, string_iterator):
        """Re-load saved KNP output; one parsed object per EOS block."""
        results = []
        data = ""
        for line in string_iterator:
            data += line
            if line.strip() == KnpService.MARK_EOS:
                results.append(self.__knp.result(data))
                data = ""
        return results

    def load_with_handler(self, string_iterator, handler):
        """Stream saved KNP output, invoking *handler* on each block.

        Returns None; results are consumed through *handler* only.
        """
        data = ""
        for line in string_iterator:
            data += line
            if line.strip() == KnpService.MARK_EOS:
                # Stopgap: skip blocks where case analysis did not run
                # properly (garbled output, or only a syntactic parse for
                # over-long sentences). NOTE(review): the handler fires
                # only when data contains NEITHER r"\n\* \d+ " NOR
                # "Fell back to" — confirm this matches the saved format.
                if not (JumanKnpUtil.is_match_partly(r"\n\* \d+ ", data)
                        ) and not (JumanKnpUtil.is_match_partly(
                            r"Fell back to", data)):
                    handler(self.__knp.result(data))
                data = ""
        return

    def load_from_file_with_handler(self, filepath, handler):
        """Apply *handler* to each block of a saved KNP file.

        Note: load_with_handler returns None, so this returns None too.
        """
        with open(filepath, "r") as f:
            results = self.load_with_handler(iter(f.readline, ""), handler)
        return results

    def load_from_file(self, filepath):
        """Read a saved KNP file and return the list of parsed blocks."""
        with open(filepath, "r") as f:
            results = self.result(iter(f.readline, ""))
        return results
def total_chunk(text):
    """Count bunsetsu across all '。'-separated sentences of *text*.

    Chunks whose surface form is the literal "None" are skipped, and
    sentences KNP fails on are ignored (best-effort counting).
    """
    from pyknp import KNP
    knp = KNP()
    # FIX: removed the unused local `sentences = []`.
    num = 0
    for stc in text.split("。"):
        if not stc == "":
            try:
                result = knp.parse(stc)
                for bnst in result.bnst_list():
                    if not "".join(mrph.midasi for mrph in bnst.mrph_list()) == "None":
                        num += 1
            except Exception:
                # best-effort: unparsable sentences contribute nothing
                pass
    return num
def load_knp_from_stream(f, juman_format=JUMAN_FORMAT.DEFAULT):
    """Interpret a stream of KNP-format analysis results.

    Args:
        f (file): file object containing KNP-format output
        juman_format (JUMAN_FORMAT): Juman lattice output format

    Yields:
        BList: one bunsetsu-list object per EOS-terminated block
    """
    knp = KNP()
    pending = []
    for line in f:
        pending.append(line)
        if line.startswith("EOS"):
            yield knp.result("".join(pending), juman_format=juman_format)
            pending = []
def main():
    """Convert a gzipped KNP corpus into a gzipped segmented file.

    argv: [1] input .knp.gz, [2] output file (gzipped)
    """
    knp = KNP(jumanpp=True, option='-tab')
    knpfile, wakatifile = sys.argv[1], sys.argv[2]
    assert 'gz' in knpfile
    assert 'gz' in wakatifile
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp, \
            gzip.open(wakatifile, mode='wt', encoding='utf8') as ofp:
        write_wakati(knp, ifp, ofp, WORDFORM.LONGEST)
def read_knp_result_file(filename: str) -> List[BList]:
    """Read a KNP result file.

    Args:
        filename: A filename.

    Returns:
        A list of :class:`pyknp.knp.blist.BList` objects.
    """
    knp = KNP()
    blists = []
    with open(filename, "rt", encoding="utf-8", errors="replace") as f:
        lines = []
        for line in f:
            lines.append(line)
            if line.strip() == "EOS":  # one parsed object per EOS block
                blists.append(knp.result("".join(lines)))
                lines = []
    return blists
def get_context_words(self, sentence_size_limit=100):
    """Collect context-word frequencies from the saved KNP analyses of
    up to *sentence_size_limit* sentences.

    Returns:
        dict mapping context word -> frequency.
    """
    knp = KNP()
    knp_extractor = KNP_extractor(self.config.knp_index_db,
                                  self.config.knp_parent_dir,
                                  self.config.knp_sub_index_length)
    context_words = Counter()
    # FIX: dropped the unused enumerate() index.
    for sent_tuple in self.sents[:sentence_size_limit]:
        sid = sent_tuple.sid.split('%')[0]
        sup_knp = knp_extractor.get_knp(sid)
        if not sup_knp:
            sys.stderr.write("fail to convert knp of %s.\n" % sid)
            continue
        try:
            result = knp.result(sup_knp.decode('utf-8'))
            context_words.update(self._get_sentence_args(result))
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt; narrowed to Exception.
            sys.stderr.write("fail to convert knp of %s.\n" % sid)
    return dict(context_words)
def init():
    """Flask endpoint: (re)create the module-global KNP instance and
    request queue, then acknowledge."""
    req = request
    print(req.data)
    global knp
    knp = KNP()
    global queue
    queue = deque()
    return jsonify(ResultSet={"text": "init done"})
def generate_utterance(u_sen, all_infos):
    """Answer the user utterance *u_sen* from the knowledge records.

    The question slot is identified with get_u_gimonshi(); the record
    mentioning any word of the utterance is located, and its value (or a
    neighbouring record's) is returned with です appended.

    Returns:
        The answer string, or "No information" when nothing matches.
    """
    # What is being asked?
    question = get_u_gimonshi(u_sen)
    # Which record does the utterance refer to?
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(u_sen.replace(" ", ""))
    bnst_list = result.bnst_list()
    search_words = []
    for bnst in bnst_list:
        search_words.append(
            create_infos.select_normalization_representative_notation(
                bnst.fstring))
    search_point = -1
    for search_word in search_words:
        for i, info in enumerate(all_infos):
            if search_word in info.values():
                search_point = i
    if search_point == -1:
        return "No information"
    if all_infos[search_point][question] is not None:
        # answer found at the matched record itself
        return all_infos[search_point][question] + "です"
    # BUG FIX: the original indexed all_infos[search_point + 1] without a
    # bounds check and raised IndexError when the match was the last
    # record; probe the neighbours (previous first, as before) safely.
    for neighbor in (search_point - 1, search_point + 1):
        if 0 <= neighbor < len(all_infos) and all_infos[neighbor][question] is not None:
            return all_infos[neighbor][question] + "です"
    return "No information"
def __init__(self, text: str, delimiter: str = '\n'):
    """Split *text* into sentences and precompute readability metrics.

    Args:
        text: Full input text.
        delimiter: Sentence delimiter passed to util.split_text.
    """
    self.text = text
    self.delimiter = delimiter
    self.sentences = util.split_text(self.text, delimiter)
    self.n_sentences = len(self.sentences)
    # Order matters: the KNP parser must exist before _trees() runs.
    self.knp = KNP(option=DefaultOptions.KNP, jumanpp=False)
    self.trees = self._trees()
    self.juman = Juman(jumanpp=False)
    self.rs_pos = self.calc_rs_pos()
    self.n_mrphs = self.calc_n_mrphs()
    self.n_chunks = self.calc_n_chunks()
    self.n_types = self.calc_n_types()
    # Per-sentence means guard against division by zero on empty input.
    self.mean_n_mrphs = None \
        if self.n_sentences == 0 \
        else self.n_mrphs / self.n_sentences
    self.rs_modality = self.calc_rs_modality()
    self.r_conditional = None \
        if self.n_sentences == 0 \
        else self.calc_n_conditionals() / self.n_sentences
    self.mean_tree_depths = self.calc_mean_tree_depths()
def extract_poems(lines: List[str], jobs: int) -> List[Tuple]:
    """Scan *lines* for poem candidates in parallel worker processes.

    Args:
        lines: Candidate text lines.
        jobs: Number of worker processes.

    Returns:
        Flattened list of (poem, line) tuples from all workers.
    """
    knp = KNP(jumanpp=True)
    # Split the input into `jobs` roughly equal chunks.
    chunk_size = len(lines) // jobs + 1
    # NOTE(review): the KNP instance is passed to worker processes, so it
    # must be picklable here — confirm pyknp supports this before the
    # parser has spawned its subprocess.
    arguments = [(lines[i:i + chunk_size], knp)
                 for i in range(0, len(lines), chunk_size)]
    with mp.Pool(jobs) as p:
        checked_chunks = p.starmap(_extract_poems, arguments)
    poems = []
    for chunk in checked_chunks:
        poems.extend(chunk)
    return poems
def evg():
    """CLI: read KNP output from stdin and build an EventGraph.

    Saves to --output when given, otherwise dumps JSON to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", "-o", default="", help="path to output")
    args = parser.parse_args()
    basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    knp = KNP()
    results = []
    pending = []
    reader = codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin))
    for line in reader:
        pending.append(line)
        if line.strip() == "EOS":  # one parsed object per EOS block
            results.append(knp.result("".join(pending)))
            pending = []
    evg_ = EventGraph.build(results)
    if args.output:
        evg_.save(args.output)
    else:
        print(json.dumps(evg_.to_dict(), indent=4, ensure_ascii=False))
def _apply_knp(self, sent: str) -> str:
    """Parse *sent* with KNP, optionally replacing the dependency
    analysis with the output of a remote KNP server.

    Returns:
        The final KNP analysis in spec (text) format.
    """
    self.logger.info(f'parse sentence: {sent}')
    # Local dependency parse first.
    knp = KNP(command=self.knp, jumancommand=self.juman, option=self.knp_dpnd_option)
    knp_result = knp.parse(sent)
    if self.remote_knp is True:
        # Send Juman++ CoNLL output to the remote KNP server and collect
        # its dependency analysis over a raw TCP socket.
        _, jumanpp_conll_out = self._apply_jumanpp(sent)
        clientsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.logger.info(f'connect to {self.knp_host}:{self.knp_port}')
        clientsock.connect((self.knp_host, self.knp_port))
        clientsock.sendall(jumanpp_conll_out.encode('utf-8'))
        buf = []
        while True:
            data = clientsock.recv(8192)
            # NOTE(review): decoding per recv() chunk assumes multibyte
            # characters and the 'EOS\n' terminator are never split
            # across chunks — confirm the server's flushing behavior.
            data_utf8 = data.decode('utf-8')
            buf.append(data_utf8)
            if data_utf8.endswith('EOS\n'):
                break
        clientsock.close()
        conllu_out = ''.join(buf)
        self.logger.info(f'received {len(conllu_out)} chars from remote KNP')
        # Overwrite the local heads/labels with the remote CoNLL-U result.
        head_ids, dpnd_types = self._read_conllu_from_buf(conllu_out)
        self._modify_knp(knp_result, head_ids, dpnd_types)
    # Second KNP pass adds predicate-argument structures on top of the
    # (possibly modified) dependency analysis.
    knp = KNP(command=self.knp, jumancommand=self.juman, option=self.knp_case_option)
    knp_result_new = knp.parse_juman_result(knp_result.spec())
    return knp_result_new.spec()
def test():
    # Smoke test (Python 2): read KNP '-tab -dpnd' output from stdin and
    # print head-word dependency pairs.
    # ex.) echo "私は自然言語処理の研究をする" | juman | knp -tab -dpnd | python DependencyParser.py
    import codecs
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
    knp = KNP()
    data = u""
    for line in iter(sys.stdin.readline, ""):
        data += line
        if line.strip() == u"EOS":  # one sentence per EOS block
            result = knp.result(data)
            # full-bunsetsu pairs (unused below) and head-word pairs
            DB = parseDependency(result.bnst_list(), head=False)
            DBhead = parseDependency(result.bnst_list(), head=True)
            print "parent-child"
            # for bnstrep in DB:
            #     print bnstrep
            for bnstrep in DBhead:
                print bnstrep
            data = u""
def select_dependency_structure(line):
    """Extract the dependency structure of *line* as (dependent, head)
    pairs of normalized representative notations."""
    # FIX: removed the leftover debug trace print
    # ("called select_dependency_structure()") that polluted stdout.
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(line)
    bnst_list = result.bnst_list()
    # map bnst_id -> bunsetsu for head lookup
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)
    tuples = []
    for bnst in bnst_list:
        if bnst.parent_id != -1:  # -1 marks the root (no head)
            tuples.append((select_normalization_representative_notation(bnst.fstring),
                           select_normalization_representative_notation(bnst_dic[bnst.parent_id].fstring)))
    return tuples
def main():
    """Compute tf/df files (with and without bunsetsu boundaries) from a
    gzipped KNP corpus.

    argv: [1] input .knp.gz, [2] output directory
    """
    knp = KNP(jumanpp=True, option='-tab')
    knpfile, outdir = sys.argv[1], sys.argv[2]
    assert '.knp.gz' in knpfile
    # Derive output names from the input basename,
    # e.g. wiki_00.knp.gz -> <outdir>/wiki_00.ab.tf.gz etc.
    head = os.path.join(outdir, os.path.basename(knpfile).split('.')[0])
    tffile_ab = head + '.ab.tf.gz'
    dffile_ab = head + '.ab.df.gz'
    tffile_anob = head + '.anob.tf.gz'
    dffile_anob = head + '.anob.df.gz'
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp, \
            gzip.open(tffile_ab, mode='wt', encoding='utf8') as ofp_tf_ab, \
            gzip.open(dffile_ab, mode='wt', encoding='utf8') as ofp_df_ab, \
            gzip.open(tffile_anob, mode='wt', encoding='utf8') as ofp_tf_anob, \
            gzip.open(dffile_anob, mode='wt', encoding='utf8') as ofp_df_anob:
        parse_and_write(knp, ifp, ofp_tf_anob, ofp_tf_ab, ofp_df_anob, ofp_df_ab)
def __init__(
    self,
    word2vec_model: Word2VecModel,
    juman_command: str='jumanpp',
    specific_parts: Optional[List[str]]=None
) -> None:
    """Set up the morphological analyzer, parser and word2vec model.

    Raises:
        AttributeError: if juman_command is neither 'juman' nor 'jumanpp'.
    """
    if specific_parts is None:
        specific_parts = ['普通名詞']
    if juman_command == 'jumanpp':
        self.juman: Union[Juman, Jumanpp] = Jumanpp()
    elif juman_command == 'juman':
        self.juman: Union[Juman, Jumanpp] = Juman()
    else:
        raise AttributeError
    self.knp: KNP = KNP(jumancommand=juman_command)
    self.specific_parts: List[str] = specific_parts
    self.word2vec: Word2VecModel = word2vec_model
def main():
    """Extract dependency word pairs from a KNP-parsed corpus, keeping
    only vocabulary above a frequency threshold.

    argv: [1] .knp.gz corpus, [2] .vocab.gz counts, [3] output .deps.gz
    """
    knp = KNP(jumanpp=True, option='-tab')
    knpfile, vocabfile, depsfile = sys.argv[1], sys.argv[2], sys.argv[3]
    # frequency cut-off applied to the full vocabulary
    vocab_thre = 100
    assert 'gz' in vocabfile
    with gzip.open(vocabfile, mode='rt', encoding='utf8') as ifp:
        vocab = read_vocab(ifp, vocab_thre)
    # Extract dependency pairs from the parsed file
    # (CoNLL-like: tokens = [(id, form, head, deprel)]).
    assert 'gz' in knpfile
    assert 'gz' in depsfile
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp, \
            gzip.open(depsfile, mode='wt', encoding='utf8') as ofp:
        write_deps(knp, vocab, ifp, ofp)
def reparse_knp(knp_file: Path, output_dir: Path, knp: KNP, keep_dep: bool) -> None:
    """Re-run KNP annotation (dependencies etc.) over a saved KNP file.

    Args:
        knp_file: Input file in KNP format.
        output_dir: Directory receiving the reparsed file (same name).
        knp: Parser used for re-annotation.
        keep_dep: When True keep existing heads (e.g. '+ 3D'); when
            False strip them so KNP assigns dependencies afresh.
    """
    blists: List[BList] = []
    buff = ''
    with knp_file.open() as fin:
        for line in fin:
            if line.startswith(('+', '*')):
                if keep_dep:
                    buff += ' '.join(line.split()[:2]) + '\n'  # e.g. '+ 3D'
                else:
                    buff += line[0] + '\n'  # e.g. '+'
            else:
                buff += line
            if line.strip() == 'EOS':
                blists.append(knp.reparse_knp_result(buff))
                buff = ''
    output_dir.joinpath(knp_file.name).write_text(''.join(blist.spec() for blist in blists))
def _extract_poems(chunk: List[str], knp: KNP) -> List[Tuple]:
    """Find poem-shaped substrings in each candidate line of *chunk*.

    Each whitelisted line is KNP-parsed into phrases annotated with mora
    counts, then every phrase position is checked for a span whose
    cumulative mora counts hit all required boundaries.

    Returns:
        List of (poem, original line) tuples that pass criteria().
    """
    poems = []
    for line in chunk:
        if WHITE_LIST.fullmatch(line):
            try:
                parsed = knp.parse(line)
                # Per-bunsetsu lists of (surface, mora count, POS,
                # sub-POS, conjugation-form) morpheme tuples.
                phrases = [[(mrph.midasi, count_mora(mrph.yomi), mrph.hinsi,
                             mrph.bunrui, mrph.katuyou2)
                            for mrph in bnst.mrph_list()]
                           for bnst in parsed.bnst_list()]
            except ValueError:
                continue  # KNP rejected the line; drop this candidate
            n = len(phrases)  # the number of phrases
            # Cumulative mora counts starting from every phrase index.
            mora_counts = [
                cumsum(phrases[start:], n - start) for start in range(n)
            ]
            for index, mora_count in enumerate(mora_counts):
                # Are all required mora boundaries reachable from here?
                if len(MORA_PATTERN - set(mora_count)) == 0:
                    poem = extract_poem(phrases, index, mora_count)
                    if criteria(poem):
                        poems.append((poem, line))
    return poems
def __init__(self):
    """Create the Juman and KNP analyzers used by this instance."""
    self.knp = KNP()
    self.juman = Juman()
class Solver(object):
    """Solutions to NLP exercises 61-70 using Juman (morphology) and KNP
    (parsing). Written for Python 2 (raw_input, str.decode, print
    statement in caller code)."""

    def __init__(self):
        self.juman = Juman()
        self.knp = KNP()

    def Q61(self):
        u"""61. Read a sentence from stdin and segment it into words
        (insert a space between morphemes).
        """
        input_sentence = raw_input()
        result = self.juman.analysis(input_sentence.decode("utf8"))
        for mrph in result.mrph_list():
            sys.stdout.write("{} ".format(mrph.midasi.encode("utf8")))
        sys.stdout.write("\n")
        return

    def Q62(self):
        u"""62. Read morphological-analysis output and print only the
        nouns. Hint: test whether mrph.hinsi equals u"名詞".
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyze once the sentence ends
                result = self.juman.result(data)
                s = ",".join(mrph.midasi for mrph in result.mrph_list() if mrph.hinsi == u"名詞")  # nouns only
                if len(s) > 0:
                    print(s)
                data = u""

    def Q63(self):
        u"""63. Read morphological-analysis output and print only the
        verbs (base form). Hint: test whether mrph.hinsi equals u"動詞".
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyze once the sentence ends
                result = self.juman.result(data)
                s = ",".join(mrph.genkei for mrph in result.mrph_list() if mrph.hinsi == u"動詞")  # verbs only
                if len(s) > 0:
                    print(s)
                data = u""

    def Q64(self):
        u"""64. Read morphological-analysis output and list morpheme base
        forms ordered by frequency. Hint: dictionary + sorted().
        """
        data = u""
        hist = {}
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyze once the sentence ends
                result = self.juman.result(data)
                for mrph in result.mrph_list():
                    try:
                        hist[mrph.genkei] += 1
                    except KeyError:
                        hist[mrph.genkei] = 1
                data = u""
        for key, val in sorted(hist.items(), key=lambda t: t[1], reverse=True):
            print("{},{}".format(key.encode("utf8"), val))

    def Q65(self):
        u"""65. Read morphological-analysis output and compute the ratio
        of predicates (verbs, i-adjectives, na-adjectives) to the total
        number of morphemes.
        """
        data = u""
        num = 0
        denom = 0
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyze once the sentence ends
                result = self.juman.result(data)
                if verbose:
                    logger.info("denom: {}".format(denom))
                for mrph in result.mrph_list():
                    denom += 1
                    if mrph.hinsi == u"動詞":
                        num += 1
                        continue
                    if mrph.hinsi == u"形容詞" and mrph.bunrui.startswith(u"イ形容詞"):
                        num += 1
                        continue
                    if mrph.hinsi == u"形容動詞" and mrph.bunrui.startswith(u"ナ形容詞"):
                        num += 1
                        continue
                data = u""
        print("{}/{}={}".format(num, denom, float(num) / denom))

    def Q66(self):
        u"""66. Read morphological-analysis output and print every
        "sahen-noun + suru/dekiru" pattern found.
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyze once the sentence ends
                result = self.juman.result(data)
                buff = None  # previous sahen-noun, if any
                for mrph in result.mrph_list():
                    if mrph.genkei == u"できる" or mrph.genkei == u"する":
                        if buff is not None:
                            extract.add((buff.genkei.encode("utf8"), mrph.genkei.encode("utf8")))
                    if mrph.bunrui == u"サ変名詞":
                        buff = mrph
                    else:
                        buff = None
                data = u""
        for t in extract:
            print("{}+{}".format(t[0], t[1]))

    def Q67(self):
        u"""67. Read morphological-analysis output and print every
        "A の B" expression where A and B are single noun morphemes.
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyze once the sentence ends
                result = self.juman.result(data)
                buff = []  # partial match: [A] or [A, の]
                for mrph in result.mrph_list():
                    if mrph.genkei == u"の" and len(buff) == 1:
                        buff.append(u"の")
                        continue
                    if mrph.hinsi == u"名詞":
                        if len(buff) == 0:
                            buff.append(mrph.genkei)
                            continue
                        if len(buff) == 2:
                            extract.add((buff[0], mrph.genkei))
                    buff = []
                data = u""
        for t in extract:
            print("{}の{}".format(t[0].encode("utf8"), t[1].encode("utf8")))

    def Q68(self):
        u"""68. Read a sentence from stdin and segment it into bunsetsu
        (insert a space between chunks).
        """
        input_sentence = raw_input()
        result = self.knp.parse(input_sentence.decode("utf8"))
        for bnst in result.bnst_list():
            sys.stdout.write("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
        sys.stdout.write("\n")
        return

    def Q69(self):
        u"""69. Read parsing output and print every bunsetsu containing a
        prefix morpheme.
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    # skip bunsetsu with no prefix (接頭辞) morpheme
                    if len(filter(lambda x: x.hinsi == u"接頭辞", bnst.mrph_list())) < 1:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return

    def Q70(self):
        u"""70. Read parsing output and print every bunsetsu containing
        two or more nouns.
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    # skip bunsetsu with fewer than two noun morphemes
                    if len(filter(lambda x: x.hinsi == u"名詞", bnst.mrph_list())) < 2:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return
# coding: utf-8 from pyknp import KNP sent = "先生は自転車で学校に行った。" knp = KNP() result = knp.parse(sent) # 文節 for bnst in result.bnst_list(): midasi = "".join(mrph.midasi for mrph in bnst.mrph_list()) print(bnst.bnst_id, midasi, bnst.dpndtype, bnst.parent_id, bnst.fstring) # タグ print("-----------------------------------") for tag in result.tag_list(): midasi = "".join(mrph.midasi for mrph in bnst.mrph_list()) print(tag.tag_id, midasi, tag.dpndtype, tag.parent_id, tag.fstring) # 形態素 print("-----------------------------------") for mrph in result.mrph_list(): midasi = "".join(mrph.midasi for mrph in bnst.mrph_list()) print( mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis,