Example #1
 def __init__(
     self,
     dataset: PASDataset,
     logger: Logger,
     use_knp_overt: bool = True,
 ) -> None:
     self.examples: List[PasExample] = dataset.examples
     self.cases: List[str] = dataset.target_cases
     self.bridging: bool = dataset.bridging
     self.coreference: bool = dataset.coreference
     self.relations: List[str] = dataset.target_cases + (
         ['ノ'] * self.bridging) + (['='] * self.coreference)
     self.exophors: List[str] = dataset.target_exophors
     self.index_to_special: Dict[int, str] = {
         idx: token
         for token, idx in dataset.special_to_index.items()
     }
     self.documents: List[Document] = dataset.documents
     self.logger = logger
     self.use_knp_overt = use_knp_overt
     self.kc: bool = dataset.kc
     self.reader = dataset.reader
     cfg = configparser.ConfigParser()
     cfg.read(Path(__file__).parent.parent / 'analyzer' / 'config.ini')
     if 'default' not in cfg:
         logger.warning(
             'Analyzer config not found; falling back to default values.')
         cfg['default'] = {}
     section = cfg['default']
     knp_command = section.get('knp_command', shutil.which('knp'))
     jumanpp_command = section.get('juman_command', shutil.which('jumanpp'))
     self.knp = KNP(command=knp_command,
                    option='-tab -case2',
                    jumancommand=jumanpp_command)
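For reference, a minimal sketch of writing the analyzer/config.ini that this constructor reads (the section and key names come from the code above; the command paths are hypothetical placeholders for local installs):

import configparser
from pathlib import Path

cfg = configparser.ConfigParser()
cfg['default'] = {
    'knp_command': '/usr/local/bin/knp',        # hypothetical path
    'juman_command': '/usr/local/bin/jumanpp',  # hypothetical path
}
Path('analyzer').mkdir(exist_ok=True)
with (Path('analyzer') / 'config.ini').open('w') as f:
    cfg.write(f)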
Example #2
def generate_knowledge(sentence):
    # parse with KNP
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(sentence.replace(" ", ""))
    bnst_list = result.bnst_list()

    # dictionary of bunsetsu keyed by bnst_id
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)

    infos = []
    info = dict((x, None) for x in info_elements)
    for bnst in bnst_list:
        place = get_gimonshi(bnst, bnst_dic)

        if place is None:
            pass

        elif info[place] is None:
            info[place] = select_normalization_representative_notation(
                bnst.fstring)

        else:
            infos.append(info)
            info = dict((x, None) for x in info_elements)
            info[place] = select_normalization_representative_notation(
                bnst.fstring)

    return infos
Example #3
def select_dependency_structure(line):
    """Extract the dependency structure."""

    # KNP
    knp = KNP(option='-tab -anaphora')

    # parse
    result = knp.parse(line)

    # bunsetsu list
    bnst_list = result.bnst_list()

    # index the bunsetsu list by bnst_id
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)

    tuples = []
    for bnst in bnst_list:
        if bnst.parent_id != -1:
            # (from, to)
            print("bnst_id:{} parent_id:{}\n".format(bnst.bnst_id,
                                                     bnst.parent_id))
            tuples.append(
                (select_normalization_representative_notation(bnst.fstring),
                 select_normalization_representative_notation(
                     bnst_dic[bnst.parent_id].fstring)))

    return tuples
Example #4
File: ishi.py Project: ku-nlp/ishi
    def __init__(self):
        self._knp = KNP()

        self._valid_nominative_strings = \
            self._load_file('valid_nominative_strings.txt')
        self._valid_nominative_semantic_markers = \
            self._load_file('valid_nominative_semantic_markers.txt')
        self._volition_modalities = \
            self._load_file('volition_modalities.txt')
        self._volition_voices = \
            self._load_file('volition_voices.txt')
        self._non_volition_voices = \
            self._load_file('non_volition_voices.txt')
        self._volition_adverb_repnames = \
            self._load_file('volition_adverb_repnames.txt')
        self._non_volition_adverb_repnames = \
            self._load_file('non_volition_adverb_repnames.txt')
        self._valid_adjective_predicate_suffix_repnames = \
            self._load_file('valid_adjective_predicate_suffix_repnames.txt')
        self._non_volition_verbal_suffix_semantic_labels = \
            self._load_file('non_volition_verbal_suffix_semantic_labels.txt')
        self._non_volition_verbal_suffix_repnames = \
            self._load_file('non_volition_verbal_suffix_repnames.txt')
        self._non_volition_types = \
            self._load_file('non_volition_types.txt')
        self._non_volition_head_repnames = \
            self._load_file('non_volition_head_repnames.txt')
        self._non_volition_semantic_labels = \
            self._load_file('non_volition_semantic_labels.txt')
Example #5
 def __init__(
     self,
     knp: Optional[KNP] = None,
     jumanpp: bool = True,
     fallback_juman: bool = True,
 ):
     self.knp = KNP(jumanpp=jumanpp) if knp is None else knp
     self.juman = self.knp.juman
     self.knp.parse("。")  # warm-up parse to make self.knp create its socket / subprocess
     self.fallback_juman = fallback_juman
Example #6
from typing import Tuple

from pyknp import KNP


def tag(text: str) -> Tuple[list, list]:
    '''
    return tag_ids: [(child basic-phrase ID, parent basic-phrase ID), ...]
    '''
    knp = KNP()
    tag_list = knp.parse(text).tag_list()
    tag_ids = []
    for tag in tag_list:  # iterate over basic phrases (tags)
        if tag.parent_id != -1:
            tag_ids.append((tag.tag_id, tag.parent_id))
    return tag_list, tag_ids
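A minimal usage sketch for tag() above (assumes pyknp with local KNP/Juman++ installs; the sample sentence is arbitrary):

if __name__ == '__main__':
    tag_list, tag_ids = tag('太郎が本を読んだ。')
    for child, parent in tag_ids:
        print(child, '->', parent)  # child basic phrase depends on parent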
Example #7
def total_chunk2(text):
    from pyknp import KNP
    knp = KNP()
    result = knp.parse(text)
    num = 0
    for bnst in result.bnst_list():
        if not "".join(mrph.midasi for mrph in bnst.mrph_list()) == "None":
            print("".join(mrph.midasi for mrph in bnst.mrph_list()))
            num += 1

    return num
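A quick usage sketch (assumes a local KNP install; the sample sentence is arbitrary):

print(total_chunk2('太郎が本を読んだ。'))  # prints each bunsetsu, then the count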
Example #8
    def __init__(self):
        '''initialize

        Examples
        --------
        >>> nlp = JNLP()

        '''

        self.juman = Juman()
        self.KNP = KNP(option='-tab -anaphora')
Example #9
import re
from typing import Tuple

from pyknp import KNP


def tag(text: str) -> Tuple[list, list]:
    '''
    return tag_ids: [(child basic-phrase ID, parent basic-phrase ID), ...]
    '''
    knp = KNP()
    tag_list = knp.parse(text).tag_list()
    tag_ids = []
    for tag in tag_list:  # iterate over basic phrases (tags)
        if re.search('<格関係', tag.fstring):
            for i in re.findall(r'格関係\d', tag.fstring):
                tag_ids.append((int(re.sub(r'格関係', '', i)), tag.tag_id))
    return tag_list, tag_ids
Example #10
def get_u_gimonshi(sentence):
    line = sentence.replace(" ", "")
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(line)
    bnst_list = result.bnst_list()
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)

    u_gimonshi = ""
    for bnst in bnst_list:
        place = question(bnst, bnst_dic)
        if place is not None:
            u_gimonshi = place

    #print(u_gimonshi)
    return u_gimonshi
Example #11
def main():
    knp = KNP(jumanpp=True, option='-tab -assignf')

    parser = argparse.ArgumentParser()
    parser.add_argument("--npyfile", "-m")
    parser.add_argument("--vocabfile", "-v")
    parser.add_argument("--topk", "-k", type=int, default=5)
    parser.add_argument("--query", "-q", type=str, default='')
    parser.add_argument("--cnpyfile", "-c", type=str, default='')
    parser.add_argument("--cvocabfile", "-u", type=str, default='')
    args = parser.parse_args()

    npyfile = args.npyfile
    vocabfile = args.vocabfile
    topk = args.topk
    query = args.query
    cnpyfile = args.cnpyfile
    cvocabfile = args.cvocabfile

    w2vec = load_model(npyfile, vocabfile)
    c2vec = {}
    cvocabs = []
    if cnpyfile and cvocabfile:
        c2vec = load_model(cnpyfile, cvocabfile)

    if query:
        parse_and_print(query, knp, w2vec, topk, c2vec)
        return
    while True:
        q = input()
        parse_and_print(q, knp, w2vec, topk, c2vec)
Example #12
def main():
    knp = KNP(jumanpp=True, option='-tab')
    knpfile = sys.argv[1]  # '../dataset/mountains_ja.knp.gz'
    vocabfile = sys.argv[2]  # '../dataset/mountains_ja.vocab.gz'
    assert 'gz' in knpfile
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp,\
         gzip.open(vocabfile, mode='wt', encoding='utf8') as ofp:
        write_vocab(knp, ifp, ofp)
Example #13
class KnpService(object):
    MARK_EOS = "EOS"

    def __init__(self):
        self.__knp = KNP()

    def parse(self, string):
        formatted_string = JumanKnpUtil.format_input_string(string)
        return self.__knp.parse(formatted_string)

    def parse_all(self, strings):
        return [self.parse(string) for string in strings.split("\n")]

    def result(self, string_iterator):
        results = []
        data = ""
        for line in string_iterator:
            data += line
            if line.strip() == KnpService.MARK_EOS:
                results.append(self.__knp.result(data))
                data = ""
        return results

    def load_with_handler(self, string_iterator, handler):
        data = ""
        for line in string_iterator:
            data += line
            if line.strip() == KnpService.MARK_EOS:
                # stopgap: skip chunks where case analysis failed (e.g. broken
                # output or an overly long sentence) and only dependency
                # parsing ran
                if not JumanKnpUtil.is_match_partly(r"\n\* \d+ ", data) \
                        and not JumanKnpUtil.is_match_partly(r"Fell back to", data):
                    handler(self.__knp.result(data))
                data = ""
        return

    def load_from_file_with_handler(self, filepath, handler):
        with open(filepath, "r") as f:
            results = self.load_with_handler(iter(f.readline, ""), handler)
        return results

    def load_from_file(self, filepath):
        with open(filepath, "r") as f:
            results = self.result(iter(f.readline, ""))
        return results
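A minimal usage sketch for KnpService (assumes JumanKnpUtil and a local KNP install are available; the sample sentence is arbitrary):

service = KnpService()
blist = service.parse('太郎が本を読んだ。')
for bnst in blist.bnst_list():
    print(''.join(mrph.midasi for mrph in bnst.mrph_list()))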
Example #14
    def total_chunk(text):
        from pyknp import KNP
        knp = KNP()

        sentences = []
        num = 0
        for stc in text.split("。"):
            if not stc == "":
                try:
                    result = knp.parse(stc)
                    for bnst in result.bnst_list():
                        if not "".join(mrph.midasi
                                       for mrph in bnst.mrph_list()) == "None":
                            num += 1
                except Exception:
                    pass

        return num
Example #15
def load_knp_from_stream(f, juman_format=JUMAN_FORMAT.DEFAULT):
    """
    Parse a KNP-format analysis-result file and yield bunsetsu-list objects

    Args:
        f (file): file object containing KNP-format analysis results
        juman_format (JUMAN_FORMAT): Juman lattice output format

    Yields:
        BList: bunsetsu-list object
    """
    knp = KNP()
    buf = ""
    for line in f:
        buf += line
        if line.startswith("EOS"):
            yield knp.result(buf, juman_format=juman_format)
            buf = ""
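A minimal usage sketch for the generator above (result.knp is a hypothetical file of saved "knp -tab" output):

with open('result.knp', encoding='utf-8') as f:
    for blist in load_knp_from_stream(f):
        print(len(blist.bnst_list()), 'bunsetsu')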
Example #16
def main():
    knp = KNP(jumanpp=True, option='-tab')
    knpfile = sys.argv[1]  # '../dataset/wikipedia.knp.gz'
    wakatifile = sys.argv[2]  # '../dataset/wikipedia.deps'

    assert 'gz' in knpfile
    assert 'gz' in wakatifile
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp,\
         gzip.open(wakatifile, mode='wt', encoding='utf8') as ofp:
        write_wakati(knp, ifp, ofp, WORDFORM.LONGEST)
Example #17
def read_knp_result_file(filename: str) -> List[BList]:
    """Read a KNP result file.

    Args:
        filename: A filename.

    Returns:
        A list of :class:`pyknp.knp.blist.BList` objects.
    """
    knp = KNP()
    blists = []
    with open(filename, "rt", encoding="utf-8", errors="replace") as f:
        chunk = ""
        for line in f:
            chunk += line
            if line.strip() == "EOS":
                blists.append(knp.result(chunk))
                chunk = ""
    return blists
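A minimal usage sketch (sample.knp is a hypothetical file of saved "knp -tab" output):

blists = read_knp_result_file('sample.knp')
print('read', len(blists), 'sentences')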
Example #18
    def get_context_words(self, sentence_size_limit=100):
        knp = KNP()
        knp_extractor = KNP_extractor(self.config.knp_index_db,
                                      self.config.knp_parent_dir,
                                      self.config.knp_sub_index_length)
        context_words = Counter()
        for index, sent_tuple in enumerate(self.sents[:sentence_size_limit]):
            sid = sent_tuple.sid.split('%')[0]
            sup_knp = knp_extractor.get_knp(sid)
            if not sup_knp:
                sys.stderr.write("fail to convert knp of %s.\n" % sid)
                continue

            try:
                result = knp.result(sup_knp.decode('utf-8'))
                context_words.update(self._get_sentence_args(result))
            except Exception:
                sys.stderr.write("fail to convert knp of %s.\n" % sid)

        context_words = dict(context_words)
        return context_words
Example #19
def init():
    r = request
    print(r.data)
    global knp
    knp = KNP()
    global queue
    queue = deque()

    result = {"text": "init done"}
    return jsonify(ResultSet=result)
Example #20
def generate_utterance(u_sen, all_infos):
    # determine what is being asked
    question = get_u_gimonshi(u_sen)

    # find where the answer to the question is
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(u_sen.replace(" ", ""))
    bnst_list = result.bnst_list()
    search_words = []
    for bnst in bnst_list:
        search_words.append(
            create_infos.select_normalization_representative_notation(
                bnst.fstring))

    search_point = -1
    for search_word in search_words:
        for i, info in enumerate(all_infos):
            if search_word in info.values():
                search_point = i

    answer = ""
    if search_point == -1:
        answer = "No information"

    else:
        if all_infos[search_point][question] is not None:  # the answer is at this entry
            answer = all_infos[search_point][question] + "です"
        else:  # otherwise look for the answer in the neighboring entries
            if search_point == 0:
                if search_point + 1 < len(all_infos) \
                        and all_infos[search_point + 1][question] is not None:
                    answer = all_infos[search_point + 1][question] + "です"
                else:
                    answer = "No information"
            else:
                if all_infos[search_point - 1][question] is not None:
                    answer = all_infos[search_point - 1][question] + "です"
                elif search_point + 1 < len(all_infos) \
                        and all_infos[search_point + 1][question] is not None:
                    answer = all_infos[search_point + 1][question] + "です"
                else:
                    answer = "No information"

    return answer
Example #21
 def __init__(self, text: str, delimiter: str = '\n'):
     self.text = text
     self.delimiter = delimiter
     self.sentences = util.split_text(self.text, delimiter)
     self.n_sentences = len(self.sentences)
     self.knp = KNP(option=DefaultOptions.KNP, jumanpp=False)
     self.trees = self._trees()
     self.juman = Juman(jumanpp=False)
     self.rs_pos = self.calc_rs_pos()
     self.n_mrphs = self.calc_n_mrphs()
     self.n_chunks = self.calc_n_chunks()
     self.n_types = self.calc_n_types()
     self.mean_n_mrphs = None \
         if self.n_sentences == 0 \
         else self.n_mrphs / self.n_sentences
     self.rs_modality = self.calc_rs_modality()
     self.r_conditional = None \
         if self.n_sentences == 0 \
         else self.calc_n_conditionals() / self.n_sentences
     self.mean_tree_depths = self.calc_mean_tree_depths()
Example #22
def extract_poems(lines: List[str], jobs: int) -> List[Tuple]:
    knp = KNP(jumanpp=True)
    chunk_size = len(lines) // jobs + 1
    arguments = [(lines[i:i + chunk_size], knp)
                 for i in range(0, len(lines), chunk_size)]
    with mp.Pool(jobs) as p:
        checked_chunks = p.starmap(_extract_poems, arguments)

    poems = []
    for chunk in checked_chunks:
        poems.extend(chunk)
    return poems
Example #23
def evg():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", "-o", default="", help="path to output")
    args = parser.parse_args()

    basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    knp = KNP()
    results = []
    chunk = ""
    for line in codecs.getreader("utf-8")(getattr(sys.stdin, "buffer",
                                                  sys.stdin)):
        chunk += line
        if line.strip() == "EOS":
            results.append(knp.result(chunk))
            chunk = ""
    evg_ = EventGraph.build(results)
    if args.output:
        evg_.save(args.output)
    else:
        print(json.dumps(evg_.to_dict(), indent=4, ensure_ascii=False))
Example #24
    def _apply_knp(self, sent: str) -> str:
        self.logger.info(f'parse sentence: {sent}')
        knp = KNP(command=self.knp, jumancommand=self.juman, option=self.knp_dpnd_option)
        knp_result = knp.parse(sent)

        if self.remote_knp is True:
            _, jumanpp_conll_out = self._apply_jumanpp(sent)
            clientsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self.logger.info(f'connect to {self.knp_host}:{self.knp_port}')
            clientsock.connect((self.knp_host, self.knp_port))
            clientsock.sendall(jumanpp_conll_out.encode('utf-8'))

            buf = []
            while True:
                data = clientsock.recv(8192)
                data_utf8 = data.decode('utf-8')
                buf.append(data_utf8)
                if data_utf8.endswith('EOS\n'):
                    break
            clientsock.close()
            conllu_out = ''.join(buf)
            self.logger.info(f'received {len(conllu_out)} chars from remote KNP')

            # modify the KNP result using the CoNLL-U output from the remote KNP
            head_ids, dpnd_types = self._read_conllu_from_buf(conllu_out)
            self._modify_knp(knp_result, head_ids, dpnd_types)

        # add predicate-argument structures by KNP
        knp = KNP(command=self.knp, jumancommand=self.juman, option=self.knp_case_option)
        knp_result_new = knp.parse_juman_result(knp_result.spec())
        return knp_result_new.spec()
Example #25
def test():
    # e.g.: echo "私は自然言語処理の研究をする" | juman | knp -tab -dpnd | python DependencyParser.py

    knp = KNP()
    data = ""

    for line in iter(sys.stdin.readline, ""):
        data += line
        if line.strip() == "EOS":
            result = knp.result(data)
            DB = parseDependency(result.bnst_list(), head=False)
            DBhead = parseDependency(result.bnst_list(), head=True)
            print("parent-child")
            # for bnstrep in DB:
            #     print(bnstrep)
            for bnstrep in DBhead:
                print(bnstrep)
            data = ""
Example #26
def select_dependency_structure(line):
    """Extract the dependency structure."""

    # KNP
    print("called select_dependency_structure()")
    knp = KNP(option='-tab -anaphora')

    # parse
    result = knp.parse(line)

    # bunsetsu list
    bnst_list = result.bnst_list()

    # index the bunsetsu list by bnst_id
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)

    tuples = []
    for bnst in bnst_list:
        if bnst.parent_id != -1:
            # (from, to)
            tuples.append(
                (select_normalization_representative_notation(bnst.fstring),
                 select_normalization_representative_notation(
                     bnst_dic[bnst.parent_id].fstring)))

    return tuples
Example #27
def main():
    knp = KNP(jumanpp=True, option='-tab')
    knpfile = sys.argv[1]  # './wiki_00.knp.gz'
    outdir = sys.argv[2]
    assert '.knp.gz' in knpfile
    bn = os.path.basename(knpfile)
    head = os.path.join(outdir, bn.split('.')[0])
    tffile_ab = head + '.ab.tf.gz'      # './wiki_00.ab.tf.gz'
    dffile_ab = head + '.ab.df.gz'      # './wiki_00.ab.df.gz'
    tffile_anob = head + '.anob.tf.gz'  # './wiki_00.anob.tf.gz'
    dffile_anob = head + '.anob.df.gz'  # './wiki_00.anob.df.gz'

    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp,\
        gzip.open(tffile_ab, mode='wt', encoding='utf8') as ofp_tf_ab,\
        gzip.open(dffile_ab, mode='wt', encoding='utf8') as ofp_df_ab,\
        gzip.open(tffile_anob, mode='wt', encoding='utf8') as ofp_tf_anob,\
        gzip.open(dffile_anob, mode='wt', encoding='utf8') as ofp_df_anob:
        parse_and_write(knp, ifp, ofp_tf_anob, ofp_tf_ab, ofp_df_anob, ofp_df_ab)
Example #28
    def __init__(
            self, word2vec_model: Word2VecModel,
            juman_command: str = 'jumanpp',
            specific_parts: Optional[List[str]] = None
    ) -> None:
        if specific_parts is None:
            specific_parts = ['普通名詞']

        if juman_command == 'juman':
            self.juman: Union[Juman, Jumanpp] = Juman()
        elif juman_command == 'jumanpp':
            self.juman: Union[Juman, Jumanpp] = Jumanpp()
        else:
            raise AttributeError(f'unsupported juman_command: {juman_command}')
        self.knp: KNP = KNP(jumancommand=juman_command)

        self.specific_parts: List[str] = specific_parts

        self.word2vec: Word2VecModel = word2vec_model
Example #29
def main():
    knp = KNP(jumanpp=True, option='-tab')
    knpfile = sys.argv[1]  # '../dataset/wikipedia.knp.gz'
    vocabfile = sys.argv[2]  # '../dataset/wikipedia.vocab'
    depsfile = sys.argv[3]  # '../dataset/wikipedia.deps'

    # frequency cutoff over the full vocabulary
    vocab_thre = 100
    assert 'gz' in vocabfile
    with gzip.open(vocabfile, mode='rt', encoding='utf8') as ifp:
        vocab = read_vocab(ifp, vocab_thre)

    # extract dependency pairs from a knp parsed file.
    # CoNLL: tokens = [(id,form, head,deprel)]
    assert 'gz' in knpfile
    assert 'gz' in depsfile
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp,\
         gzip.open(depsfile, mode='wt', encoding='utf8') as ofp:
        write_deps(knp, vocab, ifp, ofp)
Example #30
def reparse_knp(knp_file: Path, output_dir: Path, knp: KNP,
                keep_dep: bool) -> None:
    """Re-assign dependencies and other annotations"""
    blists: List[BList] = []
    with knp_file.open() as fin:
        buff = ''
        for line in fin:
            if line.startswith('+') or line.startswith('*'):
                if keep_dep is False:
                    buff += line[0] + '\n'  # ex) +
                else:
                    buff += ' '.join(line.split()[:2]) + '\n'  # ex) + 3D
            else:
                buff += line
            if line.strip() == 'EOS':
                blists.append(knp.reparse_knp_result(buff))
                buff = ''
    output_dir.joinpath(knp_file.name).write_text(''.join(blist.spec()
                                                          for blist in blists))
Example #31
def _extract_poems(chunk: List[str], knp: KNP) -> List[Tuple]:
    poems = []
    for line in chunk:
        if WHITE_LIST.fullmatch(line):
            try:
                parsed = knp.parse(line)
                phrases = [[(mrph.midasi, count_mora(mrph.yomi), mrph.hinsi,
                             mrph.bunrui, mrph.katuyou2)
                            for mrph in bnst.mrph_list()]
                           for bnst in parsed.bnst_list()]
            except ValueError:
                continue
            n = len(phrases)  # the number of phrases
            mora_counts = [
                cumsum(phrases[start:], n - start) for start in range(n)
            ]
            for index, mora_count in enumerate(mora_counts):
                if len(MORA_PATTERN - set(mora_count)) == 0:
                    poem = extract_poem(phrases, index, mora_count)
                    if criteria(poem):
                        poems.append((poem, line))
    return poems
Example #32
 def __init__(self):
     self.juman = Juman()
     self.knp = KNP()
Example #33
class Solver(object):
    def __init__(self):
        self.juman = Juman()
        self.knp = KNP()

    def Q61(self):
        """61. Read a sentence from standard input and segment it into words (insert spaces between morphemes)
        """

        input_sentence = input()
        result = self.juman.analysis(input_sentence)
        for mrph in result.mrph_list():
            sys.stdout.write("{} ".format(mrph.midasi))
        sys.stdout.write("\n")
        return

    def Q62(self):
        """62. Read morphological analysis results, then extract and print only the nouns

        Hint: check whether mrph.hinsi equals the string "名詞"
        """
        data = ""
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line
            if line.strip() == "EOS":  # analyze once the sentence is complete
                result = self.juman.result(data)
                s = ",".join(mrph.midasi for mrph in result.mrph_list() if mrph.hinsi == "名詞")  # nouns only
                if len(s) > 0:
                    print(s)
                data = ""

    def Q63(self):
        """63. Read morphological analysis results, then extract and print only the verbs (base forms)

        Hint: check whether mrph.hinsi equals the string "動詞"
        """
        data = ""
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line
            if line.strip() == "EOS":  # analyze once the sentence is complete
                result = self.juman.result(data)
                s = ",".join(mrph.genkei for mrph in result.mrph_list() if mrph.hinsi == "動詞")  # verbs only
                if len(s) > 0:
                    print(s)
                data = ""

    def Q64(self):
        """64. Read morphological analysis results and sort the morpheme base forms by frequency

        Hint: use a dictionary and the sorted function
        """
        data = ""
        hist = {}
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line
            if line.strip() == "EOS":  # analyze once the sentence is complete
                result = self.juman.result(data)
                for mrph in result.mrph_list():
                    try:
                        hist[mrph.genkei] += 1
                    except KeyError:
                        hist[mrph.genkei] = 1
                data = ""
        for key, val in sorted(hist.items(), key=lambda t: t[1], reverse=True):
            print("{},{}".format(key, val))

    def Q65(self):
        """65. Read morphological analysis results and compute the proportion of predicates among all morphemes

        Here, predicates are verbs (動詞), i-adjectives (形容詞), and na-adjectives (形容動詞)
        """

        data = ""
        num = 0
        denom = 0
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line
            if line.strip() == "EOS":  # analyze once the sentence is complete
                result = self.juman.result(data)
                if verbose:
                    logger.info("denom: {}".format(denom))
                for mrph in result.mrph_list():
                    denom += 1
                    if mrph.hinsi == "動詞":
                        num += 1
                        continue
                    if mrph.hinsi == "形容詞" and mrph.bunrui.startswith("イ形容詞"):
                        num += 1
                        continue
                    if mrph.hinsi == "形容動詞" and mrph.bunrui.startswith("ナ形容詞"):
                        num += 1
                        continue
                data = ""

        print("{}/{}={}".format(num, denom, num / denom))

    def Q66(self):
        """66. Read morphological analysis results, then extract and print the pattern "sahen noun + する/できる"
        """

        data = ""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line
            if line.strip() == "EOS":  # analyze once the sentence is complete
                result = self.juman.result(data)
                buff = None
                for mrph in result.mrph_list():
                    if mrph.genkei == "できる" or mrph.genkei == "する":
                        if buff is not None:
                            extract.add((buff.genkei, mrph.genkei))

                    if mrph.bunrui == "サ変名詞":
                        buff = mrph
                    else:
                        buff = None
                data = ""
        for t in extract:
            print("{}+{}".format(t[0], t[1]))

    def Q67(self):
        """67. Read morphological analysis results and print every expression "AのB" (A and B are single noun morphemes)
        """

        data = ""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line
            if line.strip() == "EOS":  # analyze once the sentence is complete
                result = self.juman.result(data)
                buff = []
                for mrph in result.mrph_list():
                    if mrph.genkei == "の" and len(buff) == 1:
                        buff.append("の")
                        continue
                    if mrph.hinsi == "名詞":
                        if len(buff) == 0:
                            buff.append(mrph.genkei)
                            continue
                        if len(buff) == 2:
                            extract.add((buff[0], mrph.genkei))
                    buff = []
                data = ""
        for t in extract:
            print("{}の{}".format(t[0], t[1]))

    def Q68(self):
        """68. Read a sentence from standard input and segment it into bunsetsu (insert spaces between bunsetsu)
        """

        input_sentence = input()
        result = self.knp.parse(input_sentence)
        for bnst in result.bnst_list():
            sys.stdout.write("{} ".format("".join(mrph.midasi for mrph in bnst.mrph_list())))
        sys.stdout.write("\n")
        return

    def Q69(self):
        """69. Read parsing results and print the bunsetsu that contain a prefix
        """

        data = ""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len([m for m in bnst.mrph_list() if m.hinsi == "接頭辞"]) < 1:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi for mrph in bnst.mrph_list())))
                data = ""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return

    def Q70(self):
        """70. Read parsing results and print the bunsetsu that contain two or more nouns
        """

        data = ""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len([m for m in bnst.mrph_list() if m.hinsi == "名詞"]) < 2:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi for mrph in bnst.mrph_list())))
                data = ""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)

        return
Example #34
# coding: utf-8
from pyknp import KNP

sent = "先生は自転車で学校に行った。"
knp = KNP()
result = knp.parse(sent)

# bunsetsu
for bnst in result.bnst_list():
    midasi = "".join(mrph.midasi for mrph in bnst.mrph_list())
    print(bnst.bnst_id, midasi, bnst.dpndtype, bnst.parent_id, bnst.fstring)

# tags (basic phrases)
print("-----------------------------------")
for tag in result.tag_list():
    midasi = "".join(mrph.midasi for mrph in tag.mrph_list())
    print(tag.tag_id, midasi, tag.dpndtype, tag.parent_id, tag.fstring)

# morphemes
print("-----------------------------------")
for mrph in result.mrph_list():
    print(
        mrph.midasi,
        mrph.yomi,
        mrph.genkei,
        mrph.hinsi,
        mrph.bunrui,
        mrph.katuyou1,
        mrph.katuyou2,
        mrph.imis,
    )