Example 1
 def run(self):
     data = self.load()
     jumanpp = Juman()
     output = []
     for _, row in data.iterrows():
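         # convert half-width ASCII and digits to full-width (zenkaku) before Juman++ analysis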
         zenkaku = jaconv.h2z(row["sentence"], ascii=True, digit=True)
         splited = [
             mrph.midasi for mrph in jumanpp.analysis(zenkaku).mrph_list()
         ]
         if self.task_name == 'QA_B':
             qa_zenkaku = jaconv.h2z(
                 f"{row['target']}の{row['aspect']}は{row['sentiment']}",
                 ascii=True,
                 digit=True,
             )
         else:
             qa_zenkaku = " "
         qa_splited = [
             mrph.midasi
             for mrph in jumanpp.analysis(qa_zenkaku).mrph_list()
         ]
         output.append({
             "context": " ".join(splited),
             "qa": " ".join(qa_splited),
             "label": 1
         })
     self.dump(pd.DataFrame(output))
Example 2
def initialize(fword, tword, modelfn, start, debug):
    juman = Juman()
    # parse and check from_word
    ms_f = juman.analysis(fword).mrph_list()
    if len(ms_f) > 1:
        print(u'{} is parsed multiple words'.format(fword))
        exit(1)
    wm_f = ms_f[0]
    if not wm_f.repname:
        print(u'no repname with {}'.format(fword))
        exit(1)
    fword = wm_f.repname
    # parse and check to_word
    ms_t = juman.analysis(tword).mrph_list()
    if len(ms_t) > 1:
        print(u'{} is parsed multiple words'.format(tword))
        exit(1)
    wm_t = ms_t[0]
    if not wm_t.repname:
        print(u'no repname with {}'.format(tword))
        exit(1)
    tword = wm_t.repname
    # load and check model
    print(u'loading model...')
    if modelfn.split('.')[-1] == 'model':
        model = Word2Vec.load(modelfn)
    elif modelfn.split('.')[-1] == 'bin':
        model = Word2Vec.load_word2vec_format(modelfn, binary=True, unicode_errors='ignore')
    else:
        raise ValueError(u'unsupported model file: {}'.format(modelfn))
    if fword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(fword))
    elif tword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(tword))
    model.save('hs0.100m.500.5.18mgt100.model')

    t1 = time.clock() - start
    if debug:
        printtime(t1)

    print(u'constructing id2vocab map...')
    id2vocab = {}
    for i, v in enumerate(model.vocab):
        id2vocab[i] = v

    t2 = time.clock() - t1
    if debug:
        printtime(t2)

    print(u'constructing V...')
    V = []
    for v in model.vocab:
        V.append(model[v])
    V = np.vstack(V)

    t3 = time.clock() - t2
    if debug:
        printtime(t3)
    return fword, tword, model, V, id2vocab, t3
Example 3
class JumanTokenizer:
    def __init__(self):
        self.juman = Juman()

    def __call__(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
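A minimal usage sketch (hypothetical input; assumes Juman++ and pyknp are installed):

tokenizer = JumanTokenizer()
print(tokenizer("すもももももももものうち"))  # list of surface forms (midasi)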
Example 4
def analyzer():
    bc = BertClient(ip='bertserving', output_fmt='list')
    client = Elasticsearch('elasticsearch:9200')
    texts = []
    list_text = []
    jumanpp = Juman()
    query = request.args.get('q')
    result = jumanpp.analysis(query)
    for mrph in result.mrph_list():
        texts.append(mrph.midasi)
    list_text.append(" ".join(texts))
    query_vector = bc.encode(list_text, is_tokenized=False)[0]
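    # script_score query: rank documents matching source "tb" by cosine similarity between
    # the query vector and each document's question_vector (+1.0 keeps scores non-negative)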
    script_query = {
        "script_score": {
            "query": {
                "match": {
                    "source": "tb"
                }
            },
            "script": {
                "source":
                "cosineSimilarity(params.query_vector, doc['question_vector']) + 1.0",
                "params": {
                    "query_vector": query_vector
                }
            }
        }
    }

    response = client.search(index=INDEX_NAME,
                             body={
                                 "size": SEARCH_SIZE,
                                 "query": script_query
                             })
    return jsonify(response)
Example 5
def main(DATA_ROOT):
    text_files = Path(DATA_ROOT).glob('**/*.txt')
    for text_file in text_files:
        with open(text_file) as f:
            content = f.read()

        content = re.sub(r"=+(.*?)=+", r"\g<1>", content)
        content = re.sub(r"^\n", "", content, flags=re.MULTILINE)
        content = content.replace('<block>', '')
        content = content.replace('<math-element>', '')
        # In this case, 。 can be removed safely
        sentences = re.split(r"[。\n]", content)
        sentences = [line for line in sentences if len(line) != 0]
        sentences = [''.join(line.split()) for line in sentences]

        # Remove sentence which is not properly parsed
        val_sentences = []
        offsets = []

        juman = Juman()

        for sentence in tqdm(sentences):
            # Try to parse
            try:
                result = juman.analysis(sentence)

            except ValueError:
                # skip sentences that Juman++ cannot parse
                print(sentence)
                continue

            except Exception as e:
                raise e

            current = 0
            offset = [0 for _ in range(len(sentence))]

            for mrph in result.mrph_list():
                current = current + len(mrph.midasi)
                try:
                    offset[current - 1] = 1

                except IndexError as e:
                    print(sentence)
                    print(current)
                    for _mrph in result.mrph_list():
                        print(_mrph.midasi)
                    raise e

                except Exception as e:
                    raise e

            val_sentences.append(sentence)
            offsets.append(offset)

        results = (val_sentences, offsets)

        file_name = text_file.name[:-4] + '.pickle'
        dic = text_file.parent

        with open(Path(dic, file_name), 'wb') as f:
            pickle.dump(results, f)
Example 6
class SentimentAnalysis:
    def __init__(self, bert_model: str, fine_tuned_model: str, jumanpp_command: str):
        self.jumanpp = Juman(command=jumanpp_command)

        self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
        self.model = BertPosNegClassifier(bert_model)
        state_dict = torch.load(fine_tuned_model, map_location=torch.device('cpu'))
        self.model.load_state_dict({k.replace('module.', ''): v for k, v in state_dict.items()})
        self.model.eval()

    def get_prediction(self, sentence: str) -> int:
        print(sentence)
        text: str = self._segmentation(sentence)

        tokenized_text: List[str] = ['[CLS]'] + self.tokenizer.tokenize(text) + ['[SEP]']

        indexed_tokens: List[List[int]] = [self.tokenizer.convert_tokens_to_ids(tokenized_text)]  # batch of size 1

        tokens_tensor = torch.tensor(indexed_tokens)
        attention_mask_tensor = torch.tensor([[1] * len(tokenized_text)])
        # segments_tensors = torch.tensor([ [0] * len(indexed_tokens_list[0]) for _ in range(text_length)])

        output: torch.Tensor = self.model(tokens_tensor, attention_mask=attention_mask_tensor)
        prediction: int = torch.argmax(output[0]).item()  # 0 or 1

        if prediction == 0:
            prediction = -1

        return prediction

    def _segmentation(self, text: str) -> str:
        result = self.jumanpp.analysis(text)
        return ' '.join(mrph.midasi for mrph in result.mrph_list())
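A minimal usage sketch (the paths below are placeholders, not from the original project):

analyzer = SentimentAnalysis('path/to/bert_model', 'path/to/fine_tuned_model.bin', 'jumanpp')
print(analyzer.get_prediction('この映画はとても面白かった。'))  # returns -1 or 1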
Example 7
 def _apply_jumanpp(self, inp: str) -> Tuple[str, str]:
     jumanpp = Juman(command=self.juman, option=self.juman_option)
     jumanpp_result = jumanpp.analysis(inp)
     jumanpp_out = jumanpp_result.spec() + 'EOS\n'
     jumanpp_conll_out = self._jumanpp2conll_one_sentence(
         jumanpp_out) + 'EOS\n'
     return jumanpp_out, jumanpp_conll_out
Example 8
class JumanService(object):
    def __init__(self):
        self.__juman = Juman()

    def analysis(self, string):
        formattedString = JumanKnpUtil.format_input_string(string)
        return self.__juman.analysis(formattedString)
Example 9
class JumanTokenizer():
    _trans_tables = str.maketrans({"\"": "", "@": "@", "#": "#"})

    def __init__(self, ):
        self.juman = Juman()

    def _preprocess(self, sentences):
        return sentences.replace(" ",
                                 "").replace("\n",
                                             "").translate(self._trans_tables)

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]

    def _preprocess_list(self, datas):
        return [[x, self._preprocess(x)] for x in datas]

    def _tokenize_for_multi(self, datas):
        try:
            return [datas[0], self.tokenize(datas[1])]
        except Exception:
            return []

    def tokenize_multi(self, datas, thread=cpu_count()):
        datas = self._preprocess_list(datas)
        num_of_datas = len(datas)

        with Pool(thread) as pool:
            imap = pool.imap_unordered(self._tokenize_for_multi, datas)
            result = list(tqdm(imap, total=num_of_datas))
        return result
Example 10
class JumanTokenizer():
    def __init__(self):
        self.juman = Juman(jumanpp=True)

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
Example 11
class JumanTokenizer:
    def __init__(self, command, options):
        self.juman = Juman(command, options)

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
Example 12
class Tokenizer:
    def __init__(self):
        self.jumanpp = Juman()

        self.replace_map = {'"': '’', '#': '‘'}
        self.rev_map = {s: t for t, s in self.replace_map.items()}

    def tokenize(self, text):
        sequence = []

        for line in text.split('\n'):
            for sentence in line.split(' '):
                for tgt_symbol, sub_symbol in self.replace_map.items():
                    sentence = sentence.replace(tgt_symbol, sub_symbol)

                result = self.jumanpp.analysis(sentence)
                for mrph in result.mrph_list():
                    midasi = mrph.midasi
                    if midasi in self.rev_map:
                        midasi = self.rev_map[midasi]
                    sequence.append(midasi)

                sequence.append(' ')
            del sequence[-1]

            sequence.append('\n')
        del sequence[-1]

        return sequence
Example 13
    def string_word_point(self, df):
        jumanpp = Juman(jumanpp=False)
        tmp_word =[]
        df_time_word = pd.DataFrame(index=[], columns=['time','word'])           # df of words and their times
        df_word_point = pd.DataFrame(index=[], columns=['word','point'])         # df of words and their occurrence counts
        df_time_point = pd.DataFrame(index=[], columns=['time','point'])         # df of times and the comment count at each time
        df_time_www_point = pd.DataFrame(index=[], columns=['time','point'])     # df of times and the "www" count at each time
        df_time_hakusyu_point = pd.DataFrame(index=[], columns=['time','point']) # df of times and the applause count at each time
        df_URL_point = pd.DataFrame(index=[], columns=['URL','point'])           # df collecting URLs
        
        #print(df_word_point)
        for i in range(len(df)):
            # if the comment is a URL, add it
            url=URL_hanbetu(df['comment'][i])
            if url != False:
                tmp = self.my_index(df_URL_point['URL'],url)
                df_URL_point = self.make_df_append(df_URL_point,tmp,url)

            #print("記号削除前")
            #print(df_word_point)
            #記号削除中
            print(df['comment'][i])
            df['comment'][i] = self.my_delete(df['comment'][i])
            # h:m:s -> hms に変更
            tmp_time = self.strtime_to_inttime(df['time'][i])
                    
            #時間ごとのコメント数計算
            tmp = self.my_index(df_time_point['time'],tmp_time)
            df_time_point = self.make_df_append(df_time_point,tmp,tmp_time)
            #wwwがあったら1追加なかったら0追加
            print(url)
            if False != self.www_hanbetu(df['comment'][i]) and url == False:
                df_time_www_point = self.make_df_append(df_time_www_point,tmp,tmp_time)
            else:
                if False == tmp :
                    df_time_www_point = df_time_www_point.append({'time': tmp_time, 'point': 0}, ignore_index=True)
            # add 1 if the comment contains applause, otherwise add 0
            if False != self.hakusyu_hanbetu(df['comment'][i]):
                df_time_hakusyu_point = self.make_df_append(df_time_hakusyu_point,tmp,tmp_time)
            else:
                if False == tmp :
                    df_time_hakusyu_point = df_time_hakusyu_point.append({'time': tmp_time, 'point': 0}, ignore_index=True)

                # morphological analysis with Juman++
                result = jumanpp.analysis(df['comment'][i])
                #print(result)
                # build dfs from the analysis result
                for token in result.mrph_list():
                    tmp_word = token.midasi   
                    # count noun occurrences
                    if 0 != self.word_Classification(token.hinsi):
                        # if it is a noun
                        if self.word_Classification(token.hinsi) == '名詞':
                            tmp = self.my_index(df_word_point['word'],tmp_word)
                            df_word_point = self.make_df_append(df_word_point,tmp,tmp_word)
                            # record the noun together with its time
                            df_time_word = df_time_word.append({'time':tmp_time,'word': tmp_word}, ignore_index=True)

        return df_time_word,df_word_point,df_time_point,df_time_www_point, df_time_hakusyu_point,df_URL_point
Example 14
 def morphological_analysis(self, text):
     jumanpp = Juman()
     ret = []
     text = self.remove_special_character(text)
     result = jumanpp.analysis(text)  # the analysis splits the text into space-separable words
     for mrph in result.mrph_list():
         ret += self.modification(mrph.midasi)
     return ret
Example 15
class JumanTokenizer:
    def __init__(self):
        self.juman = Juman(command=config['Juman']['command'],
                           option=config['Juman']['option'])

    def __call__(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
Example 16
class JumanTokenizer():
    def __init__(self):
        self.juman = Juman()

    def tokenize(self, text):
        # Use Juman to split the Japanese text into words (wakati-gaki).
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
Example 17
class JumanTokenize(object):
    """Runs JumanTokenizer."""
    def __init__(self):
        self.juman = Juman()

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
Example 18
def counter(text, d):
    jumanapp = Juman()
    result = jumanapp.analysis(text)
    for mrph in result.mrph_list():
        if mrph.genkei in d:
            d[mrph.genkei] = d[mrph.genkei] + 1
        else:
            d[mrph.genkei] = 1
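A minimal usage sketch (hypothetical input):

d = {}
counter("すもももももももものうち", d)
print(d)  # maps each base form (genkei) to its frequency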
Example 19
    def test_juman_wrapper(self):
        juman = Juman()
        result = juman.analysis(u"これはペンです。")
        print(','.join(mrph.midasi for mrph in result))

        for mrph in result.mrph_list():
            assert isinstance(mrph, pyknp.Morpheme)
            print(u"見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s" \
                  % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
Example 20
def juman_list(text):
    jumanpp = Juman()
    result = jumanpp.analysis(text)
    # replace all alphabetic tokens with the string "En"
    wakati = [
        mrph.genkei if mrph.bunrui != "アルファベット" else "En"
        for mrph in result.mrph_list()
    ]
    return ",".join(wakati)
Example 22
def juman_test():
    juman = Juman()
    print dir(juman)
    text = "テストテキスト"
    utext = unicode("".join(text.split()))
    print( u'"'+utext+u'"' )
    juman_result = juman.analysis( utext )
    for mrph in juman_result.mrph_list():
        print( '> ' + mrph.midasi + ' : ' + mrph.yomi + ' : ' + mrph.genkei )
Example 23
class JumanTokenizer():
    def __init__(self):
        self.juman = Juman()

    def tokenize(self, text):
        #pdb.set_trace()
        result = self.juman.analysis(text)
        #pdb.set_trace()
        return [mrph.midasi for mrph in result.mrph_list()]
Example 24
class JumanTokenizer:
    def __init__(self):
        self.juman = Juman()

    def tokenize(self, text):
        # convert half-width ASCII characters to their full-width forms
        text = text.translate(str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)}))
        text = re.sub(r'\s', ' ', text)
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
Example 25
def get_repname_using_jumanpp(genkei: str, pos: str) -> str:
    if pos == '助詞':
        return f'{genkei}/{genkei}'

    juman = Juman(option='-s 1')
    mrphs = juman.analysis(genkei, juman_format=JUMAN_FORMAT.LATTICE_TOP_ONE)
    # check that the morphological analysis did not go wrong (i.e., the word is a single morpheme)
    if len(mrphs) == 1:
        return mrphs[0].repname

    return f'{genkei}/{genkei}'
Example 26
    def test_juman_wrapper(self):
        try:
            juman = Juman(command=self.path_to_juman_command)
            result = juman.analysis("これはペンです。")
            logger.debug(','.join(mrph.midasi for mrph in result))

            for mrph in result.mrph_list():
                assert isinstance(mrph, pyknp.Morpheme)
                logger.debug("見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s" \
                      % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
        except ImportError:
            print('skip test_juman_wrapper')
Example 27
class JumanTokenizer:
    def __init__(self):
        self.juman = Juman()

    def parse(self, sentence: str, out=()) -> list:
        # avoid a shared mutable default argument; work on a copy of the excluded POS tags
        out_list = list(out)

        # In Juman, punctuation and the like are classified as "特殊"
        if "記号" in out_list:
            out_list.append("特殊")

        return [m.midasi for m in self.juman.analysis(sentence).mrph_list() 
                if m.hinsi not in out_list]
Example 28
def seg2word(seg):
    len_split = 1000
    # seg = seg_in.replace(' ', '\u3000')
    # seg = seg_in.replace(' ', ' ')
    len_seg = len(seg)
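    # split the text into 1000-character chunks (presumably to keep each Juman++ call within its input-length limit)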
    seg_splits = [seg[i:i + len_split] for i in range(0, len_seg, len_split)]

    juman_def = Juman(command="/mnt/gold/users/s18153/bin/jumanpp")
    return ' '.join([
        " ".join(
            [mrph.midasi for mrph in juman_def.analysis(seg_part).mrph_list()])
        for seg_part in seg_splits
    ])
Example 29
def bulk_predict(docs, batch_size=256):
    """Predict bert embeddings."""
    jumanpp = Juman(jumanpp=False)
    for i in range(0, len(docs), batch_size):
        batch_docs = docs[i: i+batch_size]
        pre_embedding_docs = []
        for doc in batch_docs:
            for k in range(0, len(doc['question']), MAX_TXT_LENGTH):
                result = jumanpp.analysis(doc['question'][k:k+MAX_TXT_LENGTH])
                texts = [mrph.midasi for mrph in result.mrph_list()]
                pre_embedding_docs.append(" ".join(texts))
        embeddings = bc.encode(pre_embedding_docs, is_tokenized=True)
        for emb in embeddings:
            yield emb
Example 30
def juman_wakati(text, hinshi=(), DEBUG=False, STEM_FLAG=False):
    juman = Juman()
    output = ""
    # wakati
    result = juman.analysis(text)
    for mrph in result.mrph_list():
        if STEM_FLAG and mrph.hinsi in hinshi:
            output += mrph.repname.split("/")[0] + " "
        if DEBUG:
            print("stem:", mrph.repname)
            print("midashi:", mrph.repname)
            print("hinsi:", mrph.hinsi)
            print("yomi:", mrph.yomi)
    return output.strip()
Example 31
def jumanpp():
    if request.method in ['POST'] and \
            request.headers['Content-Type'] == 'application/json':
        sentence = request.get_json()['sentence']
        juman = Juman(jumanpp=True)
        result = juman.analysis(sentence)
        words = []
        info = []
        for morph in result.mrph_list():
            words.append(morph.midasi)
            info.append('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                morph.hinsi, morph.bunrui, morph.katuyou1, morph.katuyou2,
                morph.yomi, morph.genkei, morph.repname, morph.imis))
        response = {'words': words, 'info': info}
        return jsonify(response)
    return jsonify({})
Example 32
def title_clean(title_ls):
    tmp_ls = copy.deepcopy([title_ls])
    tmp_ls = [t for t in tmp_ls if t is not None]  # drop None entries
    for i in range(len(tmp_ls)):        
        tmp_ls[i] = normalize('NFKC', tmp_ls[i])
        tmp_ls[i] = tmp_ls[i].replace(' ', '')
        tmp_ls[i] = re.sub(r'−.+?$', '', tmp_ls[i])
        tmp_ls[i] = re.sub(r'ーY.+?$', '', tmp_ls[i])
        tmp_ls[i] = re.sub(r'\|.+?$', '', tmp_ls[i])
    jumanpp = Juman()
    sep_ls = []
    for tmp in tmp_ls: 
        sep_ls.append(' '.join([mrph.midasi for mrph in jumanpp.analysis(tmp)]))
    return sep_ls[0]
Example 33
def read_home_timeline( session ):
    print( '[kazuha] - read timeline.' )
    juman = Juman()
    req = session.get( twitter.API_home_timeline, params = {} )
    if req.status_code == 200:
        timeline = json.loads( req.text )
        for tweet in timeline:
            u_tweet_text = unicode( "".join(tweet["text"].split()) )
            print( u'[kazuha] - read timeline: '+ u_tweet_text )
            juman_result = juman.analysis( u_tweet_text )
            for mrph in juman_result.mrph_list():
                print u"%s - (%s, %s)" % (mrph.genkei, mrph.hinsi, mrph.bunrui)
            #end for
        #end for
    else:
        print( '[kazuha] - read timeline: failure.' )
Example 34
def word_distance(s1, s2):
    juman = Juman()
    r = len(s1 + s2) - len(re.sub("[a-zA-Z0-9]", "", s1 + s2))
    if r > len((s1 + s2).replace(" ", "")) // 2:
        return word_distance_en(s1, s2)
    sss = [
            set(
                [item.midasi for item in juman.analysis(ss).mrph_list() \
                        if item.hinsi in {'名詞', '動詞', '形容詞', '指示詞'}\
                        or '内容語' in item.imis
                        ]
            ) for ss in [s1, s2]
            ]
    if min(len(sss[0]), len(sss[1])) == 0:
        return 0
    return float(len(sss[0] & sss[1])) / min(len(sss[0]), len(sss[1]))
Example 35
class Solver(object):
    def __init__(self):
        self.juman = Juman()
        self.knp = KNP()

    def Q61(self):
        u"""61. 文を標準入力から読み込み、それを単語単位に分かち書きせよ (形態素間にスペースを挿入)
        """

        input_sentence = raw_input()
        result = self.juman.analysis(input_sentence.decode("utf8"))
        for mrph in result.mrph_list():
            sys.stdout.write("{} ".format(mrph.midasi.encode("utf8")))
        sys.stdout.write("\n")
        return

    def Q62(self):
        u"""62. 形態素解析結果を読み込み、名詞だけを抽出してプリントせよ

        ヒント: mrph.hinsi が u"名詞" という文字列と一致するかどうかを判定
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once the sentence is complete
                result = self.juman.result(data)
                s = ",".join(mrph.midasi for mrph in result.mrph_list() if mrph.hinsi == u"名詞")  # show only nouns
                if len(s) > 0:
                    print(s)
                data = u""

    def Q63(self):
        u"""62. 形態素解析結果を読み込み、名詞だけを抽出してプリントせよ

        ヒント: mrph.hinsi が u"名詞" という文字列と一致するかどうかを判定
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once the sentence is complete
                result = self.juman.result(data)
                s = ",".join(mrph.genkei for mrph in result.mrph_list() if mrph.hinsi == u"動詞")  # show only verbs
                if len(s) > 0:
                    print(s)
                data = u""

    def Q64(self):
        u"""64. 形態素解析結果を読み込み、形態素の原形を頻度順に並べよ

        ヒント: ディクショナリ、sorted 関数を使う
        """
        data = u""
        hist = {}
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once the sentence is complete
                result = self.juman.result(data)
                for mrph in result.mrph_list():
                    try:
                        hist[mrph.genkei] += 1
                    except KeyError:
                        hist[mrph.genkei] = 1
                data = u""
        for key, val in sorted(hist.items(), key=lambda t: t[1], reverse=True):
            print("{},{}".format(key.encode("utf8"), val))

    def Q65(self):
        u"""65. 形態素解析結果を読み込み、全形態素数 (総数) に対する述語の割合を計算せよ

        ここで、述語とは、動詞、イ形容詞 (形容詞)、ナ形容詞 (形容動詞) とする
        """

        data = u""
        num = 0
        denom = 0
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once the sentence is complete
                result = self.juman.result(data)
                if verbose:
                    logger.info("denom: {}".format(denom))
                for mrph in result.mrph_list():
                    denom += 1
                    if mrph.hinsi == u"動詞":
                        num += 1
                        continue
                    if mrph.hinsi == u"形容詞" and mrph.bunrui.startswith(u"イ形容詞"):
                        num += 1
                        continue
                    if mrph.hinsi == u"形容動詞" and mrph.bunrui.startswith(u"ナ形容詞"):
                        num += 1
                        continue
                data = u""

        print("{}/{}={}".format(num, denom, float(num) / denom))

    def Q66(self):
        u"""66. 形態素解析結果を読み込み、「サ変名詞+する/できる」というパターンを抽出しプリントせよ
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once the sentence is complete
                result = self.juman.result(data)
                buff = None
                for mrph in result.mrph_list():
                    if mrph.genkei == u"できる" or mrph.genkei == u"する":
                        if buff is not None:
                            extract.add((buff.genkei.encode("utf8"), mrph.genkei.encode("utf8")))

                    if mrph.bunrui == u"サ変名詞":
                        buff = mrph
                    else:
                        buff = None
                data = u""
        for t in extract:
            print("{}+{}".format(t[0], t[1]))

    def Q67(self):
        u"""67. 形態素解析結果を読み込み、「AのB」という表現 (A と B は名詞の1形態素) をすべてプリントせよ
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # parse once the sentence is complete
                result = self.juman.result(data)
                buff = []
                for mrph in result.mrph_list():
                    if mrph.genkei == u"の" and len(buff) == 1:
                        buff.append(u"の")
                        continue
                    if mrph.hinsi == u"名詞":
                        if len(buff) == 0:
                            buff.append(mrph.genkei)
                            continue
                        if len(buff) == 2:
                            extract.add((buff[0], mrph.genkei))
                    buff = []
                data = u""
        for t in extract:
            print("{}の{}".format(t[0].encode("utf8"), t[1].encode("utf8")))

    def Q68(self):
        u"""68. 文を標準入力から読み込み、それを文節単位に分かち書きせよ (文節間にスペースを挿入)
        """

        input_sentence = raw_input()
        result = self.knp.parse(input_sentence.decode("utf8"))
        for bnst in result.bnst_list():
            sys.stdout.write("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
        sys.stdout.write("\n")
        return

    def Q69(self):
        u"""69. 構文解析結果を読み込み、接頭辞を含む文節をプリントせよ
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len(filter(lambda x: x.hinsi == u"接頭辞", bnst.mrph_list())) < 1:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return

    def Q70(self):
        u"""70. 構文解析結果を読み込み、名詞を2つ以上含む文節をプリントせよ
        """

        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len(filter(lambda x: x.hinsi == u"名詞", bnst.mrph_list())) < 2:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)

        return
Example 36
# coding: utf-8

from pyknp import Juman
import sys
import codecs
 
juman = Juman()

input_file = "../data/sample.txt"
f = codecs.open(input_file, 'r', 'utf-8')
f_out = codecs.open(input_file + '_juman_result.txt','w', 'utf-8')
for line in f:
    result = juman.analysis(line[:-1].replace(" ", ""))
    #print ' '.join(mrph.midasi for mrph in result)
    f_out.write(' '.join(mrph.midasi for mrph in result) + '\n')