Example #1
import unittest

from janome.tokenizer import Tokenizer
from janome.tokenfilter import TokenCountFilter


class TestTokenCountFilter(unittest.TestCase):
    # Scaffolding so the fragment runs standalone; the original test
    # class provides a Tokenizer as self.t.
    def setUp(self):
        self.t = Tokenizer()

    def test_count_token_filter(self):
        # count surface forms (the default attribute)
        tf = TokenCountFilter()
        d = dict(tf.apply(self.t.tokenize('すもももももももものうち')))
        self.assertEqual(1, d['すもも'])
        self.assertEqual(2, d['もも'])
        self.assertEqual(2, d['も'])
        self.assertEqual(1, d['の'])
        self.assertEqual(1, d['うち'])

        # sort by frequency
        tf = TokenCountFilter(sorted=True)
        counts = list(
            map(lambda x: x[1], tf.apply(self.t.tokenize('すもももももももものうち'))))
        self.assertEqual([2, 2, 1, 1, 1], counts)

        # count base forms instead of surface forms
        tf = TokenCountFilter('base_form')
        d = dict(tf.apply(self.t.tokenize('CountFilterで簡単に単語数が数えられます')))
        self.assertEqual(1, d['CountFilter'])
        self.assertEqual(1, d['で'])
        self.assertEqual(1, d['簡単'])
        self.assertEqual(1, d['に'])
        self.assertEqual(1, d['単語'])
        self.assertEqual(1, d['数'])
        self.assertEqual(1, d['が'])
        self.assertEqual(1, d['数える'])
        self.assertEqual(1, d['られる'])
        self.assertEqual(1, d['ます'])

        # invalid attribute name raises at construction time
        with self.assertRaises(Exception):
            TokenCountFilter('foo')
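TokenCountFilter can also be driven directly, without a test harness; a minimal standalone sketch built from the same calls as the test above:

from janome.tokenizer import Tokenizer
from janome.tokenfilter import TokenCountFilter

t = Tokenizer()
tf = TokenCountFilter(sorted=True)
# apply() yields (word, count) pairs, most frequent first when sorted=True
for word, count in tf.apply(t.tokenize('すもももももももものうち')):
    print(word, count)  # counts per the test: も 2, もも 2, すもも 1, の 1, うち 1 (tie order may vary)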
Example #2
import re
from urllib.request import urlopen

from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import (
    CompoundNounFilter, POSKeepFilter, LowerCaseFilter, TokenCountFilter)

# STOP_WORD_URL (the URL of the Slothlib stop word list) is defined elsewhere
# in the original module.


def word_count_dict(df, pos=['名詞', '形容詞'], stop_words=None):
    # If stop_words is not given, fall back to the default (Slothlib) list.
    if not stop_words:
        f = urlopen(STOP_WORD_URL)
        stop_words = set(f.read().decode("utf-8").split('\r\n'))

    df_message = df[df['type'] == 'message']['message']
    messages = '\n'.join(list(df_message))
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),           # merge consecutive nouns into compound nouns
        POSKeepFilter(pos),             # keep only the requested parts of speech
        LowerCaseFilter(),              # lower-case ASCII tokens
        TokenCountFilter(sorted=True)   # emit (word, count) pairs, most frequent first
    ]
    analyzer = Analyzer(tokenizer=tokenizer, token_filters=token_filters)
    # Strip digits and ASCII punctuation before analyzing.
    pos_res = analyzer.analyze(re.sub(r'[\d!-/:-@[-`{-~]', '', messages))
    return {k: v for k, v in pos_res if k not in stop_words}
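A possible call site, assuming a pandas DataFrame of chat messages with the 'type' and 'message' columns the function body expects; passing stop_words explicitly avoids fetching the Slothlib list over the network:

import pandas as pd

df_talks = pd.DataFrame({
    'type': ['message', 'message', 'call'],
    'message': ['今日は良い天気ですね', '天気が良いので散歩します', ''],
})
counts = word_count_dict(df_talks, pos=['名詞'], stop_words={'の', 'こと'})
print(counts)  # e.g. {'天気': 2, '今日': 1, ...}; exact keys depend on the dictionary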
Example #3
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import POSKeepFilter, TokenCountFilter

t = Tokenizer()
s = '私は本を読むのが好きです'  # sample input; any Japanese text works

dousi_list = []  # verbs
josi_list = []   # particles
meisi_list = []  # nouns

print("=============== extract verbs only =============")
for token in t.tokenize(s):
    if token.part_of_speech.split(",")[0] == '動詞':  # 動詞 = verb
        print("verb :", token.surface)
        dousi_list.append(token.surface)

print("=============== extract particles only =============")
for token in t.tokenize(s):
    if token.part_of_speech.split(",")[0] == '助詞':  # 助詞 = particle
        print("particle :", token.surface)
        josi_list.append(token.surface)

print("=============== extract nouns only =============")
for token in t.tokenize(s):
    if token.part_of_speech.split(",")[0] == '名詞':  # 名詞 = noun
        meisi_list.append(token.surface)

print("=========== morpheme list ================")
print(list(t.tokenize(s, wakati=True)))  # surface forms only

# Count noun occurrences.
a = Analyzer(token_filters=[POSKeepFilter(['名詞']), TokenCountFilter()])
g_count = a.analyze(s)
print(type(g_count))
# <class 'generator'>

print("==== verb list ====")
dousi_list_kai = list(set(dousi_list))  # deduplicate
print(dousi_list_kai)

print("==== noun list ====")
meisi_list_kai = list(set(meisi_list))  # deduplicate
print(meisi_list_kai)
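Since analyze() returns a generator (as the type check above shows), the counts only appear once it is iterated; a generator is also single-use, so consume it right away:

for word, count in g_count:
    print(f'{word}: {count}')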
Example #4
from janome.analyzer import Analyzer
from janome.tokenfilter import POSKeepFilter, TokenCountFilter

text = 'すもももももももものうち'
token_filters = [POSKeepFilter(['名詞']), TokenCountFilter()]  # keep only nouns, then count them
a = Analyzer(token_filters=token_filters)  # a default Tokenizer is created when none is passed

for word, count in a.analyze(text):
    print(f'{word}: {count}')
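For this sentence, the noun counts follow from the assertions in Example #1 (すもも 1, もも 2, うち 1; も and の are particles and are filtered out), so the loop prints:

すもも: 1
もも: 2
うち: 1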
Example #5
def __init__(self):
    token_filters = [
        # keep proper nouns (名詞,固有名詞) and common nouns (名詞,一般)
        POSKeepFilter(['名詞,固有名詞', '名詞,一般']),
        TokenCountFilter()
    ]
    self.a = Analyzer(token_filters=token_filters)
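A sketch of how this fragment might sit inside a complete class; the class name, the count method, and the sample text are hypothetical, and only the __init__ body comes from the example:

from janome.analyzer import Analyzer
from janome.tokenfilter import POSKeepFilter, TokenCountFilter

class NounCounter:  # hypothetical wrapper name
    def __init__(self):
        token_filters = [
            POSKeepFilter(['名詞,固有名詞', '名詞,一般']),
            TokenCountFilter()
        ]
        self.a = Analyzer(token_filters=token_filters)

    def count(self, text):  # hypothetical helper: materialize the (word, count) pairs
        return dict(self.a.analyze(text))

print(NounCounter().count('東京タワーと京都の寺を見た'))  # sample text; output depends on the dictionary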