Example No. 1
 def test_lowercase_filter(self):
     tf = LowerCaseFilter()
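     # LowerCaseFilter lowercases both the surface form and the base form of each token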
     tokens = tf.apply(self.t.tokenize('Python JavaScript'))
     self.assertEqual(['python', ' ', 'javascript'],
                      list(map(lambda token: token.surface, tokens)))
     tokens = tf.apply(self.t.tokenize('Python JavaScript'))
     self.assertEqual(['python', ' ', 'javascript'],
                      list(map(lambda token: token.base_form, tokens)))
Example No. 2
def main():
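    # build an Analyzer pipeline: NFKC Unicode normalization, tokenization,
    # compound-noun merging, and lower-casing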
    char_filters = [UnicodeNormalizeCharFilter()]
    tokenizer = Tokenizer()
    token_filters = [CompoundNounFilter(), LowerCaseFilter()]
    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)

    text = '私は、渋谷ストリームでランチを食べる。'
    for token in analyzer.analyze(text):
        print(token)
    print('DONE')
Example No. 3
 def pros(self, stopwords=False, emojidict={}):
     self.stopwordslist = {'する'}
     if stopwords:
         self.stopwordslist.add(stopwords)
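     # char filters: EmojiCharFilter (a project-specific filter) strips emoji,
     # and the regex removes URLs and ASCII symbols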
     self.char_filters = [
         EmojiCharFilter(emojidict),
         RegexReplaceCharFilter(
             r"https?://[\w!\?/\+\-_~=;\.,\*&@#\$%\(\)'\[\]]+|<[:@#]|>|\^[!-/:-@¥[-`\[\]{-~]*$|[!#$%&'()\*\+\-\.,\/:;<=>?@\[\\\]^_`{|}~]",
             '')
     ]
     self.wordclass2 = ['自立', 'サ変接続', '一般', '固有名詞']
     self.token_filters = [POSKeepFilter(['名詞', '形容詞']), LowerCaseFilter()]
     return self.getwords()
Example No. 4
def get_words(titles, stop_words):
    '''Morphologically analyze titles and build a preprocessed word list for each title.'''
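    # the char filters below normalize Unicode and strip the literal string 'text',
    # digits, and ASCII symbols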

    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'text|[ -/:-@!0-9\[-~]', '')
    ]
    token_filters = [
        POSKeepFilter(KEEP_FILTER),
        POSStopFilter(STOP_FILTER),
        LowerCaseFilter()
    ]
    tokenizer = Tokenizer(mmap=True)
    analyzer = Analyzer(tokenizer=tokenizer,
                        char_filters=char_filters,
                        token_filters=token_filters)

    title_list = []
    for title in titles:
        word_list_per_title = []
        for word in analyzer.analyze(title):
            # skip single-character alphabet, hiragana, or katakana words
            if (len(word.surface) == 1) \
                and (re.compile('[~a-zあ-んア-ン]').fullmatch(word.surface)):
                continue
            # skip stop words
            if word.base_form in stop_words:
                continue
            hinshi_split = word.part_of_speech.split(',')
            hinshi_tuple = (hinshi_split[0], hinshi_split[1])
            if hinshi_tuple in WEIGHTS_HINSHI_DICT:
                # weight words of certain POS pairs by repeating them
                word_list_per_title += [word.base_form] * WEIGHTS_HINSHI_DICT[hinshi_tuple]
            else:
                word_list_per_title.append(word.base_form)
        title_list.append(word_list_per_title)
    # drop words that appear only once across all titles
    dic = Dictionary(title_list)
    valid_word_list = [word_id for word_id, num in dic.dfs.items() if num > 1]
    title_list_2 = []
    for title in title_list:
        word_list_per_title_2 = [
            word for word in title if dic.token2id[word] in valid_word_list
        ]
        # skip titles left with no words
        if len(word_list_per_title_2) > 0:
            title_list_2.append(word_list_per_title_2)
    return title_list_2, dic
Example No. 5
 def test_analyze(self):
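     # normalize, rewrite '蛇の目' to 'janome', merge compound nouns, drop symbols
     # and particles, lowercase, and extract only the surface strings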
     char_filters = [
         UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter('蛇の目', 'janome')
     ]
     tokenizer = Tokenizer()
     token_filters = [
         CompoundNounFilter(),
         POSStopFilter(['記号', '助詞']),
         LowerCaseFilter(),
         ExtractAttributeFilter('surface')
     ]
     a = Analyzer(char_filters=char_filters,
                  tokenizer=tokenizer,
                  token_filters=token_filters)
     tokens = a.analyze('蛇の目はPure Pythonな形態素解析器です。')
     self.assertEqual(['janome', 'pure', 'python', 'な', '形態素解析器', 'です'],
                      list(tokens))
Example No. 6
def word_count_dict(df, pos=['名詞', '形容詞'], stop_words={}):
    # if stop_words is not given, fall back to the default SlothLib list
    if stop_words == {}:
        f = urlopen(STOP_WORD_URL)
        stop_words = set(f.read().decode("utf-8").split('\r\n'))

    df_message = df[df['type'] == 'message']['message']
    messages = '\n'.join(list(df_message))
    tokenizer = Tokenizer()
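    # merge compound nouns, keep only the requested POS, lowercase, then emit
    # (word, count) pairs sorted by frequency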
    token_filters = [
        CompoundNounFilter(),
        POSKeepFilter(pos),
        LowerCaseFilter(),
        TokenCountFilter(sorted=True)
    ]
    analyzer = Analyzer(tokenizer=tokenizer, token_filters=token_filters)
    # strip symbols and digits before analysis
    pos_res = analyzer.analyze(re.sub(r'[\d!-/:-@[-`{-~]', '', messages))
    return {k: v for k, v in pos_res if k not in stop_words}
Example No. 7
def tokenize(text):
    """
    Tokenize the text into words (wakati-gaki).
    """
    exclusion = ['助詞', '助動詞', '記号']
    char_filters = [UnicodeNormalizeCharFilter()]
    tokenizer = Tokenizer()
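    # merge compound nouns, drop the excluded POS, then lowercase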
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(exclusion),
        LowerCaseFilter()
    ]
    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)

    ret = []
    for sentence in text.split('。')[:-1]:
        ret.append([])
        for token in analyzer.analyze(sentence.rstrip()):
            ret[-1].append(token.base_form)
    return ret
Example No. 8
def janome_analyzer():
    """
    ref: 
    https://mocobeta.github.io/janome/api/janome.html#module-janome.tokenfilter
    """
    # standardize text
    char_filters = [RegexReplaceCharFilter(u'蛇の目', u'janome')]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter()
    ]
    analyze = Analyzer(char_filters=char_filters,
                       tokenizer=tokenizer,
                       token_filters=token_filters).analyze

    def _tokenizer(text, label):
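        # join token surfaces with spaces; text arrives as a tensor
        # (presumably from a TensorFlow input pipeline), hence .numpy().decode()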
        tokenized_text = " ".join(
            [wakati.surface for wakati in analyze(text.numpy().decode())])
        return tokenized_text, label

    return _tokenizer
Example No. 9
 def test_analyzer_custom(self):
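     # check that a custom Analyzer stores its char filters and token filters in the configured order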
     char_filters = [
         UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'\s+', '')
     ]
     tokenizer = Tokenizer()
     token_filters = [
         CompoundNounFilter(),
         POSStopFilter(['記号', '助詞']),
         LowerCaseFilter()
     ]
     a = Analyzer(char_filters=char_filters,
                  tokenizer=tokenizer,
                  token_filters=token_filters)
     self.assertTrue(len(a.char_filters) == 2)
     self.assertIsInstance(a.char_filters[0], UnicodeNormalizeCharFilter)
     self.assertIsInstance(a.char_filters[1], RegexReplaceCharFilter)
     self.assertTrue(len(a.token_filters) == 3)
     self.assertIsInstance(a.token_filters[0], CompoundNounFilter)
     self.assertIsInstance(a.token_filters[1], POSStopFilter)
     self.assertIsInstance(a.token_filters[2], LowerCaseFilter)
Example No. 10
def main():
    text = '自然言語処理の基礎でも読もうかな。'
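    # the analyzer rewrites '自然言語処理' to 'NLP' before tokenizing, then merges
    # compound nouns, drops symbols and particles, and lowercases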

    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter('自然言語処理', 'NLP')
    ]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter()
    ]

    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)

    print(text)
    for token in analyzer.analyze(text):
        print(token)
    print('DONE')
Example No. 11
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import UnicodeNormalizeCharFilter, RegexReplaceCharFilter
from janome.tokenfilter import LowerCaseFilter, CompoundNounFilter, POSKeepFilter, TokenCountFilter

import logging

logging.basicConfig(level='INFO')

print('Analyzer example:')
text = '蛇の目はPure Pythonな形態素解析器です。'
char_filters = [
    UnicodeNormalizeCharFilter(),
    RegexReplaceCharFilter('蛇の目', 'janome')
]
tokenizer = Tokenizer()
token_filters = [CompoundNounFilter(), LowerCaseFilter()]
a = Analyzer(char_filters=char_filters,
             tokenizer=tokenizer,
             token_filters=token_filters)
for token in a.analyze(text):
    print(token)

print('')
print('Analyzer example: Count nouns with POSKeepFilter and TokenCountFilter')
text = 'すもももももももものうち'
token_filters = [POSKeepFilter(['名詞']), TokenCountFilter()]
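# keep only nouns ('名詞'); TokenCountFilter makes analyze() yield (word, count) pairs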
a = Analyzer(token_filters=token_filters)
for k, v in a.analyze(text):
    print('%s: %d' % (k, v))
Example No. 12
            columns = line.split("\t")
            index = columns[0].split("-")[0]
            if index not in texts:
                texts[index] = ""
                continue
            texts[index] = texts[index] + columns[1]

# Morphological analysis
char_filters = [
    UnicodeNormalizeCharFilter(),
    RegexReplaceCharFilter(r'\d+', '0')
]
tokenizer = Tokenizer(mmap=True)
token_filters = [
    POSKeepFilter(["名詞", "形容詞", "副詞", "動詞"]),
    LowerCaseFilter(),
    ExtractAttributeFilter("base_form")
]
analyzer = Analyzer(char_filters=char_filters,
                    tokenizer=tokenizer,
                    token_filters=token_filters)
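# with ExtractAttributeFilter, analyzer.analyze() yields base-form strings instead of Token objects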

# Word extraction and stop words

stopwords = []
url = "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt"

with urllib.request.urlopen(url) as response:
    stopwords = [w for w in response.read().decode().split('\r\n') if w != ""]

texts_words = {}

for k, v in texts.items():