def test_lowercase_filter(self):
    tf = LowerCaseFilter()
    tokens = tf.apply(self.t.tokenize('Python JavaScript'))
    self.assertEqual(['python', ' ', 'javascript'],
                     list(map(lambda token: token.surface, tokens)))
    tokens = tf.apply(self.t.tokenize('Python JavaScript'))
    self.assertEqual(['python', ' ', 'javascript'],
                     list(map(lambda token: token.base_form, tokens)))
def main():
    char_filters = [UnicodeNormalizeCharFilter()]
    tokenizer = Tokenizer()
    token_filters = [CompoundNounFilter(), LowerCaseFilter()]
    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)
    text = '私は、渋谷ストリームでランチを食べる。'
    for token in analyzer.analyze(text):
        print(token)
    print('DONE')
def pros(self, stopwords=False, emojidict={}):
    self.stopwordslist = {'する'}
    if stopwords:
        # Merge any additional stop words passed by the caller
        self.stopwordslist.update(stopwords)
    self.char_filters = [
        EmojiCharFilter(emojidict),
        RegexReplaceCharFilter(
            r"https?://[\w!\?/\+\-_~=;\.,\*&@#\$%\(\)'\[\]]+|<[:@#]|>|\^[!-/:-@¥[-`\[\]{-~]*$|[!#$%&'()\*\+\-\.,\/:;<=>?@\[\\\]^_`{|}~]",
            '')
    ]
    self.wordclass2 = ['自立', 'サ変接続', '一般', '固有名詞']
    self.token_filters = [POSKeepFilter(['名詞', '形容詞']), LowerCaseFilter()]
    return self.getwords()
def get_words(titles, stop_words):
    '''Tokenize each title with the morphological analyzer and build preprocessed word lists.'''
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'text|[ -/:-@!0-9\[-~]', '')
    ]
    token_filters = [
        POSKeepFilter(KEEP_FILTER),
        POSStopFilter(STOP_FILTER),
        LowerCaseFilter()
    ]
    tokenizer = Tokenizer(mmap=True)
    analyzer = Analyzer(tokenizer=tokenizer,
                        char_filters=char_filters,
                        token_filters=token_filters)
    title_list = []
    for title in titles:
        word_list_per_title = []
        for word in analyzer.analyze(title):
            # Exclude single-character words that are an ASCII letter, hiragana, or katakana
            if (len(word.surface) == 1) \
                    and (re.compile('[~a-zあ-んア-ン]').fullmatch(word.surface)):
                continue
            # Exclude stop words
            if word.base_form in stop_words:
                continue
            hinshi_split = word.part_of_speech.split(',')
            hinshi_tuple = (hinshi_split[0], hinshi_split[1])
            if hinshi_tuple in WEIGHTS_HINSHI_DICT:
                word_list_per_title += [word.base_form] * WEIGHTS_HINSHI_DICT[hinshi_tuple]
            else:
                word_list_per_title.append(word.base_form)
        title_list.append(word_list_per_title)
    # Remove words that appear only once across all titles
    dic = Dictionary(title_list)
    valid_word_list = [word_id for word_id, num in dic.dfs.items() if num > 1]
    title_list_2 = []
    for title in title_list:
        word_list_per_title_2 = [
            word for word in title if dic.token2id[word] in valid_word_list
        ]
        # Exclude titles left with no words
        if len(word_list_per_title_2) > 0:
            title_list_2.append(word_list_per_title_2)
    return title_list_2, dic
def test_analyze(self):
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter('蛇の目', 'janome')
    ]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter(),
        ExtractAttributeFilter('surface')
    ]
    a = Analyzer(char_filters=char_filters,
                 tokenizer=tokenizer,
                 token_filters=token_filters)
    tokens = a.analyze('蛇の目はPure Pythonな形態素解析器です。')
    self.assertEqual(['janome', 'pure', 'python', 'な', '形態素解析器', 'です'],
                     list(tokens))
def word_count_dict(df, pos=['名詞', '形容詞'], stop_words={}):
    # If stop_words is not given, fall back to the default list (SlothLib)
    if stop_words == {}:
        f = urlopen(STOP_WORD_URL)
        stop_words = set(f.read().decode("utf-8").split('\r\n'))
    df_message = df[df['type'] == 'message']['message']
    messages = '\n'.join(list(df_message))
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSKeepFilter(pos),
        LowerCaseFilter(),
        TokenCountFilter(sorted=True)
    ]
    analyzer = Analyzer(tokenizer=tokenizer, token_filters=token_filters)
    # Strip symbols and digits before analysis
    pos_res = analyzer.analyze(re.sub(r'[\d!-/:-@[-`{-~]', '', messages))
    return {k: v for k, v in pos_res if k not in stop_words}
def tokenize(text):
    """
    Split the text into tokens (wakati-gaki).
    """
    exclusion = ['助詞', '助動詞', '記号']
    char_filters = [UnicodeNormalizeCharFilter()]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(exclusion),
        LowerCaseFilter()
    ]
    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)
    ret = []
    for sentence in text.split('。')[:-1]:
        ret.append([])
        for token in analyzer.analyze(sentence.rstrip()):
            ret[-1].append(token.base_form)
    return ret
def janome_analyzer():
    """
    ref: https://mocobeta.github.io/janome/api/janome.html#module-janome.tokenfilter
    """
    # standardize texts
    char_filters = [RegexReplaceCharFilter(u'蛇の目', u'janome')]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter()
    ]
    analyze = Analyzer(char_filters=char_filters,
                       tokenizer=tokenizer,
                       token_filters=token_filters).analyze

    def _tokenizer(text, label):
        tokenized_text = " ".join(
            [wakati.surface for wakati in analyze(text.numpy().decode())])
        return tokenized_text, label

    return _tokenizer
def test_analyzer_custom(self):
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'\s+', '')
    ]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter()
    ]
    a = Analyzer(char_filters=char_filters,
                 tokenizer=tokenizer,
                 token_filters=token_filters)
    self.assertTrue(len(a.char_filters) == 2)
    self.assertIsInstance(a.char_filters[0], UnicodeNormalizeCharFilter)
    self.assertIsInstance(a.char_filters[1], RegexReplaceCharFilter)
    self.assertTrue(len(a.token_filters) == 3)
    self.assertIsInstance(a.token_filters[0], CompoundNounFilter)
    self.assertIsInstance(a.token_filters[1], POSStopFilter)
    self.assertIsInstance(a.token_filters[2], LowerCaseFilter)
def main():
    text = '自然言語処理の基礎でも読もうかな。'
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter('自然言語処理', 'NLP')
    ]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter()
    ]
    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)
    print(text)
    for token in analyzer.analyze(text):
        print(token)
    print('DONE')
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import UnicodeNormalizeCharFilter, RegexReplaceCharFilter
from janome.tokenfilter import LowerCaseFilter, CompoundNounFilter, POSKeepFilter, TokenCountFilter
import logging

logging.basicConfig(level='INFO')

print('Analyzer example:')
text = '蛇の目はPure Pythonな形態素解析器です。'
char_filters = [
    UnicodeNormalizeCharFilter(),
    RegexReplaceCharFilter('蛇の目', 'janome')
]
tokenizer = Tokenizer()
token_filters = [CompoundNounFilter(), LowerCaseFilter()]
a = Analyzer(char_filters=char_filters,
             tokenizer=tokenizer,
             token_filters=token_filters)
for token in a.analyze(text):
    print(token)

print('')
print('Analyzer example: Count nouns with POSKeepFilter and TokenCountFilter')
text = 'すもももももももものうち'
token_filters = [POSKeepFilter(['名詞']), TokenCountFilter()]
a = Analyzer(token_filters=token_filters)
for k, v in a.analyze(text):
    print('%s: %d' % (k, v))
columns = line.split("\t")
index = columns[0].split("-")[0]
if index not in texts:
    texts[index] = ""
    continue
texts[index] = texts[index] + columns[1]

# Morphological analysis
char_filters = [
    UnicodeNormalizeCharFilter(),
    RegexReplaceCharFilter(r'\d+', '0')
]
tokenizer = Tokenizer(mmap=True)
token_filters = [
    POSKeepFilter(["名詞", "形容詞", "副詞", "動詞"]),
    LowerCaseFilter(),
    ExtractAttributeFilter("base_form")
]
analyzer = Analyzer(char_filters=char_filters,
                    tokenizer=tokenizer,
                    token_filters=token_filters)

# Word extraction and stop words
stopwords = []
url = "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt"
with urllib.request.urlopen(url) as response:
    stopwords = [w for w in response.read().decode().split('\r\n') if w != ""]

texts_words = {}
for k, v in texts.items():