def get_words(string, keep_pos=None):
    filters = []
    if keep_pos is None:
        filters.append(POSStopFilter(['記号']))        # drop symbols
    else:
        filters.append(POSKeepFilter(keep_pos))        # keep only the given parts of speech
    filters.append(ExtractAttributeFilter('surface'))  # return surface strings only
    a = Analyzer(token_filters=filters)                # apply the post-processing filters
    return list(a.analyze(string))
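# A minimal usage sketch for get_words above (not part of the original snippet). The imports
# mirror the janome modules used elsewhere in this file; keep_pos takes janome part-of-speech
# prefixes such as '名詞', and the exact tokens returned depend on the dictionary in use.
from janome.analyzer import Analyzer
from janome.tokenfilter import (POSStopFilter, POSKeepFilter,
                                ExtractAttributeFilter)

print(get_words('メロスは激怒した。', keep_pos=['名詞']))  # surface forms of nouns only
print(get_words('メロスは激怒した。'))                     # all surfaces except symbols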
def make_tf_idf_result(debug, input_sentence):
    # build the tokenizer / analyzer
    tokenizer = Tokenizer()
    token_filters = [POSStopFilter(['記号', '助詞', '助動詞', '動詞', '接続詞'])]
    analyzer = Analyzer(tokenizer=tokenizer, token_filters=token_filters)
    # extract nouns from each sentence
    file_path = "./all_sentence/all_sentence_0.txt"
    sentence_list = []
    word_list = []
    with open(file_path, encoding='utf-8') as f:
        sentence_list = f.readlines()
    if not debug:
        sentence_list = change_sentence(sentence_list, input_sentence)
    for i in range(0, 201):
        tokens = analyzer.analyze(sentence_list[i])
        sentences_tmp = []
        for t in tokens:
            sentences_tmp.append(t.surface)
        word_list.append(" ".join(sentences_tmp))
    # convert to a numpy array
    np_word_list = np.array(word_list)
    # create the vectorizer
    vec_tfidf = TfidfVectorizer()
    # vectorize
    X = vec_tfidf.fit_transform(np_word_list)
    # store tf-idf scores and nouns as a dict (first document only)
    set_word_and_tf_idf = {}
    words = vec_tfidf.get_feature_names()
    for i, vec in zip(range(0, 1), X.toarray()):
        for w_id, tfidf in sorted(enumerate(vec), key=lambda x: x[1], reverse=True):
            word = words[w_id]
            set_word_and_tf_idf[word] = tfidf
    result_list = []
    for key in set_word_and_tf_idf.keys():
        if set_word_and_tf_idf[key] > 0:
            print(key + ": " + str(set_word_and_tf_idf[key]))
            result_list.append({key: set_word_and_tf_idf[key]})
        else:
            break
    return result_list
def get_words(titles, stop_words):
    '''Morphologically analyze titles and build a list of preprocessed words per title.'''
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'text|[ -/:-@!0-9\[-~]', '')
    ]
    token_filters = [
        POSKeepFilter(KEEP_FILTER),
        POSStopFilter(STOP_FILTER),
        LowerCaseFilter()
    ]
    tokenizer = Tokenizer(mmap=True)
    analyzer = Analyzer(tokenizer=tokenizer,
                        char_filters=char_filters,
                        token_filters=token_filters)
    title_list = []
    for title in titles:
        word_list_per_title = []
        for word in analyzer.analyze(title):
            # drop single-character alphabet / hiragana / katakana tokens
            if (len(word.surface) == 1) \
                    and re.compile('[~a-zあ-んア-ン]').fullmatch(word.surface):
                continue
            # drop stop words
            if word.base_form in stop_words:
                continue
            hinshi_split = word.part_of_speech.split(',')
            hinshi_tuple = (hinshi_split[0], hinshi_split[1])
            # repeat weighted parts of speech so they count more often
            if hinshi_tuple in WEIGHTS_HINSHI_DICT.keys():
                word_list_per_title += [word.base_form] * WEIGHTS_HINSHI_DICT[hinshi_tuple]
            else:
                word_list_per_title.append(word.base_form)
        title_list.append(word_list_per_title)
    # remove words that appear only once across all titles
    dic = Dictionary(title_list)
    valid_word_list = [word_id for word_id, num in dic.dfs.items() if num > 1]
    title_list_2 = []
    for title in title_list:
        word_list_per_title_2 = [
            word for word in title if dic.token2id[word] in valid_word_list
        ]
        # drop titles left with no words
        if len(word_list_per_title_2) > 0:
            title_list_2.append(word_list_per_title_2)
    return title_list_2, dic
def make_tf(text):
    tokenizer = Tokenizer()
    token_filters = [POSStopFilter(['記号', '助詞', '助動詞', '動詞', '接続詞'])]
    analyzer = Analyzer(tokenizer=tokenizer, token_filters=token_filters)
    tokens = analyzer.analyze(text)
    # count the term frequency of each remaining surface form
    word_list = []
    for t in tokens:
        for word in word_list:
            if t.surface == list(word.keys())[0]:
                word[t.surface] += 1
                break
        else:
            # only append a new entry when the surface was not seen before
            word_list.append({t.surface: 1})
    return word_list
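# A minimal usage sketch for make_tf above (not part of the original snippet; assumes the
# janome Tokenizer/Analyzer/POSStopFilter imports used elsewhere in this file). The exact
# surfaces and counts depend on the janome dictionary; particles such as 'も' and 'の' are
# removed by the POSStopFilter before counting.
if __name__ == '__main__':
    for entry in make_tf('すもももももももものうち'):
        print(entry)  # e.g. {'すもも': 1}, {'もも': 2}, {'うち': 1}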
def test_analyze(self):
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter('蛇の目', 'janome')
    ]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter(),
        ExtractAttributeFilter('surface')
    ]
    a = Analyzer(char_filters=char_filters,
                 tokenizer=tokenizer,
                 token_filters=token_filters)
    tokens = a.analyze('蛇の目はPure Pythonな形態素解析器です。')
    self.assertEqual(['janome', 'pure', 'python', 'な', '形態素解析器', 'です'],
                     list(tokens))
def tokenize(text):
    """Segment the text into word lists, one list per sentence."""
    exclusion = ['助詞', '助動詞', '記号']
    char_filters = [UnicodeNormalizeCharFilter()]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(exclusion),
        LowerCaseFilter()
    ]
    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)
    ret = []
    for sentence in text.split('。')[:-1]:
        ret.append([])
        for token in analyzer.analyze(sentence.rstrip()):
            ret[-1].append(token.base_form)
    return ret
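# A minimal usage sketch for tokenize above (not part of the original snippet; assumes the
# janome imports used elsewhere in this file). Sentences are split on '。' and reduced to
# base forms with particles, auxiliary verbs and symbols removed; the exact segmentation
# depends on the janome dictionary.
if __name__ == '__main__':
    for words in tokenize('蛇の目はPure Pythonな形態素解析器です。辞書を内包しています。'):
        print(words)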
def janome_analyzer_tf():
    """
    ref:
    https://www.tensorflow.org/api_docs/python/tf/strings
    https://mocobeta.github.io/janome/api/janome.html#module-janome.tokenfilter
    """
    # standardize texts
    char_filters = [RegexReplaceCharFilter(u'蛇の目', u'janome')]
    tokenizer = Tokenizer()
    token_filters = [CompoundNounFilter(), POSStopFilter(['記号', '助詞'])]
    analyze = Analyzer(char_filters=char_filters,
                       tokenizer=tokenizer,
                       token_filters=token_filters).analyze

    def _tokenizer(text, label):
        text = text.numpy().decode()
        tokenized_text = tf.strings.join(
            [wakati.surface for wakati in analyze(text)], separator=' ')
        tokenized_text = tf.strings.lower(tokenized_text)
        return tokenized_text, label

    return _tokenizer
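# A hypothetical way to plug the returned _tokenizer into a tf.data pipeline (not part of
# the original snippet). It assumes `import tensorflow as tf` and wraps the eager-only
# tokenizer with tf.py_function; the dataset contents here are illustrative only.
import tensorflow as tf

texts = ['蛇の目はPure Pythonな形態素解析器です。']
labels = [0]
ds = tf.data.Dataset.from_tensor_slices((texts, labels))
tokenize_fn = janome_analyzer_tf()
ds = ds.map(lambda text, label: tf.py_function(
    tokenize_fn, inp=[text, label], Tout=[tf.string, tf.int32]))
for tokenized, label in ds:
    print(tokenized.numpy().decode(), label.numpy())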
def janome_analyzer():
    """
    ref:
    https://mocobeta.github.io/janome/api/janome.html#module-janome.tokenfilter
    """
    # standardize texts
    char_filters = [RegexReplaceCharFilter(u'蛇の目', u'janome')]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter()
    ]
    analyze = Analyzer(char_filters=char_filters,
                       tokenizer=tokenizer,
                       token_filters=token_filters).analyze

    def _tokenizer(text, label):
        tokenized_text = " ".join(
            [wakati.surface for wakati in analyze(text.numpy().decode())])
        return tokenized_text, label

    return _tokenizer
def _tokenize(self, data: ProcessedData) -> ProcessedData:
    train = data.train
    test = data.test
    tokenizer = Tokenizer()
    token_filters = [POSStopFilter(["記号", "助詞", "助動詞"])]
    analyzer = Analyzer(tokenizer=tokenizer, token_filters=token_filters)
    # space-separated segmentation of each lyric in the train split
    word_separations = []
    for i in range(len(train)):
        tokens = analyzer.analyze(train["lyric"].values[i])
        word_separations.append(" ".join([t.surface for t in tokens]))
    train["word_separation"] = word_separations
    # ... and of each lyric in the test split
    word_separations = []
    for i in range(len(test)):
        tokens = analyzer.analyze(test["lyric"].values[i])
        word_separations.append(" ".join([t.surface for t in tokens]))
    test["word_separation"] = word_separations
    return ProcessedData(train=train, test=test)
def test_analyzer_custom(self):
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'\s+', '')
    ]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter()
    ]
    a = Analyzer(char_filters=char_filters,
                 tokenizer=tokenizer,
                 token_filters=token_filters)
    self.assertTrue(len(a.char_filters) == 2)
    self.assertIsInstance(a.char_filters[0], UnicodeNormalizeCharFilter)
    self.assertIsInstance(a.char_filters[1], RegexReplaceCharFilter)
    self.assertTrue(len(a.token_filters) == 3)
    self.assertIsInstance(a.token_filters[0], CompoundNounFilter)
    self.assertIsInstance(a.token_filters[1], POSStopFilter)
    self.assertIsInstance(a.token_filters[2], LowerCaseFilter)
def main():
    text = '自然言語処理の基礎でも読もうかな。'
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter('自然言語処理', 'NLP')
    ]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter()
    ]
    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)
    print(text)
    for token in analyzer.analyze(text):
        print(token)
    print('DONE')
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import UnicodeNormalizeCharFilter
from janome.tokenfilter import ExtractAttributeFilter, POSStopFilter

t = Tokenizer(mmap=True)

# An Analyzer needs 3 parameters: char_filters, tokenizer, token_filters
a = Analyzer(char_filters=[UnicodeNormalizeCharFilter()],
             tokenizer=t,
             token_filters=[POSStopFilter(['記号']), ExtractAttributeFilter('surface')])


# Split text into words, removing only punctuation.
def split_words(text):
    return list(a.analyze(text))
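# A quick usage sketch for split_words above (not part of the original snippet). The result
# is a list of surface strings with punctuation ('記号') removed, because of
# ExtractAttributeFilter('surface'); the exact segmentation depends on the janome dictionary.
if __name__ == '__main__':
    print(split_words('蛇の目はPure Pythonな形態素解析器です。'))
    # -> something like ['蛇の目', 'は', 'Pure', ...], with the trailing '。' dropped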
def test_pos_stop_filter(self):
    tf = POSStopFilter(['助詞', '記号', '動詞,非自立'])
    tokens = tf.apply(self.t.tokenize('行ってしまった。'))
    self.assertEqual(['動詞,自立,*,*', '助動詞,*,*,*'],
                     list(map(lambda token: token.part_of_speech, tokens)))