Example #1
def get_words(string, keep_pos=None):
    filters = []
    if keep_pos is None:
        filters.append(POSStopFilter(['記号']))  # exclude symbol tokens
    else:
        filters.append(POSKeepFilter(keep_pos))  # keep only the specified parts of speech
    filters.append(ExtractAttributeFilter('surface'))
    a = Analyzer(token_filters=filters)  # configure the post-processing pipeline
    return list(a.analyze(string))
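A minimal usage sketch for get_words, assuming the janome imports below (the snippet itself omits them) and the standard IPAdic POS labels such as '名詞' (noun):

from janome.analyzer import Analyzer
from janome.tokenfilter import POSStopFilter, POSKeepFilter, ExtractAttributeFilter

# default branch: drop only symbol tokens
print(get_words('吾輩は猫である。'))
# keep_pos branch: keep nouns only
print(get_words('吾輩は猫である。', keep_pos=['名詞']))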
Example #2
def make_tf_idf_result(debug, input_sentence):

    # build the lexical analyzer
    tokenizer = Tokenizer()
    token_filters = [POSStopFilter(['記号', '助詞', '助動詞', '動詞', '接続詞'])]
    analyzer = Analyzer(tokenizer=tokenizer, token_filters=token_filters)

    # extract nouns
    file_path = "./all_sentence/all_sentence_0.txt"
    sentence_list = []
    word_list = []

    with open(file_path, encoding='utf-8') as f:
        sentence_list = f.readlines()

    if (not debug):
        sentence_list = change_sentence(sentence_list, input_sentence)

    for i in range(0, 201):
        tokens = analyzer.analyze(sentence_list[i])
        sentences_tmp = []
        for t in tokens:
            sentences_tmp.append(t.surface)

        word_list.append(" ".join(sentences_tmp))

    # convert to an ndarray
    np_word_list = np.array(word_list)

    # create the vectorizer
    vec_tfidf = TfidfVectorizer()

    # vectorize
    X = vec_tfidf.fit_transform(np_word_list)

    # store nouns and their tf-idf scores in a dictionary
    set_word_and_tf_idf = {}
    words = vec_tfidf.get_feature_names()
    for i, vec in zip(range(0, 1), X.toarray()):
        for w_id, tfidf in sorted(enumerate(vec),
                                  key=lambda x: x[1],
                                  reverse=True):
            word = words[w_id]
            set_word_and_tf_idf[word] = tfidf

    result_list = []

    for key in set_word_and_tf_idf.keys():
        if (set_word_and_tf_idf[key] > 0):
            print(key + ": " + str(set_word_and_tf_idf[key]))
            result_list.append({key: set_word_and_tf_idf[key]})
        else:
            break

    return result_list
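A version note on the call above: TfidfVectorizer.get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2; on current versions the equivalent line is:

# scikit-learn >= 1.0
words = vec_tfidf.get_feature_names_out()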
Example #3
def get_words(titles, stop_words):
    '''Morphologically analyze the titles and build a preprocessed word list for each.'''

    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'text|[ -/:-@!0-9\[-~]', '')
    ]
    token_filters = [
        POSKeepFilter(KEEP_FILTER),
        POSStopFilter(STOP_FILTER),
        LowerCaseFilter()
    ]
    tokenizer = Tokenizer(mmap=True)
    analyzer = Analyzer(tokenizer=tokenizer,
                        char_filters=char_filters,
                        token_filters=token_filters)

    title_list = []
    for title in titles:
        word_list_per_title = []
        for word in analyzer.analyze(title):
            # exclude single-character alphabet, hiragana, or katakana words
            if (len(word.surface) == 1) \
                and (re.compile('[~a-zあ-んア-ン]').fullmatch(word.surface)):
                continue
            # exclude stop words
            if word.base_form in stop_words:
                continue
            hinshi_split = word.part_of_speech.split(',')
            hinshi_taple = (hinshi_split[0], hinshi_split[1])
            if hinshi_taple in WEIGHTS_HINSHI_DICT.keys():
                word_list_per_title += [word.base_form
                                        ] * WEIGHTS_HINSHI_DICT[hinshi_taple]
            else:
                word_list_per_title.append(word.base_form)
        title_list.append(word_list_per_title)
    # remove words that appear only once across all titles
    dic = Dictionary(title_list)
    valid_word_list = [word_id for word_id, num in dic.dfs.items() if num > 1]
    title_list_2 = []
    for title in title_list:
        word_list_per_title_2 = [
            word for word in title if dic.token2id[word] in valid_word_list
        ]
        # drop titles left with no elements
        if len(word_list_per_title_2) > 0:
            title_list_2.append(word_list_per_title_2)
    return title_list_2, dic
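The (title_list_2, dic) pair returned here plugs straight into gensim; a minimal sketch, assuming Dictionary above is gensim.corpora.Dictionary and that titles and stop_words are already defined:

# build a bag-of-words corpus from the filtered word lists
title_list_2, dic = get_words(titles, stop_words)
corpus = [dic.doc2bow(words) for words in title_list_2]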
Example #4
def make_tf(text):
    tokenizer = Tokenizer()
    token_filters = [POSStopFilter(['記号', '助詞', '助動詞', '動詞', '接続詞'])]
    analyzer = Analyzer(tokenizer=tokenizer, token_filters=token_filters)
    tokens = analyzer.analyze(text)

    word_list = []
    for t in tokens:
        for word in word_list:
            if t.surface in word:
                word[t.surface] += 1
                break
        else:
            # append a new entry only when the surface form has not been seen yet
            word_list.append({t.surface: 1})

    return word_list
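For comparison, the same term frequencies can be computed more compactly with collections.Counter; a sketch of an equivalent helper (make_tf_counter and its analyzer argument are assumptions, not part of the original), returning a single mapping instead of a list of one-entry dicts:

from collections import Counter

def make_tf_counter(text, analyzer):
    # count the surface forms of the tokens that survive the POS filter
    return Counter(t.surface for t in analyzer.analyze(text))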
Example #5
 def test_analyze(self):
     char_filters = [
         UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter('蛇の目', 'janome')
     ]
     tokenizer = Tokenizer()
     token_filters = [
         CompoundNounFilter(),
         POSStopFilter(['記号', '助詞']),
         LowerCaseFilter(),
         ExtractAttributeFilter('surface')
     ]
     a = Analyzer(char_filters=char_filters,
                  tokenizer=tokenizer,
                  token_filters=token_filters)
     tokens = a.analyze('蛇の目はPure Pythonな形態素解析器です。')
     self.assertEqual(['janome', 'pure', 'python', 'な', '形態素解析器', 'です'],
                      list(tokens))
Example #6
def tokenize(text):
    """
    Split the text into words (wakati-gaki).
    """
    exclusion = ['助詞', '助動詞', '記号']
    char_filters = [UnicodeNormalizeCharFilter()]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(exclusion),
        LowerCaseFilter()
    ]
    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)

    ret = []
    for sentence in text.split('。')[:-1]:
        ret.append([])
        for token in analyzer.analyze(sentence.rstrip()):
            ret[-1].append(token.base_form)
    return ret
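A quick usage sketch for tokenize; the exact tokens depend on the dictionary bundled with janome, so only the shape of the result is shown:

result = tokenize('蛇の目はPythonの形態素解析器です。導入も簡単です。')
# result contains one list of base forms per sentence (two inner lists here)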
Example #7
def janome_analyzer_tf():
    """
    ref: 
    https://www.tensorflow.org/api_docs/python/tf/strings
    https://mocobeta.github.io/janome/api/janome.html#module-janome.tokenfilter
    """
    # standardize the text
    char_filters = [RegexReplaceCharFilter(u'蛇の目', u'janome')]
    tokenizer = Tokenizer()
    token_filters = [CompoundNounFilter(), POSStopFilter(['記号', '助詞'])]
    analyze = Analyzer(char_filters=char_filters, tokenizer=tokenizer,
                       token_filters=token_filters).analyze

    def _tokenizer(text, label):
        text = text.numpy().decode()
        tokenized_text = tf.strings.join(
            [wakati.surface for wakati in analyze(text)], separator=' ')
        tokenized_text = tf.strings.lower(tokenized_text)
        return tokenized_text, label

    return _tokenizer
Example #8
def janome_analyzer():
    """
    ref: 
    https://mocobeta.github.io/janome/api/janome.html#module-janome.tokenfilter
    """
    # standardize the text
    char_filters = [RegexReplaceCharFilter(u'蛇の目', u'janome')]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter()
    ]
    analyze = Analyzer(char_filters=char_filters, tokenizer=tokenizer,
                       token_filters=token_filters).analyze

    def _tokenizer(text, label):
        tokenized_text = " ".join(
            [wakati.surface for wakati in analyze(text.numpy().decode())])
        return tokenized_text, label

    return _tokenizer
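Both closures call .numpy() on their text argument, which only works when they run eagerly; inside a tf.data pipeline they are usually wrapped with tf.py_function. A minimal sketch, assuming a dataset that yields (text, label) pairs with integer labels:

import tensorflow as tf

tokenize_fn = janome_analyzer()

def map_fn(text, label):
    # tf.py_function executes the Python closure eagerly, so text.numpy() is available
    tokenized, label = tf.py_function(
        tokenize_fn, inp=[text, label], Tout=[tf.string, tf.int64])
    return tokenized, label

# dataset = dataset.map(map_fn)  # dataset is assumed, not defined here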
Example #9
    def _tokenize(self, data: ProcessedData) -> ProcessedData:
        train = data.train
        test = data.test

        tokenizer = Tokenizer()
        token_filters = [POSStopFilter(["記号", "助詞", "助動詞"])]
        analyzer = Analyzer(tokenizer=tokenizer, token_filters=token_filters)

        word_separations = []
        for i in range(len(train)):
            tokens = analyzer.analyze(train["lyric"].values[i])
            word_separations.append(" ".join([t.surface for t in tokens]))
        train["word_separation"] = word_separations

        word_separations = []
        for i in range(len(test)):
            tokens = analyzer.analyze(test["lyric"].values[i])
            word_separations.append(" ".join([t.surface for t in tokens]))
        test["word_separation"] = word_separations

        return ProcessedData(train=train, test=test)
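The space-joined word_separation column is exactly the format scikit-learn's text vectorizers expect; a follow-up sketch (TfidfVectorizer is an assumption here, not part of the original pipeline, and train/test are the DataFrames returned above):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train["word_separation"])
X_test = vectorizer.transform(test["word_separation"])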
Example #10
 def test_analyzer_custom(self):
     char_filters = [
         UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'\s+', '')
     ]
     tokenizer = Tokenizer()
     token_filters = [
         CompoundNounFilter(),
         POSStopFilter(['記号', '助詞']),
         LowerCaseFilter()
     ]
     a = Analyzer(char_filters=char_filters,
                  tokenizer=tokenizer,
                  token_filters=token_filters)
     self.assertTrue(len(a.char_filters) == 2)
     self.assertIsInstance(a.char_filters[0], UnicodeNormalizeCharFilter)
     self.assertIsInstance(a.char_filters[1], RegexReplaceCharFilter)
     self.assertTrue(len(a.token_filters) == 3)
     self.assertIsInstance(a.token_filters[0], CompoundNounFilter)
     self.assertIsInstance(a.token_filters[1], POSStopFilter)
     self.assertIsInstance(a.token_filters[2], LowerCaseFilter)
Example #11
def main():
    text = '自然言語処理の基礎でも読もうかな。'

    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter('自然言語処理', 'NLP')
    ]
    tokenizer = Tokenizer()
    token_filters = [
        CompoundNounFilter(),
        POSStopFilter(['記号', '助詞']),
        LowerCaseFilter()
    ]

    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)

    print(text)
    for token in analyzer.analyze(text):
        print(token)
    print('DONE')
Example #12
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import UnicodeNormalizeCharFilter
from janome.tokenfilter import ExtractAttributeFilter, POSStopFilter

t = Tokenizer(mmap=True)

# An Analyzer takes three parameters: char_filters, tokenizer, and token_filters
a = Analyzer(char_filters=[UnicodeNormalizeCharFilter()],
             tokenizer=t,
             token_filters=[POSStopFilter(['記号']),
                            ExtractAttributeFilter('surface')])


# Split text into words, removing only symbol (記号) tokens.
def split_words(text):
    return list(a.analyze(text))
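Usage is a single call; because ExtractAttributeFilter('surface') is the last filter, analyze yields plain strings rather than Token objects (the segmentation shown is only indicative):

print(split_words('すもももももももものうち'))
# e.g. ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']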
Example #13
 def test_pos_stop_filter(self):
     tf = POSStopFilter(['助詞', '記号', '動詞,非自立'])
     tokens = tf.apply(self.t.tokenize('行ってしまった。'))
     self.assertEqual(['動詞,自立,*,*', '助動詞,*,*,*'],
                      list(map(lambda token: token.part_of_speech, tokens)))
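POSStopFilter matches each token's part_of_speech string by prefix, which is why '動詞,非自立' removes only the non-independent verb while the independent verb ('動詞,自立') survives above. A small standalone sketch of the same check, assuming the default Tokenizer:

from janome.tokenizer import Tokenizer
from janome.tokenfilter import POSStopFilter

t = Tokenizer()
pos_filter = POSStopFilter(['助詞', '記号', '動詞,非自立'])
for token in pos_filter.apply(t.tokenize('行ってしまった。')):
    print(token.surface, token.part_of_speech)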