def fn_start_document_summarize(text):
    """Summarize a sequence of Japanese sentences with LexRank.

    Parameters
    ----------
    text : sequence of str
        Japanese sentences, one per element.

    Returns
    -------
    str
        The original sentences chosen by LexRank, concatenated.
    """
    # Morphological analysis: normalize, blank out ()「」、。 so they do not
    # become tokens, and keep only the base forms of content words
    # (nouns, adjectives, adverbs, verbs).
    tokenizer = JanomeTokenizer('japanese')
    char_filters = [UnicodeNormalizeCharFilter(),
                    RegexReplaceCharFilter(r'[(\)「」、。]', ' ')]
    token_filters = [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
                     ExtractAttributeFilter('base_form')]
    analyzer = Analyzer(
        char_filters=char_filters,
        tokenizer=tokenizer,
        token_filters=token_filters
    )

    # Space-joined tokens per sentence; the trailing '。' lets sumy split
    # the joined corpus back into one sentence per original element.
    corpus = [' '.join(analyzer.analyze(sentence)) + u'。' for sentence in text]

    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))

    # LexRank; a bare space survives the char filters as a token, so it is
    # registered as a stop word.
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']

    # Heuristic from the original note ("the important part of a document
    # is said to be 20-30%"): take ~1% of sentences for long documents,
    # but never fewer than MIN_SENTENCES for short ones.
    MIN_SENTENCES = 3
    count = MIN_SENTENCES if len(corpus) < 100 else len(corpus) // 100
    summary = summarizer(document=parser.document, sentences_count=count)

    # Map each summarized (tokenized) sentence back to its original text.
    # ''.join avoids the quadratic `str += ...` loop of the previous
    # version, which also shadowed the builtin `str`.
    return ''.join(text[corpus.index(str(sentence))] for sentence in summary)
def tense_analyze(self, text, sentences_count):
    """Split *text* into sentences and pick the top *sentences_count*
    with LexRank.

    Returns (sentences, corpus, summary):
      sentences -- raw sentences from splitting on '。'
      corpus    -- space-joined base-form tokens, one entry per sentence
      summary   -- sumy sentence objects chosen by LexRank
    """
    # One sentence per '。'-delimited segment.
    raw_sentences = text.split('。')

    # Morphological analyzer: normalize, replace ()「」、。 with spaces,
    # keep only base forms of nouns / adjectives / adverbs / verbs.
    morph = Analyzer(
        [UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
         ExtractAttributeFilter('base_form')],
    )

    # Join surviving tokens with spaces; the trailing '。' lets the
    # downstream tokenizer split the corpus back into sentences.
    tokenized = []
    for raw in raw_sentences:
        tokenized.append(' '.join(morph.analyze(raw)) + '。')

    parser = PlaintextParser.from_string(''.join(tokenized),
                                         Tokenizer('japanese'))

    # A lone space is recognized as a word, so suppress it via stop words.
    lexrank = LexRankSummarizer()
    lexrank.stop_words = [' ']
    chosen = lexrank(document=parser.document,
                     sentences_count=sentences_count)

    return raw_sentences, tokenized, chosen
def janome_document_summarize(document):
    """Summarize a Japanese *document* string with LexRank.

    Prints the chosen sentences and returns (summary, rst): the sumy
    sentence objects and the matching original sentences joined into one
    string.
    """
    # Morphological analysis: keep only base forms of content words;
    # brackets and punctuation become spaces.
    analyzer = Analyzer(
        char_filters=[UnicodeNormalizeCharFilter(),
                      RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],
        tokenizer=JanomeTokenizer(),
        token_filters=[POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
                       ExtractAttributeFilter('base_form')])

    # Strip newlines, then cut into sentences ending with '。'
    # (the final one may lack the terminator).
    text = re.findall("[^。]+。?", document.replace('\n', ''))
    corpus = [' '.join(analyzer.analyze(sentence)) + u'。' for sentence in text]

    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ', '。', '\n']

    # Aim for roughly 30% of the sentences, but never fewer than three.
    n_sentences = int(len(corpus) / 10 * 3)
    if n_sentences <= 0:
        n_sentences = 3

    summary = summarizer(document=parser.document,
                         sentences_count=n_sentences)

    print('\n要約:')
    picked = []
    for sentence in summary:
        # Map the tokenized pick back to its original sentence.
        original = text[corpus.index(str(sentence))]
        print(original)
        picked.append(original)
    return summary, ''.join(picked)
def summarize(text):
    """Pick the three most central lines of *text* (one sentence per
    line) with LexRank and return them concatenated."""
    lines = text.split('\n')

    # Keep only base forms of content words; ()「」、。 become spaces.
    morph = Analyzer(
        [UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
         ExtractAttributeFilter('base_form')],
    )
    tokenized = [' '.join(morph.analyze(line)) + '。' for line in lines]

    parser = PlaintextParser.from_string(''.join(tokenized),
                                         Tokenizer('japanese'))

    lexrank = LexRankSummarizer()
    lexrank.stop_words = [' ']  # a bare space would otherwise count as a word

    picked = lexrank(document=parser.document, sentences_count=3)

    # Map each chosen tokenized sentence back to its original line.
    return ''.join(lines[tokenized.index(str(s))] for s in picked)
def make_corpus(docs, debug=False):
    """Build a tokenized corpus from multiple documents.

    @docs   list of document strings
    @debug  when True, print the tokenized corpus
    @return list with one entry per document, each a list of base-form
            tokens for that document
    """
    # Per document: lowercase, split on newline or '。', drop blank
    # segments, and normalize full-width characters to half-width.
    docs = [
        [mojimoji.zen_to_han(part)
         for part in re.split("\n|。", d.lower())
         if part.strip() != ""]
        for d in docs
    ]

    # Keep only base forms of nouns / adjectives / adverbs / verbs;
    # ()、。「」 are replaced with spaces first.
    analyzer = Analyzer([
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'[(\)、。「」]', ' ')
    ], JanomeTokenizer(), [
        POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
        ExtractAttributeFilter('base_form')
    ])

    # Flatten each document's per-line token streams into one token list.
    corpus = [
        list(itertools.chain.from_iterable(
            list(analyzer.analyze(line)) for line in lines))
        for lines in docs
    ]

    if debug:
        # BUG FIX: entries of `corpus` are token *lists*, not strings, so
        # the old "\n".join(corpus) raised TypeError; join the tokens of
        # each document first.
        print("\n".join(" ".join(tokens) for tokens in corpus))
    return corpus
def __init__(self):
    """Load the GiNZA spaCy model and build the janome analyzer."""
    self.nlp = spacy.load('ja_ginza')

    # Analyzer pipeline: normalize, replace ()「」、。 with spaces, keep
    # only base forms of nouns / adjectives / adverbs / verbs.
    char_filters = [UnicodeNormalizeCharFilter(),
                    RegexReplaceCharFilter(r'[(\)「」、。]', ' ')]
    token_filters = [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
                     ExtractAttributeFilter('base_form')]
    self.analyzer = Analyzer(char_filters, JanomeTokenizer(), token_filters)
def preprocess(doc, debug=False):
    """
    Split a document into sentences and build a tokenized corpus.

    @param doc    target document string
    @param debug  print the tokenized corpus when True
    @return (sentences, corpus): the stripped original sentences and one
            space-joined token string (ending in '。') per sentence
    """
    # Lowercase, split on newline or '。', strip, and drop empties.
    stripped = [part.strip() for part in re.split("\n|。", doc.lower())]
    sentences = [part for part in stripped if part != ""]

    # The half-width-normalized copies are what get tokenized; the
    # un-normalized sentences are returned for display.
    normalized = [mojimoji.zen_to_han(part) for part in sentences]

    analyzer = Analyzer([
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'[(\)、。「」]', ' ')
    ], JanomeTokenizer(), [
        POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
        ExtractAttributeFilter('base_form')
    ])
    corpus = [' '.join(analyzer.analyze(part)) + '。' for part in normalized]

    if debug:
        print("\n".join(corpus))
    return sentences, corpus
def set_analyzer(self):
    """Build and store the janome morphological analyzer."""
    # ()「」、。 are replaced with spaces; only the base forms of content
    # words (nouns, adjectives, adverbs, verbs) survive.
    char_filters = [UnicodeNormalizeCharFilter(),
                    RegexReplaceCharFilter(r'[(\)「」、。]', ' ')]
    token_filters = [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
                     ExtractAttributeFilter('base_form')]
    self.analyzer = Analyzer(char_filters, JanomeTokenizer(), token_filters)
def get_summary(self):
    """Summarize self._text (one sentence per line) down to two
    sentences with LexRank; stores the result on self.summary and
    prints the chosen original sentences."""
    # One sentence per line, so split on newlines.
    originals = self._text.split('\n')
    print(originals[0])

    # Morphological analyzer: ()「」、。 -> spaces; keep base forms of
    # nouns / adjectives / adverbs / verbs only.
    morph = Analyzer(
        [UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
         ExtractAttributeFilter('base_form')],
    )

    # Space-joined tokens; the trailing '。' lets the downstream
    # tokenizer split the corpus back into sentences.
    tokenized = [' '.join(morph.analyze(line)) + '。' for line in originals]

    parser = PlaintextParser.from_string(''.join(tokenized),
                                         Tokenizer('japanese'))

    # A space is recognized as a word, so exclude it via stop words.
    lexrank = LexRankSummarizer()
    lexrank.stop_words = [' ']
    self.summary = lexrank(document=parser.document, sentences_count=2)

    # Print the original sentence matching each tokenized pick.
    for picked in self.summary:
        print(originals[tokenized.index(str(picked))])
def __init__(self,
             root,
             fields=DOC_PATTERN,
             sent_pattern=SENT_PATTERN,
             encoding='utf8',
             **kargs):
    """
    Corpus reader that tokenizes words with janome and sentences with a
    regex.

    :param root: directory containing the corpus
    :param fields: file pattern selecting the target corpus files
    :param sent_pattern: regex used by the sentence tokenizer
    :param encoding: text encoding of the corpus files
    """
    # NOTE(review): **kargs is accepted but never forwarded to the base
    # class -- confirm whether that is intentional.
    PlaintextCorpusReader.__init__(
        self,
        root,
        fields,
        word_tokenizer=JanomeTokenizer(),
        sent_tokenizer=RegexpTokenizer(sent_pattern),
        encoding=encoding)
def lexrank_sumy(text, lang_number):
    '''Run LexRank summarization.

    text: document to summarize
    lang_number: '1' for Japanese, '2' for English
    Returns the three highest-ranked original sentences, each followed
    by a newline.
    Raises ValueError for any other lang_number (previously `corpus` and
    `parser` were left unbound and the call crashed with
    UnboundLocalError further down).
    '''
    text = text.strip()
    # Cut into sentences ending with '。' (the last may lack it).
    sentences = re.findall("[^。]+。?", text)

    # Morphological analyzer: ()「」、。 -> spaces; keep only base forms
    # of nouns / adjectives / adverbs / verbs.
    analyzer = Analyzer(
        [UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
         ExtractAttributeFilter('base_form')],
    )
    print(lang_number)
    if lang_number == '1':
        corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]
        parser = PlaintextParser.from_string(''.join(corpus),
                                             Tokenizer('japanese'))
        print("japanese")
    elif lang_number == '2':
        corpus = [' '.join(analyzer.analyze(s)) + '. ' for s in sentences]
        parser = PlaintextParser.from_string(''.join(corpus),
                                             Tokenizer('english'))
    else:
        raise ValueError("lang_number must be '1' (japanese) or "
                         "'2' (english), got %r" % (lang_number,))

    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']
    summary = summarizer(document=parser.document, sentences_count=3)

    # Map each tokenized pick back to its original sentence; joining
    # avoids the quadratic `summary_text = summary_text + ...` loop.
    return ''.join(sentences[corpus.index(str(s))] + '\n' for s in summary)
def preprocess_target(doc, debug=False):
    """Split *doc* into sentences and tokenize each into base-form
    content words.

    Returns (sentences, corpus): the raw lowercased sentences and one
    token list per sentence.
    """
    # Lowercase, split on newline or '。', drop blank segments.
    parts = [p for p in re.split("\n|。", doc.lower()) if p.strip() != ""]
    sentences = copy.deepcopy(parts)

    # Tokenize the half-width-normalized text; return the originals as-is.
    normalized = [mojimoji.zen_to_han(p) for p in parts]

    analyzer = Analyzer([
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'[(\)、。「」]', ' ')
    ], JanomeTokenizer(), [
        POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
        ExtractAttributeFilter('base_form')
    ])
    corpus = [list(analyzer.analyze(p)) for p in normalized]

    if debug:
        print(corpus)
    return sentences, corpus
def prepare_char(lang, shuffle=None, dict_limit=0):
    # Build character-level training data for sentiment classification.
    # shuffle characters in each word: word -> dorw (shuffle="shuffle")
    # or randomly swap words (shuffle="random")
    # NOTE(review): original indentation was lost; the frequency-counting
    # pass below is grouped under `if dict_limit > 0:` because its result
    # (`char_vocab` reordered by frequency) is only meaningful when a
    # dictionary limit applies -- confirm against the original file.
    texts = []  # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []  # list of label ids
    if lang == "CH":
        texts, labels, labels_index = get_ChnSenti_texts(
            "ChnSentiCorp_htl_unba_10000/")
    elif lang == "JP":
        janome_tokenizer = JanomeTokenizer()
        datasize = 10000
        texts, labels = get_Rakuten_texts(datasize)
    data_size = len(texts)
    processed_texts = []
    word_vocabulary = {}
    # build word_vocabulary: maps each word to the character sequence that
    # will represent it (identity unless shuffled below)
    for i, text in enumerate(tqdm(texts)):
        if lang == "CH":
            t_list = list(jieba.cut(text, cut_all=False))
        elif lang == "JP":
            t_list = janome_tokenizer.tokenize(text)
        processed_texts.append(t_list)
        for word in t_list:
            if lang == "JP":
                # janome returns Token objects; take the surface form
                word = word.surface
            word_vocabulary[word] = word
    if shuffle == "random":
        # swap the words' representations among each other
        word_vocabulary = shuffle_kv(word_vocabulary)
    elif shuffle == "shuffle":
        # shuffle the characters inside each word's representation
        word_vocabulary_new = {}
        for k, v in word_vocabulary.items():
            list_v = list(v)
            random.shuffle(list_v)
            word_vocabulary_new[k] = "".join(list_v)
        word_vocabulary = word_vocabulary_new
    else:
        pass
    # build data
    char_vocab = ["</s>"]  # index 0 is the padding/end symbol
    data_char = numpy.zeros((data_size, MAX_SENTENCE_LENGTH, MAX_WORD_LENGTH),
                            dtype=numpy.int32)  # data_char
    if dict_limit > 0:
        # Pre-order char_vocab by frequency so that indices below
        # dict_limit correspond to the most frequent characters.
        char_vocab_freq = {"</s>": 2**30}  # order by the freq; keeps "</s>" first
        for i, text in enumerate(tqdm(processed_texts)):
            for j, word in enumerate(text):
                if lang == "JP":
                    word = word.surface
                for k, char in enumerate(word_vocabulary[word]):
                    if char not in char_vocab:
                        char_vocab.append(char)
                        char_vocab_freq[char] = 1
                    else:
                        char_vocab_freq[char] += 1
        sorted_char_vocab_freq = sorted(char_vocab_freq.items(),
                                        key=lambda x: -x[1])
        char_vocab = [k for k, v in sorted_char_vocab_freq]
    # Fill the (sentence, word, char) index tensor, truncating sentences
    # at MAX_SENTENCE_LENGTH words and words at MAX_WORD_LENGTH chars.
    for i, text in enumerate(tqdm(processed_texts)):
        for j, word in enumerate(text):
            if lang == "JP":
                word = word.surface
            if j < MAX_SENTENCE_LENGTH:
                for k, char in enumerate(word_vocabulary[word]):
                    if char not in char_vocab:
                        char_vocab.append(char)
                        char_index = len(char_vocab) - 1
                    else:
                        char_index = char_vocab.index(char)
                    if k < MAX_WORD_LENGTH:
                        # Characters at or above dict_limit are left as 0
                        if dict_limit == 0 or char_index < dict_limit:
                            data_char[i, j, k] = char_index
    labels = to_categorical(numpy.asarray(labels))
    # split data into training and validation
    indices = numpy.arange(data_char.shape[0])
    numpy.random.shuffle(indices)
    data_char = data_char[indices]
    labels = labels[indices]
    # 80% to train, 10% to validation, 10% to test
    nb_validation_test_samples = int(
        (VALIDATION_SPLIT + TEST_SPLIT) * data_char.shape[0])
    nb_test_samples = int((TEST_SPLIT) * data_char.shape[0])
    x_train = data_char[:-nb_validation_test_samples]
    y_train = labels[:-nb_validation_test_samples]
    x_val = data_char[-nb_validation_test_samples:-nb_test_samples]
    y_val = labels[-nb_validation_test_samples:-nb_test_samples]
    x_test = data_char[-nb_test_samples:]
    y_test = labels[-nb_test_samples:]
    if dict_limit > 0:
        char_vocab_size = dict_limit
    else:
        char_vocab_size = len(char_vocab)
    return x_train, y_train, x_val, y_val, x_test, y_test, char_vocab_size
from typing import List, Union import janome import spacy from janome.tokenizer import Tokenizer as JanomeTokenizer from torchtext.data.functional import load_sp_model, sentencepiece_tokenizer from src.constants import SENTENCE_PIECE_MODEL_PATH from src.utils import log_decorator janome_tokenizer = JanomeTokenizer() ginza_tokenizer = spacy.load("ja_ginza") sp_model = load_sp_model(SENTENCE_PIECE_MODEL_PATH) sp_tokens_generator = sentencepiece_tokenizer(sp_model) @log_decorator def wakachi_by_sentencepiece(sentence: str) -> List[str]: """SentencePieceを用いた分かち書き Parameters ---------- sentence : str 分かち書きしたい文章 Returns ------- List[str] 分かち書き結果 """
def __init__(self):
    """Create and hold the janome tokenizer instance."""
    # Constructed once here so callers reuse the same tokenizer.
    self._t = JanomeTokenizer()
@it.should('JapaneseTokenizer fit_transform method') def mecab_wakati_test3(case): it.assertListEqual( mecab_wakati_tokenizer.fit_transform(mecab_wakati_test_str), mecab_wakati_test_result) @it.should('CountVectorizer use JapaneseTokenizer') def mecab_wakati_test4(case): it.assertTrue( np.alltrue(mecab_wakati_result0.todense() == mecab_wakati_result1.todense())) with it.having('janome wakati only JapaneseTokenizer test group'): janome_wakati_tokenizer = JapaneseTokenizer('janome') janome_wakati_test_str = 'すもももももももものうち' janome_wakati_test_tokenizer = JanomeTokenizer() janome_wakati_test_result = janome_wakati_test_tokenizer.tokenize( janome_wakati_test_str, wakati=True) janome_wakati_test_count_vectorizer = CountVectorizer( tokenizer=janome_wakati_tokenizer.fit_transform) janome_wakati_vectorizer_test = ['すもももすももももももすもももももももももすももももものうち'] janome_wakati_result0 = janome_wakati_test_count_vectorizer.fit_transform( janome_wakati_vectorizer_test).sorted_indices() janome_wakati_result1 = csr_matrix( ([1, 4, 1, 7, 4], ([0, 0, 0, 0, 0], [0, 1, 2, 3, 4])), shape=(1, 5)) @it.should('JapaneseTokenizer fit method') def janome_wakati__test1(case): it.assertIsInstance( janome_wakati_tokenizer.fit(janome_wakati_test_str),
text = """多くの会社(店)で営業マンに日報を書かせていることと思います。ですが、何のために日報を書かせているのか、もう一度確認してください。営業マンは社外にいる時間が多く、その行動を把握することはできません。そこで営業マンの行動を把握するために、1日の行動記録を日報にして提出させる場合が多いようですが、日報というのは行動記録を書くことなのでしょうか。そして、営業マンに行動記録をかかせることに、どれだけの意味があるのでしょう。例えば、毎日10件の顧客を訪問している営業マンと、毎日5件訪問している営業マンでは、どちらが評価できるでしょうか。きっと、多くのマネジャーが「もちろん10件の顧客を訪問している営業マンに決まっている」と答えるでしょう。しかし、訪問件数を多くすることだけを考え、準備もそこそこに、休む間もなく得意先を回っているかもしれません。 営業マンにとって問題意識をもつことは基本です。""" # 1行1文となっているため、改行コードで分離 sentences = [t for t in text.split('\n')] for i in range(2): print(sentences[i]) # 転職 Advent Calendar 2016 - Qiitaの14日目となります。 少しポエムも含みます。 # 今年11月にSIerからWebサービスの会社へ転職しました。 # 形態素解析器を作る analyzer = Analyzer( [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(r'[(\)「」、。]', ' ')], # ()「」、。は全てスペースに置き換える JanomeTokenizer(), [ POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']), ExtractAttributeFilter('base_form') ] # 名詞・形容詞・副詞・動詞の原型のみ ) # 抽出された単語をスペースで連結 # 末尾の'。'は、この後使うtinysegmenterで文として分離させるため。 corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences] for i in range(2): print(corpus[i]) # 転職 Advent Calendar 2016 - Qiita 14 日 目 なる 少し ポエム 含む。 # 今年 11 月 SIer Web サービス 会社 転職 する from sumy.parsers.plaintext import PlaintextParser
def prepare_word(lang, dict_limit=0):
    """Build word-level training data for sentiment classification.

    Parameters
    ----------
    lang : str
        "CH" (Chinese hotel reviews, jieba tokenization) or
        "JP" (Rakuten reviews, janome tokenization).
    dict_limit : int
        When > 0, only the `dict_limit` most frequent words receive
        indices in the data tensor (others stay 0) and the returned
        vocabulary size is capped at this value.

    Returns
    -------
    (x_train, y_train, x_val, y_val, x_test, y_test, vocab_size)
        80/10/10 train/validation/test split of the word-index tensor
        and one-hot labels, plus the vocabulary size.
    """
    texts = []  # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []  # list of label ids
    if lang == "CH":
        texts, labels, labels_index = get_ChnSenti_texts(
            "ChnSentiCorp_htl_unba_10000/")
    elif lang == "JP":
        janome_tokenizer = JanomeTokenizer()
        datasize = 10000
        texts, labels = get_Rakuten_texts(datasize)
    data_size = len(texts)
    processed_texts = []
    word_freq = {}
    # Tokenize every text and count word frequencies.
    for i, text in enumerate(tqdm(texts)):
        if lang == "CH":
            t_list = list(jieba.cut(text, cut_all=False))
        elif lang == "JP":
            t_list = janome_tokenizer.tokenize(text)
        processed_texts.append(t_list)
        for word in t_list:
            if lang == "JP":
                # janome returns Token objects; use the surface form
                word = word.surface
            # PERF FIX: `word not in list(word_freq.keys())` rebuilt a
            # list on every token (quadratic); dict membership is O(1)
            # and behaves identically.
            if word not in word_freq:
                word_freq[word] = 1
            else:
                word_freq[word] += 1
    # Vocabulary ordered by descending frequency, "</s>" fixed at index 0.
    sorted_vocab_freq = sorted(word_freq.items(), key=lambda x: -x[1])
    word_vocab = ["</s>"] + [k for k, v in sorted_vocab_freq]
    # PERF FIX: word_vocab.index(word) inside the fill loop was O(V) per
    # token; a position lookup table gives the same indices in O(1).
    # (word_vocab entries are unique, so positions match .index exactly.)
    word_to_index = {w: idx for idx, w in enumerate(word_vocab)}
    data_char = numpy.zeros((data_size, MAX_SENTENCE_LENGTH),
                            dtype=numpy.int32)  # data_char
    # Fill the (sentence, word) index matrix, truncating sentences at
    # MAX_SENTENCE_LENGTH words.
    for i, text in enumerate(tqdm(processed_texts)):
        for j, word in enumerate(text):
            if lang == "JP":
                word = word.surface
            word_index = word_to_index[word]
            if j < MAX_SENTENCE_LENGTH:
                # Words at or above dict_limit are left as 0 ("</s>").
                if dict_limit == 0 or word_index < dict_limit:
                    data_char[i, j] = word_index
    labels = to_categorical(numpy.asarray(labels))
    # split data into training and validation
    indices = numpy.arange(data_char.shape[0])
    numpy.random.shuffle(indices)
    data_char = data_char[indices]
    labels = labels[indices]
    # 80% to train, 10% to validation, 10% to test
    nb_validation_test_samples = int(
        (VALIDATION_SPLIT + TEST_SPLIT) * data_char.shape[0])
    nb_test_samples = int((TEST_SPLIT) * data_char.shape[0])
    x_train = data_char[:-nb_validation_test_samples]
    y_train = labels[:-nb_validation_test_samples]
    x_val = data_char[-nb_validation_test_samples:-nb_test_samples]
    y_val = labels[-nb_validation_test_samples:-nb_test_samples]
    x_test = data_char[-nb_test_samples:]
    y_test = labels[-nb_test_samples:]
    if dict_limit > 0:
        vocab_size = dict_limit
    else:
        vocab_size = len(word_vocab)
    return x_train, y_train, x_val, y_val, x_test, y_test, vocab_size