def lexrank(topic, level, node, original_docs, dir='./result/summary/hlda/'):
    n_words = 10
    with_weights = False

    # Flatten each interview into one string, one line per remark.
    docs = []
    for arr in original_docs:
        tmp_docs = []
        for speaker, remark in arr:
            tmp_docs.append(remark)
        docs.append('\n'.join(tmp_docs))

    # TF-IDF sentence vectors for the LexRank similarity graph.
    tfidf = TfidfModel(no_below=0, no_above=1.0, keep_n=100000)
    docs_for_training = [stems(doc) for doc in docs]
    tfidf.train(docs_for_training)
    sent_vecs = tfidf.to_vector(docs_for_training)

    # Display
    print('=== Summary ===')

    # Summarize: pick the top-ranked documents.
    indexes = summarize(docs, sent_vecs, sort_type='normal', sent_limit=5, threshold=0.1)
    docs_summary = [original_docs[i] for i in indexes]

    # Build the output directory from the node's path in the hLDA tree.
    path = []
    node_parent = node.parent
    while node_parent is not None:
        path.append(node_parent.node_id)
        node_parent = node_parent.parent
    path.reverse()
    for node_id in path:
        dir += '/topic_' + str(node_id)
    if not os.path.exists(dir):
        os.makedirs(dir)

    with open(dir + '/topic_' + str(topic) + '.txt', 'w') as f:
        # Write the parent, the current node, and its children with their top words.
        node_parent = node.parent
        msg = 'topic=%d level=%d (documents=%d): ' % (
            node_parent.node_id, node_parent.level, node_parent.customers)
        msg += node_parent.get_top_words(n_words, with_weights)
        print(msg, file=f)
        msg = '    topic=%d level=%d (documents=%d): ' % (
            node.node_id, node.level, node.customers)
        msg += node.get_top_words(n_words, with_weights)
        print(msg, file=f)
        for node_child in node.children:
            msg = '        topic=%d level=%d (documents=%d): ' % (
                node_child.node_id, node_child.level, node_child.customers)
            msg += node_child.get_top_words(n_words, with_weights)
            print(msg, file=f)
        print('-------------------------------', file=f)

        # Write the selected documents themselves.
        for i, doc in enumerate(docs_summary):
            print('', file=f)
            print(str(i + 1) + ':', file=f)
            for speaker, remark in doc:
                print(speaker + ' ' + remark, file=f)
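# --- Sketch: the LexRank ranking behind `summarize` --------------------------
# `summarize` is defined elsewhere in this repo. As a minimal, self-contained
# illustration of the algorithm it applies (Erkan & Radev, 2004), the function
# below scores sentences by the stationary distribution of a thresholded
# cosine-similarity graph. `lexrank_scores` and its parameters are illustrative
# names, not the repo's API.
import numpy as np

def lexrank_scores(sent_vecs, threshold=0.1, damping=0.85, tol=1e-6):
    vecs = np.asarray(sent_vecs, dtype=float)
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    sim = (vecs / norms) @ (vecs / norms).T   # cosine similarity matrix
    adj = (sim > threshold).astype(float)     # threshold into an adjacency matrix
    adj /= adj.sum(axis=1, keepdims=True)     # row-stochastic transition matrix
    scores = np.full(len(vecs), 1.0 / len(vecs))
    while True:                               # power iteration to the stationary distribution
        new = (1 - damping) / len(vecs) + damping * (adj.T @ scores)
        if np.abs(new - scores).sum() < tol:
            return new
        scores = new

# Usage, e.g.: indexes = np.argsort(-lexrank_scores(sent_vecs))[:5]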
def lexrank(original_docs, topic, dictionary, dir='./result/lda/summary/'):
    # Flatten each interview into one string, one line per remark.
    docs = []
    for doc in original_docs:
        tmp_docs = []
        for speaker, remark in doc:
            tmp_docs.append(remark)
        docs.append('\n'.join(tmp_docs))

    # For training: TF-IDF sentence vectors for the similarity graph.
    tfidf = TfidfModel(no_below=0, no_above=1.0, keep_n=100000)
    docs_for_training = [stems(doc) for doc in docs]
    tfidf.train(docs_for_training)
    sent_vecs = tfidf.to_vector(docs_for_training)

    # For the dictionary: stopword-filtered stems, mapped to bag-of-words.
    sw = stopwords()
    docs_for_dict = [stems(doc, polish=True, sw=sw) for doc in docs]
    corpus = list(map(dictionary.doc2bow, docs_for_dict))

    # Display
    print('=== Summary ===')

    # Summarize: pick the top-ranked documents and their topic distributions.
    indexes = summarize(docs, sent_vecs, sort_type='normal', sent_limit=10, threshold=0.1)
    docs_summary = [original_docs[i] for i in indexes]
    probs_summary = [lda[corpus[i]] for i in indexes]  # `lda` must be in scope (e.g. a module-level LdaModel)

    if not os.path.exists(dir):
        os.makedirs(dir)
    with open(dir + '/topic_' + str(topic + 1) + '.txt', 'w') as f:
        for i, (doc, prob) in enumerate(zip(docs_summary, probs_summary), start=1):
            print('-' * 80, file=f)
            print(str(i) + ':', file=f)
            print([(t + 1, p) for t, p in prob], file=f)
            for speaker, remark in doc:
                print(speaker + ' ' + remark, file=f)
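# --- Sketch: where `lda`, `dictionary`, and `corpus` come from ---------------
# A hedged illustration using gensim's actual LdaModel API, with toy tokenized
# documents standing in for whatever `stems` produces. Variable names mirror
# the function above; the toy data and num_topics are illustrative only.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs_for_dict = [['出身', 'どこ'], ['好き', '食べもの', '好き']]  # toy tokenized docs
dictionary = Dictionary(docs_for_dict)
corpus = [dictionary.doc2bow(doc) for doc in docs_for_dict]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, random_state=0)
print(lda[corpus[0]])  # list of (topic_id, probability) pairs, as printed above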
def load_model(model_type, segmentation_type, docs):
    if model_type == 'tfidf':
        # TF-IDF model
        model = TfidfModel(no_below=2, no_above=1.0, keep_n=100000)
        model.train(docs)
    elif model_type == 'doc2vec':
        model = Doc2Vec(alpha=0.025, min_count=10, vector_size=300, epochs=50, workers=4)
        # model.load_model('./model/doc2vec/doc2vec_' + str(model.vector_size) + '.model')
        # model.load_model('./model/doc2vec/doc2vec_wiki.model')
        model.load_model('./model/doc2vec/updated_doc2vec_300.model')
    elif model_type == 'word2vec':
        model = Word2Vec(alpha=0.025, min_count=10, vector_size=200, epochs=50, workers=4)
        model.load_model('./model/word2vec/word2vec_' + str(model.vector_size) + '.model')
        # model.load_model('./model/word2vec/word2vec_wiki.model')
        # model.load_model('./model/word2vec/updated_word2vec_50.model')
    else:
        print('Invalid model type')
        exit()

    if segmentation_type == 'text_tiling':
        segmentation_model = TextTiling(window_size=window_size, p_limit=0.1, a=0.5, model=model)
    elif segmentation_type == 'lcseg':
        segmentation_model = LexicalCohesionSegmentation(
            window_size=window_size, hiatus=11, p_limit=0.1, a=1.0, model=model)
    else:
        print('Invalid segment type')
        exit()

    return model, segmentation_model
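# --- Sketch: calling `load_model` --------------------------------------------
# A minimal usage example. `load_model` reads `window_size` from module scope,
# so the caller must define it; the value and the toy documents here are
# illustrative. What the returned segmentation model is applied to afterwards
# is outside this snippet.
from lib.utils import stems

window_size = 10  # assumed value, read by load_model from module scope
raw_docs = ['出身はどこですか', '好きな食べものは何ですか']
docs = [stems(doc) for doc in raw_docs]
model, segmentation_model = load_model('tfidf', 'text_tiling', docs)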
    ['出身は', 'どこ', 'ですか', ...],
    ['好き', 'な', '食べもの', ...],
    ...
]
"""
print(data[:5])
print(docs[:5])
print(docs_for_train[:5])
print('Done')

if model_type == 'tfidf':
    # Build the TF-IDF model:
    # vectorize sentences with gensim's TF-IDF model.
    print('=== Building TF-IDF model ===')
    print('Train tfidf model')
    tfidf = TfidfModel(no_below=3, no_above=0.5, keep_n=100000)
    tfidf.train(docs_for_train)
    tfidf.save_model()
    print('Done')
elif model_type == 'doc2vec':
    print('===Doc2Vec===')
    label = [row[0] for row in data]
    label_docs = list(range(len(docs)))
    doc2vec = Doc2Vec(alpha=0.025, min_count=10, vector_size=300, epochs=50, workers=4)
    if update:
        print('Update doc2vec model')
        label_docs = [False for x in range(len(docs))]
        doc2vec.load_model('./model/doc2vec/doc2vec_wiki.model')
        doc2vec.update(docs_for_train, label_docs)
    else:
data = [scraping.get_doc(doc_num)]
print('Done')

# Summarization unit: sentences or remarks
# to sentence
if sum_type == 'sentence':
    data = utils.to_sentence(data)

# For summarization
docs = [row[1] for row in data]
print(docs[:1])

if model_type == 'tfidf':
    # Vectorize sentences with gensim's TF-IDF model.
    tfidf = TfidfModel(no_below=10, no_above=0.1, keep_n=100000)
    tfidf.load_model()
    sent_vecs = tfidf.to_vector([stems(doc) for doc in docs])
elif model_type == 'doc2vec':
    # === Doc2Vec ===
    doc2vec = Doc2Vec(alpha=0.025, min_count=10, vector_size=300, epochs=50, workers=4)
    model_path = './model/doc2vec/doc2vec_' + str(doc2vec.vector_size) + '.model'
    doc2vec.load_model(model_path)
    sent_vecs = doc2vec.to_vector([stems(doc) for doc in docs])
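# --- Sketch: the Doc2Vec vectorization behind the wrapper ---------------------
# The repo's Doc2Vec class wraps gensim; `load_model` and `to_vector` are its
# methods. The gensim equivalents look roughly like this. Toy data only;
# min_count is lowered to 1 so the tiny vocabulary survives.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tokenized = [['出身', 'どこ'], ['好き', '食べもの']]  # toy tokenized docs
tagged = [TaggedDocument(words, [i]) for i, words in enumerate(tokenized)]
model = Doc2Vec(vector_size=300, alpha=0.025, min_count=1, epochs=50, workers=4)
model.build_vocab(tagged)
model.train(tagged, total_examples=model.corpus_count, epochs=model.epochs)
sent_vecs = [model.infer_vector(words) for words in tokenized]  # one vector per doc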
num_levels = 4          # the number of levels in the tree
display_topics = 100    # the number of iterations between printing a brief summary of the topics so far
n_words = 5             # the number of most probable words to print for each topic after model estimation
with_weights = False    # whether to print the words with the weights

eta_for_path = ''.join(str(eta).split('.'))
alpha_for_path = str(alpha).split('.')[0]
path = './model/hlda/level_' + str(num_levels) + '_alpha_' + alpha_for_path \
    + '_eta_' + eta_for_path + '_interview_' + str(doc_num) + '.p'
hlda = load_zipped_pickle(path)

level = 2
res = get_docs_topic(hlda, docs, level=level)
print(res.keys())

# TF-IDF: score each topic's words against the other topics at this level.
docs_for_tfidf = []
for i, v in res.items():
    docs_for_tfidf.append('\n'.join(v))
print(len(docs_for_tfidf))

docs_for_tfidf = [stems(doc) for doc in docs_for_tfidf]
tfidf = TfidfModel(no_below=0, no_above=1.0, keep_n=100000)
tfidf.train(docs_for_tfidf)
dictionary = tfidf.dictionary
corpus = tfidf.corpus
corpus_tfidf = tfidf.model[tfidf.corpus]

dir = './model/tfidf/hlda/levels_' + str(num_levels) + '_alpha_' + alpha_for_path \
    + '_eta_' + eta_for_path + '_interview_' + str(doc_num) + '/level_' + str(level) + '/'
tfidf.save_model(dir=dir)
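# --- Sketch: per-topic characteristic words via gensim TF-IDF -----------------
# What the wrapper above computes, shown with gensim's own API: each topic's
# concatenated documents become one "document", and high TF-IDF terms are the
# words that distinguish a topic from the others at this level. The wrapper is
# aliased to avoid clashing with the repo's TfidfModel; toy data only.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel as GensimTfidf

topic_docs = [['大学', '研究', '研究'], ['趣味', '音楽']]  # toy: one token list per topic
dictionary = Dictionary(topic_docs)
corpus = [dictionary.doc2bow(doc) for doc in topic_docs]
corpus_tfidf = GensimTfidf(corpus)[corpus]
for topic_id, weights in enumerate(corpus_tfidf):
    top = sorted(weights, key=lambda x: -x[1])[:5]
    print(topic_id, [(dictionary[w], round(p, 3)) for w, p in top])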
    ['出身は', 'どこ', 'ですか', ...],
    ['好き', 'な', '食べもの', ...],
    ...
]
"""
print(data[:3])
print(docs[:1])
print(docs_for_train[:1])
print('Done')

if model_type == 'tfidf':
    # Build the TF-IDF model:
    # vectorize sentences with gensim's TF-IDF model.
    print('=== Building TF-IDF model ===')
    print('Train tfidf model')
    tfidf = TfidfModel(no_below=10, no_above=0.1, keep_n=100000)
    tfidf.train(docs_for_train)
    print('Done')
elif model_type == 'doc2vec':
    print('===Doc2Vec===')
    label = [row[0] for row in data]
    label_docs = list(range(len(docs)))
    doc2vec = Doc2Vec(alpha=0.025, min_count=10, vector_size=300, epochs=50, workers=4)
    if update:
        print('Update doc2vec model')
        label_docs = [False for x in range(len(docs))]
from lib.tfidf import TfidfModel  # assumed module path; the original import for TfidfModel is not shown
from lib.utils import stems
from lib.text_tiling import TextTiling
from lib import utils
import datetime
import sys

if __name__ == '__main__':
    # Hyperparameters
    train = False
    no_below = 10
    no_above = 0.1
    keep_n = 100000
    tfidf = TfidfModel(no_below=no_below, no_above=no_above, keep_n=keep_n, train=train)

    # docs: the full set of interviews
    print('Load data')
    path = './data/test.txt'
    # path = './data/interview-text_01-26_all.txt'
    data = utils.load_data(path)
    data = utils.to_sentence(data)
    docs = [row[1] for row in data]
    print('Done')

    # When training the model
    if train:
        print('=== Building TF-IDF model ===')
        print('Train')
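# --- Sketch: a stand-in for `stems` -------------------------------------------
# `stems` (from lib.utils) is the repo's Japanese tokenizer; its implementation
# is not shown in these snippets. Assuming it yields MeCab tokens, a minimal
# stand-in could look like this. `stems_sketch` is an illustrative name, and
# the real function also supports `polish` and stopword arguments seen above.
import MeCab

def stems_sketch(text):
    # '-Owakati' emits space-separated surface forms (wakati-gaki).
    tagger = MeCab.Tagger('-Owakati')
    return tagger.parse(text).strip().split()

# stems_sketch('出身はどこですか') -> ['出身', 'は', 'どこ', 'です', 'か'] (typical IPADIC segmentation)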