import os

# TfidfModel, stems, and summarize are this project's helpers (see the lib
# package imported in the later excerpts); their imports are omitted on this page.
def lexrank(topic, level, node, original_docs, dir='./result/summary/hlda/'):
    n_words = 10
    with_weights = False
    docs = []
    for arr in original_docs:
        tmp_docs = []
        for speaker, remark in arr:
            tmp_docs.append(remark)
        docs.append('\n'.join(tmp_docs))

    tfidf = TfidfModel(no_below=0, no_above=1.0, keep_n=100000)
    docs_for_training = [stems(doc) for doc in docs]
    tfidf.train(docs_for_training)
    sent_vecs = tfidf.to_vector(docs_for_training)
    # Display
    print('===Summary===')
    # Summarize
    indexes = summarize(docs,
                        sent_vecs,
                        sort_type='normal',
                        sent_limit=5,
                        threshold=0.1)
    docs_summary = [original_docs[i] for i in indexes]

    path = []
    node_parent = node.parent
    while node_parent is not None:
        path.append(node_parent.node_id)
        node_parent = node_parent.parent
    path.reverse()
    for node_id in path:
        dir += '/topic_' + str(node_id)
    os.makedirs(dir, exist_ok=True)
    with open(dir + '/topic_' + str(topic) + '.txt', 'w') as f:
        node_parent = node.parent
        msg = 'topic=%d level=%d (documents=%d): ' % (
            node_parent.node_id, node_parent.level, node_parent.customers)
        msg += node_parent.get_top_words(n_words, with_weights)
        print(msg, file=f)
        msg = '    topic=%d level=%d (documents=%d): ' % (
            node.node_id, node.level, node.customers)
        msg += node.get_top_words(n_words, with_weights)
        print(msg, file=f)
        for node_child in node.children:
            msg = '        topic=%d level=%d (documents=%d): ' % (
                node_child.node_id, node_child.level, node_child.customers)
            msg += node_child.get_top_words(n_words, with_weights)
            print(msg, file=f)
        print('-------------------------------', file=f)
        for i, doc in enumerate(docs_summary):
            print('', file=f)
            print(str(i + 1) + ':', file=f)
            for speaker, remark in doc:
                print(speaker + ' ' + remark, file=f)
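The summarize() helper used above is the project's LexRank implementation and is not shown in these excerpts. As a rough, self-contained sketch of what LexRank-style ranking computes (a hypothetical stand-in using scikit-learn and networkx, not the project's code):

import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def lexrank_indexes(sentences, sent_limit=5, threshold=0.1):
    # TF-IDF vectors per sentence, then pairwise cosine similarity
    sim = cosine_similarity(TfidfVectorizer().fit_transform(sentences))
    sim[sim < threshold] = 0.0  # drop weak edges, mirroring threshold=0.1 above
    # PageRank over the similarity graph scores sentence centrality
    scores = nx.pagerank(nx.from_numpy_array(sim))
    top = sorted(scores, key=scores.get, reverse=True)[:sent_limit]
    return sorted(top)  # return indexes in document order

print(lexrank_indexes(['the cat sat on the mat',
                       'a cat sat quietly on a mat',
                       'dogs bark loudly at night'], sent_limit=2))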
Example 2
# `lda` (used below) is the trained gensim LDA model; in the original file it
# was presumably defined at module level, so it is passed in explicitly here.
def lexrank(original_docs, topic, dictionary, lda, dir='./result/lda/summary/'):
    docs = []
    for doc in original_docs:
        tmp_docs = []
        for speaker, remark in doc:
            tmp_docs.append(remark)
        docs.append('\n'.join(tmp_docs))

    # for training
    tfidf = TfidfModel(no_below=0, no_above=1.0, keep_n=100000)
    docs_for_training = [stems(doc) for doc in docs]
    tfidf.train(docs_for_training)
    sent_vecs = tfidf.to_vector(docs_for_training)

    # for dict
    sw = stopwords()
    docs_for_dict = [stems(doc, polish=True, sw=sw) for doc in docs]
    corpus = list(map(dictionary.doc2bow, docs_for_dict))

    # Display
    print('===Summary===')
    # Summarize
    indexes = summarize(docs,
                        sent_vecs,
                        sort_type='normal',
                        sent_limit=10,
                        threshold=0.1)
    docs_summary = [original_docs[i] for i in indexes]
    probs_summary = [lda[corpus[i]] for i in indexes]

    os.makedirs(dir, exist_ok=True)
    with open(dir + '/topic_' + str(topic + 1) + '.txt', 'w') as f:
        for i, (doc, prob) in enumerate(zip(docs_summary, probs_summary), start=1):
            print("-" * 80, file=f)
            print(str(i) + ':', file=f)
            print([(t + 1, p) for t, p in prob], file=f)
            for speaker, remark in doc:
                print(speaker + ' ' + remark, file=f)
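The `lda[corpus[i]]` lookup above is gensim's standard way of reading off a document's topic distribution. A minimal standalone example of that API on a toy corpus (the texts here are invented for illustration):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [['cat', 'sat', 'mat'],
         ['dog', 'barked', 'loudly'],
         ['cat', 'dog', 'played']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, random_state=0)
# lda[bow] yields (topic_id, probability) pairs, which is what probs_summary stores
print(lda[corpus[0]])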
def load_model(model_type, segmentation_type, docs):
    if model_type == 'tfidf':
        # TF-IDF model
        model = TfidfModel(no_below=2, no_above=1.0, keep_n=100000)
        model.train(docs)
    elif model_type == 'doc2vec':
        model = Doc2Vec(alpha=0.025,
                        min_count=10,
                        vector_size=300,
                        epochs=50,
                        workers=4)
        # model.load_model('./model/doc2vec/doc2vec_' + str(model.vector_size) + '.model')
        # model.load_model('./model/doc2vec/doc2vec_wiki.model')
        model.load_model('./model/doc2vec/updated_doc2vec_300.model')
    elif model_type == 'word2vec':
        model = Word2Vec(alpha=0.025,
                         min_count=10,
                         vector_size=200,
                         epochs=50,
                         workers=4)
        model.load_model('./model/word2vec/word2vec_' +
                         str(model.vector_size) + '.model')
        # model.load_model('./model/word2vec/word2vec_wiki.model')
        # model.load_model('./model/word2vec/updated_word2vec_50.model')
    else:
        print('Invalid model type')
        exit()

    # `window_size` is assumed to be defined at module level in the original file
    if segmentation_type == 'text_tiling':
        segmentation_model = TextTiling(window_size=window_size,
                                        p_limit=0.1,
                                        a=0.5,
                                        model=model)
    elif segmentation_type == 'lcseg':
        segmentation_model = LexicalCohesionSegmentation(
            window_size=window_size,
            hiatus=11,
            p_limit=0.1,
            a=1.0,
            model=model)
    else:
        print('Invalid segment type')
        exit()

    return model, segmentation_model
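The TextTiling and LexicalCohesionSegmentation classes are project code and are not included in these excerpts. As a rough sketch of the underlying TextTiling idea (compare adjacent token windows and cut where lexical similarity dips), assuming only the standard library:

from collections import Counter
import math

def _cosine(a, b):
    num = sum(a[w] * b[w] for w in set(a) & set(b))
    den = math.sqrt(sum(v * v for v in a.values())) * math.sqrt(sum(v * v for v in b.values()))
    return num / den if den else 0.0

def texttiling_boundaries(tokens, window_size=20, p_limit=0.1):
    gaps = list(range(window_size, len(tokens) - window_size))
    sims = [_cosine(Counter(tokens[g - window_size:g]),
                    Counter(tokens[g:g + window_size])) for g in gaps]
    boundaries = []
    for i in range(1, len(sims) - 1):
        # depth score: how far the similarity curve dips below its neighbors
        depth = (max(sims[:i + 1]) - sims[i]) + (max(sims[i:]) - sims[i])
        if depth > p_limit:
            boundaries.append(gaps[i])
    return boundaries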
Example 4
    ['出身は', 'どこ', 'ですか' ...
    ['好き', 'な', '食べもの', ...
    ...
    ]
    """
    print(data[:5])
    print(docs[:5])
    print(docs_for_train[:5])
    print('Done')

    if model_type == 'tfidf':
        # Build the TF-IDF model
        # Vectorize sentences with gensim's TF-IDF model
        print('===Build TF-IDF model===')
        print('Train tfidf model')
        tfidf = TfidfModel(no_below=3, no_above=0.5, keep_n=100000)
        tfidf.train(docs_for_train)
        tfidf.save_model()
        print('Done')

    elif model_type == 'doc2vec':
        print('===Doc2Vec===')
        label = [row[0] for row in data]
        label_docs = list(range(len(docs)))
        doc2vec = Doc2Vec(alpha=0.025, min_count=10, vector_size=300, epochs=50, workers=4)
        if update:
            print('Update doc2vec model')
            label_docs = [False for x in range(len(docs))]
            doc2vec.load_model('./model/doc2vec/doc2vec_wiki.model')
            doc2vec.update(docs_for_train, label_docs)
        else:
Example 5
    data = [scraping.get_doc(doc_num)]

    print('Done')

    # Summarization unit: sentence or remark
    # to sentence
    if sum_type == 'sentence':
        data = utils.to_sentence(data)

    # for sum
    docs = [row[1] for row in data]
    print(docs[:1])

    if model_type == 'tfidf':
        # Vectorize sentences with gensim's TF-IDF model
        tfidf = TfidfModel(no_below=10, no_above=0.1, keep_n=100000)
        tfidf.load_model()
        sent_vecs = tfidf.to_vector([stems(doc) for doc in docs])

    elif model_type == 'doc2vec':
        # ===Doc2Vec===
        doc2vec = Doc2Vec(alpha=0.025,
                          min_count=10,
                          vector_size=300,
                          epochs=50,
                          workers=4)
        model_path = './model/doc2vec/doc2vec_' + str(
            doc2vec.vector_size) + '.model'
        doc2vec.load_model(model_path)
        sent_vecs = doc2vec.to_vector([stems(doc) for doc in docs])
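If the Doc2Vec wrapper here follows gensim's Doc2Vec, the equivalent raw-gensim steps for training and vectorizing look roughly like this (toy data; the correspondence is assumed, not confirmed by these excerpts):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged = [TaggedDocument(words=['cat', 'sat', 'mat'], tags=[0]),
          TaggedDocument(words=['dog', 'barked'], tags=[1])]
model = Doc2Vec(tagged, vector_size=300, alpha=0.025, min_count=1, epochs=50, workers=4)
# infer_vector maps an unseen tokenized document into the same vector space
vec = model.infer_vector(['cat', 'sat', 'quietly'])
print(vec.shape)  # (300,)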
Example 6
    num_levels = 4        # the number of levels in the tree
    display_topics = 100   # the number of iterations between printing a brief summary of the topics so far
    n_words = 5           # the number of most probable words to print for each topic after model estimation
    with_weights = False  # whether to print the words with the weights

    eta_for_path = ''.join(str(eta).split('.'))
    alpha_for_path = str(alpha).split('.')[0]
    path = './model/hlda/level_' + str(num_levels) + '_alpha_' + alpha_for_path + '_eta_' + eta_for_path + '_interview_' + str(doc_num) + '.p'

    hlda = load_zipped_pickle(path)

    level = 2
    res = get_docs_topic(hlda, docs, level=level)
    print(res.keys())
    docs_for_tfidf = []

    # tfidf
    # Compute TF-IDF scores relative to the other topics
    for i, v in res.items():
        docs_for_tfidf.append('\n'.join(v))
    print(len(docs_for_tfidf))

    docs_for_tfidf = [stems(doc) for doc in docs_for_tfidf]
    tfidf = TfidfModel(no_below=0, no_above=1.0, keep_n=100000)
    tfidf.train(docs_for_tfidf)
    dictionary = tfidf.dictionary
    corpus = tfidf.corpus
    corpus_tfidf = tfidf.model[tfidf.corpus]
    dir = './model/tfidf/hlda/levels_' + str(num_levels) + '_alpha_' + alpha_for_path + '_eta_' + eta_for_path + '_interview_' + str(doc_num) + '/level_' + str(level) + '/'
    tfidf.save_model(dir=dir)
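The custom TfidfModel wrapper used throughout these examples (with its no_below / no_above / keep_n arguments and dictionary / corpus / model attributes) is not shown; a plausible minimal equivalent built directly on gensim would be:

from gensim.corpora import Dictionary
from gensim.models import TfidfModel as GensimTfidf

def build_tfidf(tokenized_docs, no_below=0, no_above=1.0, keep_n=100000):
    dictionary = Dictionary(tokenized_docs)
    # prune rare/common terms, matching the wrapper's constructor arguments
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
    return GensimTfidf(corpus), dictionary, corpus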
Example 7
    ['出身は', 'どこ', 'ですか' ...
    ['好き', 'な', '食べもの', ...
    ...
    ]
    """
    print(data[:3])
    print(docs[:1])
    print(docs_for_train[:1])
    print('Done')

    if model_type == 'tfidf':
        # Build the TF-IDF model
        # Vectorize sentences with gensim's TF-IDF model
        print('===Build TF-IDF model===')
        print('Train tfidf model')
        tfidf = TfidfModel(no_below=10, no_above=0.1, keep_n=100000)
        tfidf.train(docs_for_train)
        print('Done')

    elif model_type == 'doc2vec':
        print('===Doc2Vec===')
        label = [row[0] for row in data]
        label_docs = list(range(len(docs)))
        doc2vec = Doc2Vec(alpha=0.025,
                          min_count=10,
                          vector_size=300,
                          epochs=50,
                          workers=4)
        if update:
            print('Update doc2vec model')
            label_docs = [False for x in range(len(docs))]
Example 8
from lib.utils import stems
from lib.text_tiling import TextTiling
from lib import utils
import datetime
import sys

if __name__ == '__main__':

    # Hyperparameters
    train = False
    no_below = 10
    no_above = 0.1
    keep_n = 100000

    tfidf = TfidfModel(no_below=no_below,
                       no_above=no_above,
                       keep_n=keep_n,
                       train=train)

    # docs: the full interview text
    print('Load data')
    path = './data/test.txt'
    # path = './data/interview-text_01-26_all.txt'
    data = utils.load_data(path)
    data = utils.to_sentence(data)
    docs = [row[1] for row in data]
    print('Done')

    # When training the model
    if train:
        print('===Build TF-IDF model===')
        print('Train')