def main_segmentation(doc_num,
                      window_size,
                      model_type,
                      doc_type,
                      segmentation_type,
                      eval=False):
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'

    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)

    docs = [row[1] for row in data]
    label = [row[0] for row in data]
    print(data[:5])
    print('Done')

    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    model, segmentation_model = load_model(model_type, segmentation_type)

    # === Result ===
    print('=== Results ===')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print(segmentation_model.sim_arr)
def lexrank(topic, level, node, original_docs, dir='./result/summary/hlda/'):
    n_words = 10
    with_weights = False
    docs = []
    for arr in original_docs:
        tmp_docs = []
        for speaker, remark in arr:
            tmp_docs.append(remark)
        docs.append('\n'.join(tmp_docs))

    tfidf = TfidfModel(no_below=0, no_above=1.0, keep_n=100000)
    docs_for_training = [stems(doc) for doc in docs]
    tfidf.train(docs_for_training)
    sent_vecs = tfidf.to_vector(docs_for_training)
    # Display
    print('=== Summary ===')
    # Summarize
    indexes = summarize(docs,
                        sent_vecs,
                        sort_type='normal',
                        sent_limit=5,
                        threshold=0.1)
    docs_summary = [original_docs[i] for i in indexes]

    path = []
    node_parent = node.parent
    while node_parent is not None:
        path.append(node_parent.node_id)
        node_parent = node_parent.parent
    path.reverse()
    for node_id in path:
        dir += '/topic_' + str(node_id)
    if not (os.path.exists(dir)):
        os.makedirs(dir)
    with open(dir + '/topic_' + str(topic) + '.txt', 'w') as f:
        node_parent = node.parent
        msg = 'topic=%d level=%d (documents=%d): ' % (
            node_parent.node_id, node_parent.level, node_parent.customers)
        msg += node_parent.get_top_words(n_words, with_weights)
        print(msg, file=f)
        msg = '    topic=%d level=%d (documents=%d): ' % (
            node.node_id, node.level, node.customers)
        msg += node.get_top_words(n_words, with_weights)
        print(msg, file=f)
        for node_child in node.children:
            msg = '        topic=%d level=%d (documents=%d): ' % (
                node_child.node_id, node_child.level, node_child.customers)
            msg += node_child.get_top_words(n_words, with_weights)
            print(msg, file=f)
        print('-------------------------------', file=f)
        for i, docs in enumerate(docs_summary):
            print('', file=f)
            print(str(i + 1) + ':', file=f)
            for speaker, remark in docs:
                print(speaker + ' ' + remark, file=f)
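
A possible driver for the function above, sketched for orientation only: the hLDA node API (children, node_id, level) is taken from the snippet, but the docs_by_node mapping and the recursion itself are assumptions introduced here.

def summarize_tree(node, docs_by_node):
    # Hypothetical helper: walk the hLDA topic tree and summarize the
    # documents assigned to each child node with the lexrank() above.
    # docs_by_node is an assumed mapping node_id -> list of documents
    # (each a list of (speaker, remark) pairs).
    for child in node.children:
        lexrank(child.node_id, child.level, child, docs_by_node[child.node_id])
        summarize_tree(child, docs_by_node)
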
Example #3
def lexrank(original_docs, topic, dictionary, dir='./result/lda/summary/'):
    docs = []
    for doc in original_docs:
        tmp_docs = []
        for speaker, remark in doc:
            tmp_docs.append(remark)
        docs.append('\n'.join(tmp_docs))

    # for training
    tfidf = TfidfModel(no_below=0, no_above=1.0, keep_n=100000)
    docs_for_training = [stems(doc) for doc in docs]
    tfidf.train(docs_for_training)
    sent_vecs = tfidf.to_vector(docs_for_training)

    # for dict
    sw = stopwords()
    docs_for_dict = [stems(doc, polish=True, sw=sw) for doc in docs]
    corpus = list(map(dictionary.doc2bow, docs_for_dict))

    # Display
    print('=== Summary ===')
    # Summarize
    indexes = summarize(docs,
                        sent_vecs,
                        sort_type='normal',
                        sent_limit=10,
                        threshold=0.1)
    docs_summary = [original_docs[i] for i in indexes]
    # lda is assumed to be a trained topic model (e.g. a gensim LdaModel) defined elsewhere in the module
    probs_summary = [lda[corpus[i]] for i in indexes]

    if not (os.path.exists(dir)):
        os.makedirs(dir)
    with open(dir + '/topic_' + str(topic + 1) + '.txt', 'w') as f:
        i = 0
        for docs, prob in zip(docs_summary, probs_summary):
            i += 1
            print("-" * 80, file=f)
            print(str(i) + ':', file=f)
            print([(t + 1, p) for t, p in prob], file=f)
            for speaker, remark in docs:
                print(speaker + ' ' + remark, file=f)
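
For orientation, a hedged sketch of how this per-topic lexrank might be invoked; topic_N and docs_by_topic are assumptions introduced here, and dictionary is the same gensim dictionary passed in above.

# Hypothetical usage: summarize the documents grouped under each LDA topic.
# docs_by_topic[k] is assumed to hold the (speaker, remark) documents whose
# dominant topic is k; topic_N is the assumed number of topics.
for k in range(topic_N):
    if docs_by_topic[k]:
        lexrank(docs_by_topic[k], k, dictionary)
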
Example #4
                    tmp_docs = []
                else:
                    tmp_docs.extend([item[1][1]])
            docs.append('\n'.join(tmp_docs))

    if doc_num == 'all':
        doc_num = '26'
    doc_num = '01_' + doc_num

    # Params
    no_below = 3
    no_above = 0.8
    keep_n = 100000
    topic_N = 9
    sw = stopwords()
    docs_for_training = [stems(doc, polish=True, sw=sw) for doc in docs]

    print('=== Corpus generation ===')
    # tfidf
    # tfidf = TfidfModel(no_below=no_below, no_above=no_above, keep_n=keep_n)
    # tfidf.train(docs_for_training)
    # dictionary = tfidf.dictionary
    # corpus = tfidf.corpus
    # corpus = tfidf.model[corpus]

    dictionary = gensim.corpora.Dictionary(docs_for_training)
    dictionary.filter_extremes(no_below=no_below,
                               no_above=no_above,
                               keep_n=keep_n)
    corpus = list(map(dictionary.doc2bow, docs_for_training))
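    # The snippet stops after building the corpus; a minimal sketch of the
    # LDA training that presumably follows, using gensim directly. The
    # passes and random_state values are assumptions, not from the source.
    lda = gensim.models.LdaModel(corpus=corpus,
                                 id2word=dictionary,
                                 num_topics=topic_N,
                                 passes=10,
                                 random_state=1)
    for k in range(topic_N):
        print('topic', k + 1, ':', lda.print_topic(k, topn=10))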
Example #5
    print('Done')

    # Unit of summarization: sentence or utterance
    # to sentence
    if sum_type == 'sentence':
        data = utils.to_sentence(data)

    # for sum
    docs = [row[1] for row in data]
    print(docs[:1])

    if model_type == 'tfidf':
        # Vectorize sentences with gensim's TFIDF model
        tfidf = TfidfModel(no_below=10, no_above=0.1, keep_n=100000)
        tfidf.load_model()
        sent_vecs = tfidf.to_vector([stems(doc) for doc in docs])

    elif model_type == 'doc2vec':
        # ===Doc2Vec===
        doc2vec = Doc2Vec(alpha=0.025,
                          min_count=10,
                          vector_size=300,
                          epochs=50,
                          workers=4)
        model_path = './model/doc2vec/doc2vec_' + str(
            doc2vec.vector_size) + '.model'
        doc2vec.load_model(model_path)
        sent_vecs = doc2vec.to_vector([stems(doc) for doc in docs])

    else:
        print('Invalid model type')
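
Each branch above yields one vector per entry in docs. For orientation, a minimal sketch (assuming sent_vecs is a plain list of dense vectors) of the adjacent-sentence cosine similarity, the signal that appears as sim_arr in the segmentation snippets:

import numpy as np

def adjacent_similarities(sent_vecs):
    # Cosine similarity between each pair of neighbouring sentence vectors;
    # low values suggest a topic boundary between the two sentences.
    sims = []
    for a, b in zip(sent_vecs, sent_vecs[1:]):
        a = np.asarray(a, dtype=float)
        b = np.asarray(b, dtype=float)
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        sims.append(float(a @ b) / denom if denom else 0.0)
    return sims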
Example #6
    step = 1

    sw = stopwords()
    # data_set = [stems(doc, polish=True, sw=sw) for doc in docs]
    # docs_for_dict = data_set

    print('=== Corpus generation ===')
    if eval_type == 'perplexity':
        # Test set
        print(docs[:3])
        random.shuffle(docs)
        print(docs[:3])
        test_size = int(len(docs) * 0.25)
        docs_test = docs[:test_size]
        # docs_test = docs
        test_set = [stems(doc, polish=True, sw=sw) for doc in docs_test]
        # dict
        # data_for_test_dict = [stems(doc, polish=True, sw=sw) for doc in utils.to_sentence_docs(docs_test)]
        data_for_test_dict = test_set
        test_dict = gensim.corpora.Dictionary(data_for_test_dict)
        test_dict.filter_extremes(no_below=no_below,
                                  no_above=no_above,
                                  keep_n=keep_n)
        test_corpus = list(map(test_dict.doc2bow, test_set))

        # Train set
        docs_train = docs[test_size:]
        # docs_train = docs
        train_set = [stems(doc, polish=True, sw=sw) for doc in docs_train]
        # dict
        # data_for_train_dict = [stems(doc, polish=True, sw=sw) for doc in utils.to_sentence_docs(docs_train)]
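        # The example is cut off here; a hedged sketch of the held-out
        # perplexity evaluation that presumably follows. topic_N and the
        # passes value are assumptions; gensim reports perplexity as
        # 2 ** (-log_perplexity).
        train_dict = gensim.corpora.Dictionary(train_set)
        train_dict.filter_extremes(no_below=no_below,
                                   no_above=no_above,
                                   keep_n=keep_n)
        train_corpus = list(map(train_dict.doc2bow, train_set))
        lda = gensim.models.LdaModel(corpus=train_corpus,
                                     id2word=train_dict,
                                     num_topics=topic_N,
                                     passes=10)
        heldout = [train_dict.doc2bow(doc) for doc in test_set]
        perplexity = 2 ** (-lda.log_perplexity(heldout))
        print('perplexity:', perplexity)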
Example #7
            update = True
    else:
        print('Arguments are too short')
        exit()

    model_type = args[1]

    # docs: the entire interview
    print('Load data')
    # Train the model
    data = utils.to_sentence(scraping.scraping(10))
    docs = [row[1] for row in data]

    # max_characters: sentences of XX or more characters are excluded from summarization
    # docs = utils.polish_docs(docs, max_characters=1000)
    docs_for_train = [stems(doc) for doc in docs]
    """
    以下のようなデータを作っています
    edocs_for_train = [
    ['出身は', 'どこ', 'ですか' ...
    ['好き', 'な', '食べもの', ...
    ...
    ]
    """
    print(data[:3])
    print(docs[:1])
    print(docs_for_train[:1])
    print('Done')

    if model_type == 'tfidf':
        # Build the TFIDF model
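        # Hypothetical continuation (the snippet is truncated here), modeled
        # on the TfidfModel usage in the other examples; the filter thresholds
        # and the save_model() call are assumptions, not from the source.
        tfidf = TfidfModel(no_below=10, no_above=0.1, keep_n=100000)
        tfidf.train(docs_for_train)
        tfidf.save_model()  # assumed counterpart to the load_model() seen elsewhere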
Example #8
def main_segmentation(doc_num,
                      window_size,
                      model_type,
                      doc_type,
                      segmentation_type,
                      eval=False):
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'

    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)

    docs = [row[1] for row in data]
    label = [row[0] for row in data]
    print(data[:5])
    print('Done')

    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    model, segmentation_model = load_model(model_type, segmentation_type,
                                           [stems(doc) for doc in docs])

    # === Result ===
    print('Segmentation')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print('Done')
    # print(res)

    # Figure
    save_path = './result/segmentation/' + segmentation_type + '/' + model_type + '/' + doc_type + '/img/' + 'doc_num_' + doc_num + '_' + model_type + '_window_size_' + str(
        segmentation_model.window_size) + '_' + str(datetime.date.today())

    fig = plt.figure()
    plt.ylim([0, 1])
    segmentation_model.sim_arr.plot(title='Cosine similarity')
    plt.savefig(save_path + '.png')
    plt.close('all')

    # Segments
    # save_path = './result/segmentation/' + segmentation_type + '/' + model_type + '/' + doc_type + '/interview_text/' + 'doc_num_' + doc_num + '_' + model_type + '_window_size_' + str(segmentation_model.window_size) + '_' + str(datetime.date.today())
    # For lda
    save_path = './data/segmentation/' + doc_type + '/' + 'interview-text_' + doc_num
    with open(save_path + '.txt', 'w') as f:
        for i in range(len(docs)):
            print(label[i] + ' ' + docs[i].replace('\n', '。'), file=f)
            print('', file=f)
            if str(i + 0.5) in res.index.values:
                print("___________\n", file=f)

    # === Evaluation ===
    count, f_score = 0, 0
    label_for_eval = []
    if eval:
        print('=== Evaluation ===')
        count, label_for_eval, f_score = evaluation(res, segmentation_model,
                                                    segmentation_type,
                                                    model_type, doc_type,
                                                    doc_num)

    return count, res.index.values, label_for_eval, f_score
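
A hedged usage sketch for main_segmentation; every argument value below is an illustrative assumption, not taken from the source.

# Hypothetical call: segment interview 01 into sentences with the TFIDF
# model and a TextTiling-style segmenter, then evaluate the boundaries.
count, boundaries, labels, f_score = main_segmentation(doc_num='01',
                                                       window_size=2,
                                                       model_type='tfidf',
                                                       doc_type='sentence',
                                                       segmentation_type='text_tiling',
                                                       eval=True)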
Example #9
    print('Load data')
    path = './data/test.txt'
    # path = './data/interview-text_01-26_all.txt'
    data = utils.load_data(path)
    data = utils.to_sentence(data)
    docs = [row[1] for row in data]
    print('Done')

    # When training the model
    if train:
        print('=== Building the TFIDF model ===')
        print('Train')
        # docs
        # for train
        print(docs[:1])
        tfidf.train([stems(doc) for doc in docs])
        print('Done')

    # Unit of summarization: sentence or utterance
    print(docs[:1])

    # Vectorize sentences with gensim's TFIDF model
    sent_vecs = tfidf.to_vector([stems(doc) for doc in docs])

    # print('=== Segmentation ===')
    # TODO
    text_tiling = TextTiling(sent_vecs)

    # with open('./result/segmentation/tfidf/' + str(datetime.date.today()) + '.txt', 'w') as f:
    #     print("no_below: " + str(no_below) + ", no_above: " + str(no_above) + ", keep_n: " + str(keep_n) + ", threshold: " + str(threshold), file=f)
    #     for i, docs in enumerate(docs_summary):