def construct_opt_sentences_data(ncode, save_dir_path, short_rate, long_rate,
                                 min_sentence_count, max_sentence_count):
    """
    理想的な文を選択する際に選択される文のインデックスを示す
    また、その時のROUGEスコアと、重要度の閾値も保存する
    {
    opt_sentence_index: np.array,
    threshold: float,
    rouge:
        {
        f: float,
        r: float,
        p: float
        }
    }
    """
    contents_lines = corpus_accessor.get_contents_lines(ncode)
    synopsis_lines = corpus_accessor.get_synopsis_lines(ncode)
    if not contents_lines or not synopsis_lines:
        return
    if len(contents_lines) <= max_sentence_count:
        return

    contents_lines = np.array(contents_lines)
    contents_len = len(''.join(contents_lines))

    # Data structure to build
    opt_data = dict()

    # Reference synopsis
    ref = wakati(''.join(synopsis_lines))
    # Summarization-rate threshold to use
    rate = long_rate if corpus_accessor.is_long(ncode) else short_rate

    similarity_data = similarity_data_supplier.load(ncode)
    high_score_line_indexes = np.argsort(
        -np.array(list(similarity_data.values())))[:max_sentence_count]

    hyp = contents_lines[high_score_line_indexes[:min_sentence_count]]
    for sentence_index in high_score_line_indexes[min_sentence_count:]:
        if len(''.join(np.append(
                hyp, contents_lines[sentence_index]))) / contents_len < rate:
            hyp = np.append(hyp, contents_lines[sentence_index])
        else:
            break

    opt_data['opt_sentence_index'] = high_score_line_indexes[:len(hyp)]
    opt_data['threshold'] = similarity_data[
        high_score_line_indexes[len(hyp) - 1]]

    hyp = wakati(''.join(hyp))
    score = rouge.get_scores(hyps=hyp, refs=ref, avg=False)[0]['rouge-1']
    opt_data['rouge'] = {'f': score['f'], 'p': score['p'], 'r': score['r']}

    file_path = os.path.join(save_dir_path, ncode + '.txt')
    with open(file_path, 'wb') as f:
        joblib.dump(opt_data, f, compress=1)
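
The saved file can be read back with joblib. A minimal usage sketch, assuming a hypothetical ncode 'n0000aa' and save directory './opt_sentences' (both names are placeholders, not taken from the repository):

import os
import joblib

file_path = os.path.join('./opt_sentences', 'n0000aa' + '.txt')
with open(file_path, 'rb') as f:
    opt_data = joblib.load(f)
print(opt_data['opt_sentence_index'])  # indices of the selected sentences
print(opt_data['threshold'])           # importance score of the last selected sentence
print(opt_data['rouge'])               # {'f': ..., 'p': ..., 'r': ...}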
Example #2
def test_wakati(self):
    ref = '8月3日に放送された「中居正広の金曜日のスマイルたちへ」(TBS系)で、1日たった5分でぽっこりおなかを解消するという' \
          'ダイエット方法を紹介。キンタロー。のダイエットにも密着。'
    correct = '8月3日 に 放送 さ れ た 「 中居正広の金曜日のスマイルたちへ 」( TBS 系 ) で 、 1日 たった 5分 で ぽっこり' \
              ' おなか を 解消 する という ダイエット方法 を 紹介 。 キンタロー。 の ダイエット に も 密着 。 '
    predict = text_processor.wakati(ref)
    self.assertEqual(predict, correct)
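
text_processor.wakati itself is not shown in this listing. A minimal stand-in built on MeCab's wakati output mode would produce space-separated surface forms of this shape; the compound tokens in the expected string suggest a NEologd-style dictionary, so treat this as an assumption rather than the repository's actual implementation:

import MeCab

# Hypothetical replacement for text_processor.wakati.
# '-Owakati' makes MeCab emit the input as surface forms joined by single spaces.
_tagger = MeCab.Tagger('-Owakati')

def wakati(text):
    return _tagger.parse(text).strip()

print(wakati('8月3日に放送された'))  # e.g. '8月 3日 に 放送 さ れ た' (segmentation depends on the dictionary)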
Example #3
def convert_word_embedding_vectors(self, sentence):
    """
    Convert a sentence into a list of word-embedding (distributed) vectors,
    one vector per word in the sentence.
    """
    wakati_line = text_processor.wakati(sentence).split()
    return [self.word_embedding_model.wv[word] for word in wakati_line]
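
The `wv` attribute suggests word_embedding_model is a gensim word-vector model. A toy sketch of the lookup this method performs, using a throwaway Word2Vec model (the repository's real embeddings are trained elsewhere):

from gensim.models import Word2Vec

toy_corpus = [['吾輩', 'は', '猫', 'で', 'ある'], ['名前', 'は', 'まだ', '無い']]
model = Word2Vec(toy_corpus, vector_size=8, min_count=1)  # gensim 4.x; older versions use size=
vectors = [model.wv[word] for word in '吾輩 は 猫 で ある'.split()]
print(len(vectors), vectors[0].shape)  # 5 (8,)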
Example #4
def rouge_between_contents_and_synopsis_sentence(ncode):
    """
    小説本文各文にあらすじ文とのもっとも高いROUGEスコアを付与する
    データは文番号をkey、もっとも高いROUGEをvalueとする辞書で保存される
    [1: ROUGE, 2: ROUGE, ... , n: ROUGE]
    """
    print('[PROCESS NCODE]: {}'.format(ncode))
    file_path = os.path.join(
        ROUGE_BETWEEN_CONTENTS_AND_SYNOPSIS_SENTENCE_DIR_PATH, ncode + '.txt')
    if os.path.isfile(file_path):
        return
    contents_lines = corpus_accessor.get_contents_lines(ncode)
    synopsis_lines = corpus_accessor.get_synopsis_lines(ncode)
    if not contents_lines or not synopsis_lines:
        return

    wakati_contents_lines = [wakati(line) for line in contents_lines]
    wakati_synopsis_lines = [wakati(line) for line in synopsis_lines]

    similarity_dict = dict()
    for line_idx, contents_line in enumerate(wakati_contents_lines):
        if line_idx % 50 == 0:
            print('contents progress: {:.1f}%'.format(
                line_idx / len(contents_lines) * 100))

        if contents_line == '':
            similarity_dict[line_idx] = 0
            continue

        scores = [
            rouge.get_scores(hyps=contents_line, refs=synopsis_line,
                             avg=False)[0]['rouge-1']['r']
            for synopsis_line in wakati_synopsis_lines
            if not synopsis_line == ''
        ]
        if len(scores) == 0:
            similarity_dict[line_idx] = 0
            continue

        max_similarity = max(scores)
        similarity_dict[line_idx] = max_similarity
    print('[INFO] saving data: {}'.format(ncode))
    with open(file_path, 'wb') as f:
        joblib.dump(similarity_dict, f, compress=3)
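
The scores above come from the `rouge` package's Rouge.get_scores, whose result shape this listing relies on throughout. A small sketch on whitespace-tokenized (wakati) strings:

from rouge import Rouge

rouge = Rouge()
hyp = '彼 は 街 へ 出かけ た'           # space-separated tokens, e.g. wakati output
ref = '彼 は 朝 早く 街 へ 出かけ た'
scores = rouge.get_scores(hyps=hyp, refs=ref, avg=False)
# One dict per hyp/ref pair, each with 'rouge-1', 'rouge-2' and 'rouge-l'
# sub-dicts containing 'r' (recall), 'p' (precision) and 'f' (F1).
print(scores[0]['rouge-1']['r'])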
Example #5

def convert_avg_vector(self, line):
    """
    Convert a sentence into the average of the embedding vectors of its words.
    """
    if self.word_embedding_model is None:
        raise ValueError("there is no word embedding model")
    wakati_line = text_processor.wakati(line).split()
    word_vectors = np.array(
        [self.word_embedding_model.wv[word] for word in wakati_line])
    return np.average(word_vectors, axis=0)
Example #6
def multi_generate(importance, start, end):
    """
    複数作品まとめて確認したいとき
    """
    corpus_accessor = CorpusAccessor()
    output_file_path = 'result_start_' + str(start) + '_end_' + str(
        end) + '.txt'
    file = open(output_file_path, 'w')

    love_story_s = LSTMSummarizer()
    love_story_supplier = LSTMVectorSupplier(
        'love_story',
        importance,
        use_data_of_position_of_sentence=True,
        use_data_of_is_serif=True,
        use_data_of_is_include_person=True,
        use_data_of_sentence_length=True)
    love_story_s.set_supplier(love_story_supplier)
    love_story_s.set_trained_model()

    fantasy_s = LSTMSummarizer()
    fantasy_supplier = LSTMVectorSupplier(
        'fantasy',
        importance,
        use_data_of_position_of_sentence=True,
        use_data_of_is_serif=True,
        use_data_of_is_include_person=True,
        use_data_of_sentence_length=True)
    fantasy_s.set_supplier(fantasy_supplier)
    fantasy_s.set_trained_model()

    literature_s = LSTMSummarizer()
    literature_supplier = LSTMVectorSupplier(
        'literature',
        importance,
        use_data_of_position_of_sentence=True,
        use_data_of_is_serif=True,
        use_data_of_is_include_person=True,
        use_data_of_sentence_length=True)
    literature_s.set_supplier(literature_supplier)
    literature_s.set_trained_model()

    sf_s = LSTMSummarizer()
    sf_supplier = LSTMVectorSupplier('sf',
                                     importance,
                                     use_data_of_position_of_sentence=True,
                                     use_data_of_is_serif=True,
                                     use_data_of_is_include_person=True,
                                     use_data_of_sentence_length=True)
    sf_s.set_supplier(sf_supplier)
    sf_s.set_trained_model()

    # sys.setrecursionlimit(20000)
    rouge = Rouge()

    for i, ncode in enumerate(corpus_accessor.exist_ncodes[start:end]):
        print('processed ncode count: ', i)

        genre = corpus_accessor.get_genre(ncode)
        if len(genre) == 0:
            print('non genre')
            continue
        ref = ''.join(corpus_accessor.get_synopsis_lines(ncode))

        synopsis = ''
        if genre == 'love_story':
            synopsis = love_story_s.generate(ncode)
        elif genre == 'fantasy':
            synopsis = fantasy_s.generate(ncode)
        elif genre == 'literature':
            synopsis = literature_s.generate(ncode)
        elif genre == 'sf':
            synopsis = sf_s.generate(ncode)

        # Genres without a trained model leave synopsis empty, which ROUGE rejects
        if not synopsis:
            continue

        score = rouge.get_scores(wakati(synopsis), wakati(ref),
                                 False)[0]['rouge-1']['r']

        file.write(ncode + '\n')
        file.write(genre + '\n')
        file.write('score: ' + str(score) + '\n')
        file.write(ref + '\n\n')
        file.write(synopsis + '\n\n\n')
    file.close()
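
A typical call, using an importance value that appears elsewhere in this listing ('cos_sim') and an arbitrary slice of the corpus; results are written to result_start_<start>_end_<end>.txt:

multi_generate(importance='cos_sim', start=0, end=100)  # writes result_start_0_end_100.txt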
Example #7
def evaluate(genre='general',
             importance='cos_sim',
             use_data_of_position_of_sentence=False,
             use_data_of_is_serif=False,
             use_data_of_is_include_person=False,
             use_data_of_sentence_length=False):
    dnn_summarizer = DNNSummarizer()
    dnn_vector_supplier = DNNVectorSupplier(
        genre,
        importance,
        use_data_of_position_of_sentence=use_data_of_position_of_sentence,
        use_data_of_is_serif=use_data_of_is_serif,
        use_data_of_is_include_person=use_data_of_is_include_person,
        use_data_of_sentence_length=use_data_of_sentence_length)
    dnn_summarizer.set_supplier(dnn_vector_supplier)
    dnn_summarizer.set_trained_model()

    lstm_summarizer = LSTMSummarizer()
    lstm_vector_supplier = LSTMVectorSupplier(
        genre,
        importance,
        use_data_of_position_of_sentence=use_data_of_position_of_sentence,
        use_data_of_is_serif=use_data_of_is_serif,
        use_data_of_is_include_person=use_data_of_is_include_person,
        use_data_of_sentence_length=use_data_of_sentence_length)
    lstm_summarizer.set_supplier(lstm_vector_supplier)
    lstm_summarizer.set_trained_model()

    test_ncodes = dnn_vector_supplier.test_ncodes
    total = len(test_ncodes)
    print('[INFO] test ncodes count: ', total)

    # ROUGE-1
    opt_rouge_one_scores = []               # sentences picked by descending similarity (theoretical upper bound)
    lead_rouge_one_scores = []              # the first len(opt) sentences of the body
    random_rouge_one_scores = []            # randomly selected sentences
    dnn_rouge_one_scores = []               # synopses generated by the DNN
    lstm_rouge_one_scores = []              # synopses generated by the LSTM

    # ROUGE-2
    opt_rouge_two_scores = []               # sentences picked by descending similarity (theoretical upper bound)
    lead_rouge_two_scores = []              # the first len(opt) sentences of the body
    random_rouge_two_scores = []            # randomly selected sentences
    dnn_rouge_two_scores = []               # synopses generated by the DNN
    lstm_rouge_two_scores = []              # synopses generated by the LSTM

    sys.setrecursionlimit(20000)
    rouge = Rouge()
    summarization_rates = []
    for i, ncode in enumerate(test_ncodes):
        contents = ''.join(corpus_accessor.get_contents_lines(ncode))
        ref = ''.join(corpus_accessor.get_synopsis_lines(ncode))
        summarization_rates.append(len(ref) / len(contents))

        print('[INFO] processing ncode: ', ncode)
        print('[INFO] progress: {:.1f}%'.format(i / total * 100))

        ref = wakati(ref)
        opt = wakati(opt_synopsis.generate(ncode, importance))
        lead = wakati(lead_synopsis.generate(ncode))
        random = wakati(random_synopsis.generate(ncode))
        dnn_hyp = wakati(dnn_summarizer.generate(ncode))
        lstm_hyp = wakati(lstm_summarizer.generate(ncode))

        opt_score = rouge.get_scores(opt, ref, False)
        lead_score = rouge.get_scores(lead, ref, False)
        random_score = rouge.get_scores(random, ref, False)
        dnn_score = rouge.get_scores(dnn_hyp, ref, False)
        lstm_score = rouge.get_scores(lstm_hyp, ref, False)

        opt_rouge_one_scores.append(opt_score[0]['rouge-1']['r'])
        lead_rouge_one_scores.append(lead_score[0]['rouge-1']['r'])
        random_rouge_one_scores.append(random_score[0]['rouge-1']['r'])
        dnn_rouge_one_scores.append(dnn_score[0]['rouge-1']['r'])
        lstm_rouge_one_scores.append(lstm_score[0]['rouge-1']['r'])

        opt_rouge_two_scores.append(opt_score[0]['rouge-2']['r'])
        lead_rouge_two_scores.append(lead_score[0]['rouge-2']['r'])
        random_rouge_two_scores.append(random_score[0]['rouge-2']['r'])
        dnn_rouge_two_scores.append(dnn_score[0]['rouge-2']['r'])
        lstm_rouge_two_scores.append(lstm_score[0]['rouge-2']['r'])

    print('[RESULT] genre: ', genre)
    print('ROUGE-1')
    print('opt: {}'.format(np.average(opt_rouge_one_scores)))
    print('lead: {}'.format(np.average(lead_rouge_one_scores)))
    print('random: {}'.format(np.average(random_rouge_one_scores)))
    print('dnn: {}'.format(np.average(dnn_rouge_one_scores)))
    print('lstm: {}'.format(np.average(lstm_rouge_one_scores)))
    print('\n')

    print('ROUGE-2')
    print('opt: {}'.format(np.average(opt_rouge_two_scores)))
    print('lead: {}'.format(np.average(lead_rouge_two_scores)))
    print('random: {}'.format(np.average(random_rouge_two_scores)))
    print('dnn: {}'.format(np.average(dnn_rouge_two_scores)))
    print('lstm: {}'.format(np.average(lstm_rouge_two_scores)))
    print('\n')

    print('Summarization Rate')
    print(np.average(summarization_rates))
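
An invocation mirroring the feature flags used by multi_generate above; the genre and importance values are ones that appear in this listing:

evaluate(genre='love_story',
         importance='cos_sim',
         use_data_of_position_of_sentence=True,
         use_data_of_is_serif=True,
         use_data_of_is_include_person=True,
         use_data_of_sentence_length=True)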
Example #8

def convert_index_list(self, line):
    if self.word_embedding_model is None:
        raise ValueError("there is no word embedding model")
    words = text_processor.wakati(line).split()
    index_list = [
        self.word_embedding_model.wv.vocab[word].index + 1 for word in words
    ]
    return index_list
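
`wv.vocab[word].index` is the gensim 3.x API; gensim 4 replaced it with `wv.key_to_index`. The `+ 1` offset presumably keeps index 0 free for padding, though the listing does not show where the indices are consumed, so that is an assumption. An equivalent lookup that works on either version:

def word_to_index(model, word):
    # gensim 3.x exposes wv.vocab[word].index; gensim 4.x exposes wv.key_to_index[word].
    if hasattr(model.wv, 'key_to_index'):
        return model.wv.key_to_index[word] + 1  # + 1 assumed to reserve 0 for padding
    return model.wv.vocab[word].index + 1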