class WordEmbeddingAvgVectorConstructor:
    def __init__(self):
        self.data_accessor = CorpusAccessor()
        self.word_embedding_model = None
        if os.path.isfile(WORD_EMBEDDING_MODEL_PATH):
            print('[INFO] loading word embedding model...')
            self.word_embedding_model = word2vec.Word2Vec.load(
                WORD_EMBEDDING_MODEL_PATH)

    def convert_avg_vector(self, line):
        """
        文を文中の各単語の平均ベクトルに変換
        """
        if self.word_embedding_model is None:
            raise ValueError("there is not word embedding model")
        wakati_line = text_processor.wakati(line).split()
        word_vectors = np.array([
            self.word_embedding_model.wv[word]
            for word in wakati_line
        ])
        return np.average(word_vectors, axis=0)
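
A minimal standalone sketch of the averaging step above, using hypothetical 3-dimensional word vectors in place of the Word2Vec model output:

import numpy as np

# Two hypothetical 3-dimensional word vectors for one sentence
word_vectors = np.array([[1.0, 2.0, 0.0],
                         [3.0, 0.0, 2.0]])
# Element-wise mean over the word axis yields a single sentence vector
sentence_vector = np.average(word_vectors, axis=0)
print(sentence_vector)  # [2. 1. 1.]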

    def sentence_to_word_embedding_avg_vector(self, ncode):
        """
        小説本文とあらすじ文の各文を、文中における各単語の分散表現の平均ベクトルに変換する
        データは文番号をkey、文ベクトルをvalueとする辞書で保存される
        [1: 文ベクトル, 2: 文ベクトル, ... , n: 文ベクトル]
        """
        print('[PROCESS NCODE]: {}'.format(ncode))
        contents_file_path = os.path.join(
            WORD_EMBEDDING_AVG_VECTOR_CONTENTS_PATH, ncode + '.txt')
        if os.path.isfile(contents_file_path):
            return

        contents_lines = self.data_accessor.get_contents_lines(ncode)
        synopsis_lines = self.data_accessor.get_synopsis_lines(ncode)
        if not contents_lines or not synopsis_lines:
            return

        # Vectorize each sentence of the body
        contents_line_vectors = dict()
        for line_idx, line in enumerate(contents_lines):
            if line_idx % 50 == 0:
                print('contents progress: {:.1f}%'.format(
                    line_idx / len(contents_lines) * 100))
            vector = self.convert_avg_vector(line)
            contents_line_vectors[line_idx] = vector

        # Save the data
        print('[INFO] saving data: {}'.format(ncode))
        with open(contents_file_path, 'wb') as cf:
            joblib.dump(contents_line_vectors, cf, compress=3)

    def construct(self):
        for i, ncode in enumerate(self.data_accessor.ncodes):
            print('[INFO] num of constructed data: {}'.format(i))
            self.sentence_to_word_embedding_avg_vector(ncode)
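
A hedged usage sketch (it assumes the corpus files and the trained Word2Vec model referenced by WORD_EMBEDDING_MODEL_PATH are already in place):

constructor = WordEmbeddingAvgVectorConstructor()
constructor.construct()  # writes one average-vector file per ncode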
class WordEmbeddingVectorConstructor:
    def __init__(self):
        self.data_accessor = CorpusAccessor()
        self.word_embedding_model = None
        if os.path.isfile(WORD_EMBEDDING_MODEL_PATH):
            print('[INFO] loading word embedding model...')
            self.word_embedding_model = word2vec.Word2Vec.load(
                WORD_EMBEDDING_MODEL_PATH)

    def convert_word_embedding_vectors(self, sentence):
        """
        文を文中の単語の文さんベクトルのリストに変換
        """
        wakati_line = text_processor.wakati(sentence).split()
        return [
            self.word_embedding_model.__dict__['wv'][word]
            for word in wakati_line
        ]

    def sentence_to_word_embedding_vectors(self, ncode):
        """
        小説本文を、文中における各単語の分散表現のベクトルのリストに変換する
        データは文番号をkey、文ベクトルをvalueとする辞書で保存される
        [1: tensor, 2: tensor, ... , n: tensor]
        """
        print('[PROCESS NCODE]: {}'.format(ncode))
        contents_lines = self.data_accessor.get_contents_lines(ncode)
        synopsis_lines = self.data_accessor.get_synopsis_lines(ncode)
        if not contents_lines or not synopsis_lines:
            return

        # Vectorize each sentence of the body
        contents_line_vectors = dict()
        for line_idx, line in enumerate(contents_lines):
            if line_idx % 50 == 0:
                print('contents progress: {:.1f}%'.format(
                    line_idx / len(contents_lines) * 100))
            tensor = self.convert_word_embedding_vectors(line)
            contents_line_vectors[line_idx] = tensor

        # Save the data
        contents_file_path = os.path.join(WORD_EMBEDDING_VECTORS_CONTENTS_PATH,
                                          ncode + '.txt')
        print('[INFO] saving data: {}'.format(ncode))
        with open(contents_file_path, 'wb') as cf:
            joblib.dump(contents_line_vectors, cf, compress=3)

    def construct(self):
        """
        全小説のデータを構築する
        """
        for i, ncode in enumerate(self.data_accessor.ncodes):
            print('[INFO] num of constructed data: {}'.format(i))
            self.sentence_to_word_embedding_vectors(ncode)
class WordIndexesConstructor:

    def __init__(self):
        self.data_accessor = CorpusAccessor()
        self.word_embedding_model = None
        if os.path.isfile(WORD_EMBEDDING_MODEL_PATH):
            self.word_embedding_model = word2vec.Word2Vec.load(WORD_EMBEDDING_MODEL_PATH)

    def convert_index_list(self, line):
        if self.word_embedding_model is None:
            raise ValueError('word embedding model is not loaded')
        words = text_processor.wakati(line).split()
        # Shift gensim's 0-based vocabulary indices by one, so index 0 is never produced
        index_list = [self.word_embedding_model.wv.vocab[word].index + 1 for word in words]
        return index_list
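
A minimal sketch of the index shift under a toy two-word vocabulary (hypothetical values; it assumes gensim 3.x, where wv.vocab[word].index is 0-based):

word_to_index = {'猫': 0, '犬': 1}  # 0-based indices, as in gensim's wv.vocab
index_list = [word_to_index[w] + 1 for w in ['猫', '犬']]
print(index_list)  # [1, 2]; index 0 stays free, e.g. for padding in downstream models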


    def sentence_to_word_indexes(self, ncode):
        """
        小説本文各文を、文中の各単語のインデックスのリストに変換する
        データは文番号をkey、インデックスのリストをvalueとする辞書で保存される
        [1: list, 2: list, ... , n: list]
        """
        print('[PROCESS NCODE]: {}'.format(ncode))
        contents_file_path = os.path.join(WORD_INDEXES_CONTENTS_PATH, ncode + '.txt')
        if os.path.isfile(contents_file_path):
            return

        contents_lines = self.data_accessor.get_contents_lines(ncode)
        synopsis_lines = self.data_accessor.get_synopsis_lines(ncode)
        if not contents_lines or not synopsis_lines:
            return

        index_data = dict()
        for line_idx, line in enumerate(contents_lines):
            if line_idx % 50 == 0:
                print('contents progress: {:.1f}%'.format(line_idx / len(contents_lines) * 100))
            index_list = self.convert_index_list(line)
            index_data[line_idx] = index_list

        # Save the data
        print('[INFO] saving data: {}'.format(ncode))
        with open(contents_file_path, 'wb') as cf:
            joblib.dump(index_data, cf, compress=3)

    def construct(self):
        """
        全小説のデータを構築する
        """
        for i, ncode in enumerate(self.data_accessor.ncodes):
            print('[INFO] num of constructed data: {}'.format(i))
            self.sentence_to_word_indexes(ncode)
def generate(importance,
             ncode,
             position=False,
             serif=False,
             person=False,
             sentence_length=False):
    corpus_accessor = CorpusAccessor()
    genre = corpus_accessor.get_genre(ncode)
    if len(genre) == 0:
        print('no genre')
        return
    s = LSTMSummarizer()
    supplier = LSTMVectorSupplier(genre,
                                  importance,
                                  use_data_of_position_of_sentence=position,
                                  use_data_of_is_serif=serif,
                                  use_data_of_is_include_person=person,
                                  use_data_of_sentence_length=sentence_length)
    s.set_supplier(supplier)
    s.set_trained_model()
    print(s.generate(ncode=ncode))
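
A hedged call sketch (the ncode value is a placeholder; the importance value follows the 'cos_sim' convention used elsewhere in these examples):

generate(importance='cos_sim',
         ncode='n0000aa',
         position=True,
         serif=True,
         person=True,
         sentence_length=True)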
import numpy as np

from util.corpus_accessor import CorpusAccessor

corpus_accessor = CorpusAccessor()

def generate(ncode):
    """
    正解データのスコアの大きい順に、参照あらすじと近い文字数のあらすじを生成する
    """
    contents_lines = corpus_accessor.get_contents_lines(ncode)
    synopsis_lines = corpus_accessor.get_synopsis_lines(ncode)
    if not contents_lines or not synopsis_lines:
        return

    contents_lines = np.array(contents_lines)
    # Length of the reference synopsis
    ref_length = len(''.join(synopsis_lines))
    # Minimum number of sentences
    min_sentence_count = 1

    random_line_indexes = np.random.permutation(np.arange(len(contents_lines)))

    hyp = contents_lines[random_line_indexes[:min_sentence_count]]
    for sentence_index in random_line_indexes[min_sentence_count:]:
        if len(''.join(np.append(hyp, contents_lines[sentence_index]))) <= ref_length:
            hyp = np.append(hyp, contents_lines[sentence_index])
        else:
            break
    return ''.join(hyp)
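
A hedged evaluation sketch for this random baseline, reusing the Rouge/wakati pattern from the later example (the ncode is a placeholder and the import paths are inferred from the surrounding code):

from rouge import Rouge
from util.text_processor import wakati

ncode = 'n0000aa'  # placeholder ncode
hyp = generate(ncode)
if hyp:
    ref = ''.join(corpus_accessor.get_synopsis_lines(ncode))
    score = Rouge().get_scores(wakati(hyp), wakati(ref), False)[0]['rouge-1']['r']
    print('rouge-1 recall:', score)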
def multi_generate(importance, start, end):
    """
    複数作品まとめて確認したいとき
    """
    corpus_accessor = CorpusAccessor()
    output_file_path = 'result_start_' + str(start) + '_end_' + str(
        end) + '.txt'
    file = open(output_file_path, 'w')

    love_story_s = LSTMSummarizer()
    love_story_supplier = LSTMVectorSupplier(
        'love_story',
        importance,
        use_data_of_position_of_sentence=True,
        use_data_of_is_serif=True,
        use_data_of_is_include_person=True,
        use_data_of_sentence_length=True)
    love_story_s.set_supplier(love_story_supplier)
    love_story_s.set_trained_model()

    fantasy_s = LSTMSummarizer()
    fantasy_supplier = LSTMVectorSupplier(
        'fantasy',
        importance,
        use_data_of_position_of_sentence=True,
        use_data_of_is_serif=True,
        use_data_of_is_include_person=True,
        use_data_of_sentence_length=True)
    fantasy_s.set_supplier(fantasy_supplier)
    fantasy_s.set_trained_model()

    literature_s = LSTMSummarizer()
    literature_supplier = LSTMVectorSupplier(
        'literature',
        importance,
        use_data_of_position_of_sentence=True,
        use_data_of_is_serif=True,
        use_data_of_is_include_person=True,
        use_data_of_sentence_length=True)
    literature_s.set_supplier(literature_supplier)
    literature_s.set_trained_model()

    sf_s = LSTMSummarizer()
    sf_supplier = LSTMVectorSupplier('sf',
                                     importance,
                                     use_data_of_position_of_sentence=True,
                                     use_data_of_is_serif=True,
                                     use_data_of_is_include_person=True,
                                     use_data_of_sentence_length=True)
    sf_s.set_supplier(sf_supplier)
    sf_s.set_trained_model()

    # sys.setrecursionlimit(20000)
    rouge = Rouge()

    for i, ncode in enumerate(corpus_accessor.exist_ncodes[start:end]):
        print('processed ncode count: ', i)

        genre = corpus_accessor.get_genre(ncode)
        if len(genre) == 0:
            print('no genre')
            continue
        ref = ''.join(corpus_accessor.get_synopsis_lines(ncode))

        synopsis = ''
        if genre == 'love_story':
            synopsis = love_story_s.generate(ncode)
        elif genre == 'fantasy':
            synopsis = fantasy_s.generate(ncode)
        elif genre == 'literature':
            synopsis = literature_s.generate(ncode)
        elif genre == 'sf':
            synopsis = sf_s.generate(ncode)

        score = rouge.get_scores(wakati(synopsis), wakati(ref),
                                 False)[0]['rouge-1']['r']

        file.write(ncode + '\n')
        file.write(genre + '\n')
        file.write('score: ' + str(score) + '\n')
        file.write(ref + '\n\n')
        file.write(synopsis + '\n\n\n')
    file.close()
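
A hedged call sketch (the index range is a placeholder; the function writes its report to result_start_<start>_end_<end>.txt):

multi_generate('cos_sim', start=0, end=50)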
import os
import joblib
import numpy as np
from gensim.models import word2vec

import data_supplier
from data_supplier.active_ncodes_supplier import ncodes_train_test_split
from util.paths import LSTM_TRAINED_MODEL_DIR_PATH
from util.corpus_accessor import CorpusAccessor
from util.paths import WORD_EMBEDDING_MODEL_PATH
from util import text_processor

data_accessor = CorpusAccessor()


class LSTMVectorSupplier:
    def __init__(self,
                 genre='general',
                 importance='cos_sim',
                 use_data_of_position_of_sentence=False,
                 use_data_of_is_serif=False,
                 use_data_of_is_include_person=False,
                 use_data_of_sentence_length=False):
        """
        このクラスの初期化時に使用する素性を指定する
        :param word_embedding_avg_vector: bool
        単語の分散表現ベクトルの平均ベクトル
        :param position_of_sentence: bool
        文の出現位置
        :param is_serif: bool
        セリフか否か