class WordEmbeddingAvgVectorConstructor:

    def __init__(self):
        self.data_accessor = CorpusAccessor()
        self.word_embedding_model = None
        if os.path.isfile(WORD_EMBEDDING_MODEL_PATH):
            print('[INFO] loading word embedding model...')
            self.word_embedding_model = word2vec.Word2Vec.load(WORD_EMBEDDING_MODEL_PATH)

    def convert_avg_vector(self, line):
        """
        Convert a sentence into the average of the word vectors of its words.
        """
        if self.word_embedding_model is None:
            raise ValueError("there is no word embedding model")
        wakati_line = text_processor.wakati(line).split()
        word_vectors = np.array([self.word_embedding_model.wv[word] for word in wakati_line])
        return np.average(word_vectors, axis=0)

    def sentence_to_word_embedding_avg_vector(self, ncode):
        """
        Convert each sentence of a novel's body and synopsis into the average vector
        of the distributed representations of the words it contains.
        The data is saved as a dict keyed by sentence index, with the sentence vector
        as the value: {0: sentence vector, 1: sentence vector, ..., n: sentence vector}
        """
        print('[PROCESS NCODE]: {}'.format(ncode))
        contents_file_path = os.path.join(WORD_EMBEDDING_AVG_VECTOR_CONTENTS_PATH, ncode + '.txt')
        if os.path.isfile(contents_file_path):
            return
        contents_lines = self.data_accessor.get_contents_lines(ncode)
        synopsis_lines = self.data_accessor.get_synopsis_lines(ncode)
        if not contents_lines or not synopsis_lines:
            return

        # Vectorize each sentence of the body
        contents_line_vectors = dict()
        for line_idx, line in enumerate(contents_lines):
            if line_idx % 50 == 0:
                print('contents progress: {:.1f}%'.format(line_idx / len(contents_lines) * 100))
            vector = self.convert_avg_vector(line)
            contents_line_vectors[line_idx] = vector

        # Save the data
        print('[INFO] saving data: {}'.format(ncode))
        with open(contents_file_path, 'wb') as cf:
            joblib.dump(contents_line_vectors, cf, compress=3)

    def construct(self):
        for i, ncode in enumerate(self.data_accessor.ncodes):
            print('[INFO] num of constructed data: {}'.format(i))
            self.sentence_to_word_embedding_avg_vector(ncode)
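# Usage sketch, not part of the original source (assumptions: the word2vec model and corpus
# files exist on disk, and 'n0000aa' is a hypothetical ncode). It runs the constructor for one
# novel and then loads the cached dict back, illustrating the {sentence index: average vector}
# format described in the docstring above.
avg_constructor = WordEmbeddingAvgVectorConstructor()
avg_constructor.sentence_to_word_embedding_avg_vector('n0000aa')
saved_path = os.path.join(WORD_EMBEDDING_AVG_VECTOR_CONTENTS_PATH, 'n0000aa' + '.txt')
with open(saved_path, 'rb') as f:
    sentence_vectors = joblib.load(f)
print(len(sentence_vectors), sentence_vectors[0].shape)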
class WordEmbeddingVectorConstructor:

    def __init__(self):
        self.data_accessor = CorpusAccessor()
        self.word_embedding_model = None
        if os.path.isfile(WORD_EMBEDDING_MODEL_PATH):
            print('[INFO] loading word embedding model...')
            self.word_embedding_model = word2vec.Word2Vec.load(WORD_EMBEDDING_MODEL_PATH)

    def convert_word_embedding_vectors(self, sentence):
        """
        Convert a sentence into the list of distributed-representation vectors of its words.
        """
        wakati_line = text_processor.wakati(sentence).split()
        return [self.word_embedding_model.wv[word] for word in wakati_line]

    def sentence_to_word_embedding_vectors(self, ncode):
        """
        Convert each sentence of a novel's body into the list of word vectors of its words.
        The data is saved as a dict keyed by sentence index, with the list of vectors
        as the value: {0: tensor, 1: tensor, ..., n: tensor}
        """
        print('[PROCESS NCODE]: {}'.format(ncode))
        contents_lines = self.data_accessor.get_contents_lines(ncode)
        synopsis_lines = self.data_accessor.get_synopsis_lines(ncode)
        if not contents_lines or not synopsis_lines:
            return

        # Vectorize each sentence of the body
        contents_line_vectors = dict()
        for line_idx, line in enumerate(contents_lines):
            if line_idx % 50 == 0:
                print('contents progress: {:.1f}%'.format(line_idx / len(contents_lines) * 100))
            tensor = self.convert_word_embedding_vectors(line)
            contents_line_vectors[line_idx] = tensor

        # Save the data
        contents_file_path = os.path.join(WORD_EMBEDDING_VECTORS_CONTENTS_PATH, ncode + '.txt')
        print('[INFO] saving data: {}'.format(ncode))
        with open(contents_file_path, 'wb') as cf:
            joblib.dump(contents_line_vectors, cf, compress=3)

    def construct(self):
        """
        Build the data for all novels.
        """
        for i, ncode in enumerate(self.data_accessor.ncodes):
            print('[INFO] num of constructed data: {}'.format(i))
            self.sentence_to_word_embedding_vectors(ncode)
class WordIndexesConstructor:

    def __init__(self):
        self.data_accessor = CorpusAccessor()
        self.word_embedding_model = None
        if os.path.isfile(WORD_EMBEDDING_MODEL_PATH):
            self.word_embedding_model = word2vec.Word2Vec.load(WORD_EMBEDDING_MODEL_PATH)

    def convert_index_list(self, line):
        if self.word_embedding_model is None:
            raise ValueError("there is no word embedding model")
        words = text_processor.wakati(line).split()
        # Vocabulary index of each word, shifted by one
        index_list = [self.word_embedding_model.wv.vocab[word].index + 1 for word in words]
        return index_list

    def sentence_to_word_indexes(self, ncode):
        """
        Convert each sentence of a novel's body into a list of the vocabulary indexes of its words.
        The data is saved as a dict keyed by sentence index, with the index list as the value:
        {0: list, 1: list, ..., n: list}
        """
        print('[PROCESS NCODE]: {}'.format(ncode))
        contents_file_path = os.path.join(WORD_INDEXES_CONTENTS_PATH, ncode + '.txt')
        if os.path.isfile(contents_file_path):
            return
        contents_lines = self.data_accessor.get_contents_lines(ncode)
        synopsis_lines = self.data_accessor.get_synopsis_lines(ncode)
        if not contents_lines or not synopsis_lines:
            return

        index_data = dict()
        for line_idx, line in enumerate(contents_lines):
            if line_idx % 50 == 0:
                print('contents progress: {:.1f}%'.format(line_idx / len(contents_lines) * 100))
            index_list = self.convert_index_list(line)
            index_data[line_idx] = index_list

        # Save the data
        print('[INFO] saving data: {}'.format(ncode))
        with open(contents_file_path, 'wb') as cf:
            joblib.dump(index_data, cf, compress=3)

    def construct(self):
        """
        Build the data for all novels.
        """
        for i, ncode in enumerate(self.data_accessor.ncodes):
            print('[INFO] num of constructed data: {}'.format(i))
            self.sentence_to_word_indexes(ncode)
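# Usage sketch, not part of the original source (assumptions: the word2vec model file exists
# and the sample sentence is illustrative). convert_index_list returns 1-based vocabulary
# indexes, presumably leaving 0 free as a padding value when the lists are later batched.
index_constructor = WordIndexesConstructor()
sample_indexes = index_constructor.convert_index_list('彼は静かに頷いた。')
print(sample_indexes)  # e.g. [412, 87, 3051, ...] depending on the trained vocabulary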
def generate(importance, ncode, position=False, serif=False, person=False, sentence_length=False):
    corpus_accessor = CorpusAccessor()
    genre = corpus_accessor.get_genre(ncode)
    if len(genre) == 0:
        print('non genre')
        return
    s = LSTMSummarizer()
    supplier = LSTMVectorSupplier(genre,
                                  importance,
                                  use_data_of_position_of_sentence=position,
                                  use_data_of_is_serif=serif,
                                  use_data_of_is_include_person=person,
                                  use_data_of_sentence_length=sentence_length)
    s.set_supplier(supplier)
    s.set_trained_model()
    print(s.generate(ncode=ncode))
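# Usage sketch, not part of the original source (assumptions: a trained LSTM model for the
# novel's genre is available, and 'n0000aa' is a hypothetical ncode; 'cos_sim' is the default
# importance label used elsewhere in this codebase).
generate('cos_sim', 'n0000aa', position=True, serif=True, person=True, sentence_length=True)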
import numpy as np

from util.corpus_accessor import CorpusAccessor

corpus_accessor = CorpusAccessor()


def generate(ncode):
    """
    Generate a synopsis whose character count is close to that of the reference synopsis,
    built from body sentences picked in random order.
    """
    contents_lines = corpus_accessor.get_contents_lines(ncode)
    synopsis_lines = corpus_accessor.get_synopsis_lines(ncode)
    if not contents_lines or not synopsis_lines:
        return
    contents_lines = np.array(contents_lines)

    # Length of the reference synopsis
    ref_length = len(''.join(synopsis_lines))
    # Minimum number of sentences
    min_sentence_count = 1

    random_line_indexes = np.random.permutation(np.arange(len(contents_lines)))
    hyp = contents_lines[random_line_indexes[:min_sentence_count]]
    for sentence_index in random_line_indexes[min_sentence_count:]:
        if len(''.join(np.append(hyp, contents_lines[sentence_index]))) <= ref_length:
            hyp = np.append(hyp, contents_lines[sentence_index])
        else:
            break
    return ''.join(hyp)
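# Usage sketch, not part of the original source (assumption: 'n0000aa' is an ncode present in
# the corpus). The baseline trims randomly ordered body sentences to roughly the reference
# length, so repeated calls produce different synopses.
baseline_synopsis = generate('n0000aa')
if baseline_synopsis is not None:
    print(len(baseline_synopsis), baseline_synopsis)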
def multi_generate(importance, start, end):
    """
    For checking generated synopses of multiple works at once.
    """
    corpus_accessor = CorpusAccessor()
    output_file_path = 'result_start_' + str(start) + '_end_' + str(end) + '.txt'
    file = open(output_file_path, 'w')

    love_story_s = LSTMSummarizer()
    love_story_supplier = LSTMVectorSupplier('love_story',
                                             importance,
                                             use_data_of_position_of_sentence=True,
                                             use_data_of_is_serif=True,
                                             use_data_of_is_include_person=True,
                                             use_data_of_sentence_length=True)
    love_story_s.set_supplier(love_story_supplier)
    love_story_s.set_trained_model()

    fantasy_s = LSTMSummarizer()
    fantasy_supplier = LSTMVectorSupplier('fantasy',
                                          importance,
                                          use_data_of_position_of_sentence=True,
                                          use_data_of_is_serif=True,
                                          use_data_of_is_include_person=True,
                                          use_data_of_sentence_length=True)
    fantasy_s.set_supplier(fantasy_supplier)
    fantasy_s.set_trained_model()

    literature_s = LSTMSummarizer()
    literature_supplier = LSTMVectorSupplier('literature',
                                             importance,
                                             use_data_of_position_of_sentence=True,
                                             use_data_of_is_serif=True,
                                             use_data_of_is_include_person=True,
                                             use_data_of_sentence_length=True)
    literature_s.set_supplier(literature_supplier)
    literature_s.set_trained_model()

    sf_s = LSTMSummarizer()
    sf_supplier = LSTMVectorSupplier('sf',
                                     importance,
                                     use_data_of_position_of_sentence=True,
                                     use_data_of_is_serif=True,
                                     use_data_of_is_include_person=True,
                                     use_data_of_sentence_length=True)
    sf_s.set_supplier(sf_supplier)
    sf_s.set_trained_model()

    # sys.setrecursionlimit(20000)
    rouge = Rouge()

    for i, ncode in enumerate(corpus_accessor.exist_ncodes[start:end]):
        print('processed ncode count: ', i)
        genre = corpus_accessor.get_genre(ncode)
        if len(genre) == 0:
            print('non genre')
            continue
        ref = ''.join(corpus_accessor.get_synopsis_lines(ncode))
        synopsis = ''
        if genre == 'love_story':
            synopsis = love_story_s.generate(ncode)
        elif genre == 'fantasy':
            synopsis = fantasy_s.generate(ncode)
        elif genre == 'literature':
            synopsis = literature_s.generate(ncode)
        elif genre == 'sf':
            synopsis = sf_s.generate(ncode)
        score = rouge.get_scores(wakati(synopsis), wakati(ref), False)[0]['rouge-1']['r']

        file.write(ncode + '\n')
        file.write(genre + '\n')
        file.write('score: ' + str(score) + '\n')
        file.write(ref + '\n\n')
        file.write(synopsis + '\n\n\n')
    file.close()
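# Usage sketch, not part of the original source (assumptions: trained models exist for all four
# genres and the [0, 100) slice of exist_ncodes is illustrative). Writes per-novel ROUGE-1 recall
# scores and synopses to a result file in the working directory.
multi_generate('cos_sim', 0, 100)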
import os
import joblib
import numpy as np
from gensim.models import word2vec

import data_supplier
from data_supplier.active_ncodes_supplier import ncodes_train_test_split
from util.paths import LSTM_TRAINED_MODEL_DIR_PATH
from util.corpus_accessor import CorpusAccessor
from util.paths import WORD_EMBEDDING_MODEL_PATH
from util import text_processor

data_accessor = CorpusAccessor()


class LSTMVectorSupplier:

    def __init__(self,
                 genre='general',
                 importance='cos_sim',
                 use_data_of_position_of_sentence=False,
                 use_data_of_is_serif=False,
                 use_data_of_is_include_person=False,
                 use_data_of_sentence_length=False):
        """
        The features to use are specified when this class is initialized.
        :param word_embedding_avg_vector: bool  average vector of the word embedding vectors
        :param position_of_sentence: bool  position of the sentence in the document
        :param is_serif: bool  whether the sentence is dialogue