def test_run(data_path, model_path, n_show=10):
    """
    Test function: feed posts from the training data to the trained model and
    print the generated reply next to the reference (teacher) response.
    :param data_path: directory of the saved corpus
    :param model_path: path of the trained model file
    :param n_show: number of examples to show
    :return:
    """
    corpus = JaConvCorpus(file_path=None)
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=args.feature_num,
                    hidden_num=args.hidden_num, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # run an interpreter
    for num, input_sentence in enumerate(corpus.rough_posts):
        id_sequence = input_sentence.copy()
        input_sentence.reverse()                                  # encode words in reverse order
        input_sentence.insert(0, corpus.dic.token2id["<eos>"])
        # print(input_sentence)

        model.initialize()                                        # initialize cell
        sentence = model.generate(input_sentence, sentence_limit=len(input_sentence) + 30,
                                  word2id=corpus.dic.token2id, id2word=corpus.dic)
        print("teacher : ", " ".join([corpus.dic[w_id] for w_id in id_sequence]))
        print("correct : ", " ".join([corpus.dic[w_id] for w_id in corpus.rough_cmnts[num]]))
        print("-> ", sentence)
        print('')

        if num == n_show:
            break
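# A minimal, hypothetical invocation sketch for test_run(). It assumes `args`
# (feature_num, hidden_num, gpu) has already been parsed at module level, and it reuses
# the '199_rough.model' checkpoint name that the rough training script below saves;
# substitute whatever corpus directory and checkpoint you actually have.
if __name__ == '__main__':
    test_run(data_path='./data/corpus/', model_path='./data/199_rough.model', n_show=10)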
def main():
    ###########################
    #### create dictionary ####
    ###########################
    if os.path.exists('./data/corpus/dictionary.dict'):
        corpus = JaConvCorpus(file_path=None, batch_size=batchsize, size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        corpus = JaConvCorpus(file_path=data_file, batch_size=batchsize, size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))

    ##########################
    #### create ID corpus ####
    ##########################
    input_mat = []
    output_mat = []
    batch_num = text_num = max_input_ren = max_output_ren = 0
    if not os.path.exists('./data/corpus/input_mat0.npy'):
        for input_text, output_text in zip(corpus.rough_posts, corpus.rough_cmnts):

            # convert to list
            input_text.reverse()                                   # encode words in a reverse order
            input_text.insert(0, corpus.dic.token2id["<eos>"])
            output_text.append(corpus.dic.token2id["<eos>"])

            # update max sentence length
            max_input_ren = max(max_input_ren, len(input_text))
            max_output_ren = max(max_output_ren, len(output_text))

            input_mat.append(input_text)
            output_mat.append(output_text)
            batch_num += 1

            # every 10000 pairs, pad, save the chunk to disk, and free memory
            if batch_num % 10000 == 0:

                # padding
                for li in input_mat:
                    insert_num = max_input_ren - len(li)
                    for _ in range(insert_num):
                        li.insert(0, corpus.dic.token2id['<pad>'])
                for li in output_mat:
                    insert_num = max_output_ren - len(li)
                    for _ in range(insert_num):
                        li.append(corpus.dic.token2id['<pad>'])

                # create batch matrix
                input_mat = np.array(input_mat, dtype=np.int32).T
                output_mat = np.array(output_mat, dtype=np.int32).T

                # save matrix and free memory
                print('save data ... number', text_num)
                np.save('./data/corpus/input_mat' + str(text_num) + '.npy', input_mat)
                np.save('./data/corpus/output_mat' + str(text_num) + '.npy', output_mat)
                text_num += 1

                del input_mat
                del output_mat
                gc.collect()
                input_mat = []
                output_mat = []
    else:
        print('You already have matrix files! '
              'If you want to rebuild the corpus, remove the old files in the "data/corpus" directory '
              'and run this script again.')
def main():
    ###########################
    #### create dictionary ####
    ###########################
    if os.path.exists('./data/corpus/dictionary.dict'):
        if args.lang == 'ja':
            corpus = JaConvCorpus(file_path=None, batch_size=batchsize, size_filter=True)
        else:
            corpus = ConvCorpus(file_path=None, batch_size=batchsize, size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        if args.lang == 'ja':
            corpus = JaConvCorpus(file_path=data_file, batch_size=batchsize, size_filter=True)
        else:
            corpus = ConvCorpus(file_path=data_file, batch_size=batchsize, size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('Emotion size: ', len(corpus.emotion_set))

    # search word_threshold (boundary between general and emotional words)
    ma = 0
    mi = 999999
    for word in corpus.emotion_set:
        wid = corpus.dic.token2id[word]
        if wid > ma:
            ma = wid
        if wid < mi:
            mi = wid
    # print(corpus.dic.token2id['<start>'], corpus.dic.token2id['<eos>'], corpus.dic.token2id['happy'], mi, ma)
    word_threshold = mi

    ######################
    #### create model ####
    ######################
    model = Seq2Seq(all_vocab_size=len(corpus.dic.token2id), emotion_vocab_size=len(corpus.emotion_set),
                    feature_num=feature_num, hidden_num=hidden_num, batch_size=batchsize,
                    label_num=label_num, label_embed_num=label_embed, gpu_flg=args.gpu)
    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################
    input_mat = []
    output_mat = []
    input_mat_rev = []
    label_mat = []
    max_input_ren = max_output_ren = 0
    print('start making corpus matrix...')
    for input_text, output_text in zip(corpus.rough_posts, corpus.rough_cmnts):

        # add an eos tag to the output (the input is reversed later via input_mat_rev)
        output_text.append(corpus.dic.token2id["<eos>"])          # append <eos> at the end of the output

        # update max sentence length
        max_input_ren = max(max_input_ren, len(input_text))
        max_output_ren = max(max_output_ren, len(output_text))

        # make a list of lists
        input_mat.append(input_text)
        output_mat.append(output_text)

        # make label lists  TODO: three-class (neg/neutral/pos) classification
        n_num = p_num = 0
        for word in output_text:
            if corpus.dic[word] in corpus.neg_words:
                n_num += 1
            if corpus.dic[word] in corpus.pos_words:
                p_num += 1
        if (n_num + p_num) == 0:
            label_mat.append([1 for _ in range(len(output_text))])    # neutral
        elif n_num <= p_num:
            label_mat.append([2 for _ in range(len(output_text))])    # positive
        elif n_num > p_num:
            label_mat.append([0 for _ in range(len(output_text))])    # negative
        else:
            raise ValueError

    # make reverse corpus
    for input_text in input_mat:
        input_mat_rev.append(input_text[::-1])

    # padding (pad so every row has the same length: reversed inputs at the head, outputs at the tail)
    print('start padding...')
    for li in input_mat:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in output_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    for li in input_mat_rev:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.insert(0, corpus.dic.token2id['<pad>'])
    for li in label_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])
    if len(output_mat) != len(label_mat):
        print('Output matrix and label matrix should have the same dimension.')
        raise ValueError

    # create batch matrix
    print('transpose...')
    input_mat = np.array(input_mat, dtype=np.int32).T
    input_mat_rev = np.array(input_mat_rev, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T
    label_mat = np.array(label_mat, dtype=np.int32).T

    # separate corpus into Train and Test  TODO: split into train and test data when running experiments
    print('split train and test...')
    train_input_mat = input_mat
    train_output_mat = output_mat
    train_input_mat_rev = input_mat_rev
    train_label_mat = label_mat

    #############################
    #### train seq2seq model ####
    #############################
    accum_loss = 0
    train_loss_data = []
    print('start training...')
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.rough_posts))

        # for training
        for i in range(0, len(corpus.rough_posts), batchsize):

            # select batch data
            input_batch = remove_extra_padding(train_input_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            input_batch_rev = remove_extra_padding(train_input_mat_rev[:, perm[i:i + batchsize]], reverse_flg=True)
            output_batch = remove_extra_padding(train_output_mat[:, perm[i:i + batchsize]], reverse_flg=False)
            label_batch = remove_extra_padding(train_label_mat[:, perm[i:i + batchsize]], reverse_flg=False)

            # Encode a sentence
            model.initialize(batch_size=input_batch.shape[1])         # initialize cell
            model.encode(input_batch, input_batch_rev, train=True)    # encode (output: hidden Variable)

            # Decode from encoded context
            # use the actual batch width here (the last batch can be smaller than batchsize)
            input_ids = xp.array([corpus.dic.token2id["<start>"] for _ in range(input_batch.shape[1])])
            for w_ids, l_ids in zip(output_batch, label_batch):
                loss, predict_mat = model.decode(input_ids, w_ids, label_id=l_ids,
                                                 word_th=word_threshold, train=True)
                input_ids = w_ids
                accum_loss += loss

            # learn model
            model.cleargrads()          # initialize all grad to zero
            accum_loss.backward()       # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch: ', num, 'Batch_num', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0
        train_loss_data.append(float(total_loss / batch_num))

        # save model and optimizer
        print('-----', epoch + 1, ' times -----')
        print('save the model and optimizer')
        serializers.save_hdf5('data/' + str(epoch) + '.model', model)
        serializers.save_hdf5('data/' + str(epoch) + '.state', optimizer)

    # save loss data
    with open('./data/loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
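# remove_extra_padding() is used above but not defined in this listing. Below is a
# minimal sketch of what such a helper might look like, assuming it trims rows of a
# (max_len, batch_size) matrix that consist only of <pad> ids; passing pad_id
# explicitly is an assumption of this sketch (the project's version may look it up
# from the dictionary instead), hence the _sketch suffix.
import numpy as np

def remove_extra_padding_sketch(batch_mat, pad_id, reverse_flg=False):
    """Drop leading (reverse_flg=True) or trailing rows that are entirely padding.

    batch_mat: int32 array of shape (max_len, batch_size).
    """
    keep = ~np.all(batch_mat == pad_id, axis=1)    # rows that still contain real tokens
    if not keep.any():
        return batch_mat
    idx = np.where(keep)[0]
    if reverse_flg:
        return batch_mat[idx[0]:]                  # padding sits at the top (reversed inputs)
    return batch_mat[:idx[-1] + 1]                 # padding sits at the bottom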
def interpreter(data_path, model_path):
    """
    Run this function to talk to the seq2seq model interactively.
    Type "exit" to end the conversation.
    :param data_path: path of the corpus the model was trained on
    :param model_path: path of the trained model file
    :return:
    """
    # call dictionary class
    if args.lang == 'ja':
        corpus = JaConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    else:
        print('You gave a wrong language argument to this system. Check your language option (args.lang).')
        raise ValueError
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=args.feature_num,
                    hidden_num=args.hidden_num, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # load word2vec model
    sim_th = 50
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(W2V_MODEL_PATH, binary=False)

    # run conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end the talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        if args.lang == 'en':
            input_vocab = [unicodedata.normalize('NFKC', word.lower()) for word in word_tokenize(sentence)]
        elif args.lang == 'ja':
            input_vocab = parse_ja_text(sentence)
        input_vocab.reverse()
        input_vocab.insert(0, "<eos>")
        # print(input_vocab)

        # convert word into ID
        input_sentence = []
        for word in input_vocab:
            if corpus.dic.token2id.get(word) is not None:
                input_sentence.append(corpus.dic.token2id.get(word))
            else:
                # fall back to the most similar in-vocabulary word (word2vec), otherwise <unk>
                try:
                    sim_words = w2v_model.most_similar(positive=[word], topn=sim_th)
                    for index, candidate_tuple in enumerate(sim_words):
                        if corpus.dic.token2id.get(candidate_tuple[0]) is not None:
                            input_sentence.append(corpus.dic.token2id.get(candidate_tuple[0]))
                            break
                        if index == sim_th - 1:
                            input_sentence.append(corpus.dic.token2id['<unk>'])
                except KeyError:
                    input_sentence.append(corpus.dic.token2id['<unk>'])
        # print(input_sentence)

        # input a sentence into model
        model.initialize()              # initialize cell
        hypotheses = model.beam_search(model.initial_state_function, model.generate_function,
                                       input_sentence,
                                       start_id=corpus.dic.token2id['<start>'],
                                       end_id=corpus.dic.token2id['<eos>'],
                                       word2id=corpus.dic.token2id, id2word=corpus.dic)
        for hypothesis in hypotheses:
            generated_indices = hypothesis.to_sequence_of_values()
            generated_tokens = [corpus.dic[i] for i in generated_indices]
            print("--> ", " ".join(generated_tokens))
        print("-> ", sentence)
        print('')
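# parse_ja_text() is imported from elsewhere in this project and not shown here.
# Below is a minimal sketch of a Japanese tokenizer with the same interface, assuming
# the MeCab Python binding is installed; the project's actual tokenizer and
# normalization may differ, hence the _sketch suffix.
import MeCab

_tagger = MeCab.Tagger('-Owakati')    # word-segmentation ("wakati-gaki") output

def parse_ja_text_sketch(text):
    """Split a Japanese sentence into a list of surface tokens."""
    return _tagger.parse(text).strip().split()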
def main():
    ###########################
    #### create dictionary ####
    ###########################
    if os.path.exists('./data/corpus/dictionary.dict'):
        corpus = JaConvCorpus(file_path=None, batch_size=batchsize, size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        corpus = JaConvCorpus(file_path=data_file, batch_size=batchsize, size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))

    ######################
    #### create model ####
    ######################
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=feature_num,
                    hidden_num=hidden_num, batch_size=batchsize, gpu_flg=args.gpu)
    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(5))
    # optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################
    test_input_mat = []
    test_output_mat = []
    train_input_mats = []
    train_output_mats = []
    if not os.path.exists('./data/corpus/input_mat0.npy'):
        print("You don't have any input matrix. You should run 'preprocess.py' before you run this script.")
        raise ValueError
    else:
        for index, text_name in enumerate(glob.glob('data/corpus/input_mat*')):
            batch_input_mat = np.load(text_name)
            if index == 0:
                # separate corpus into Train and Test
                perm = np.random.permutation(batch_input_mat.shape[1])
                test_input_mat = batch_input_mat[:, perm[0:0 + testsize]]
                train_input_mats.append(batch_input_mat[:, perm[testsize:]])
            else:
                train_input_mats.append(batch_input_mat)
        for index, text_name in enumerate(glob.glob('data/corpus/output_mat*')):
            batch_output_mat = np.load(text_name)
            if index == 0:
                # separate corpus into Train and Test
                test_output_mat = batch_output_mat[:, perm[0:0 + testsize]]
                train_output_mats.append(batch_output_mat[:, perm[testsize:]])
            else:
                train_output_mats.append(batch_output_mat)

    list_of_references = []
    for text_ndarray in test_output_mat.T:
        reference = text_ndarray.tolist()
        references = [[w_id for w_id in reference if w_id != -1]]
        list_of_references.append(references)

    #############################
    #### train seq2seq model ####
    #############################
    matrix_row_size = train_input_mats[0].shape[1] - testsize
    accum_loss = 0
    train_loss_data = []
    test_loss_data = []
    bleu_score_data = []
    wer_score_data = []
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = test_loss = batch_num = 0

        # for training by each corpus matrix
        for mat_index in range(len(train_input_mats)):
            perm = np.random.permutation(matrix_row_size)

            # by each batch size
            for i in range(0, matrix_row_size, batchsize):

                # select batch data
                input_batch = train_input_mats[mat_index][:, perm[i:i + batchsize]]
                output_batch = train_output_mats[mat_index][:, perm[i:i + batchsize]]

                # Encode a sentence
                model.initialize()                     # initialize cell
                model.encode(input_batch, train=True)  # encode (output: hidden Variable)

                # Decode from encoded context
                end_batch = xp.array([corpus.dic.token2id["<start>"] for _ in range(batchsize)])
                first_words = output_batch[0]
                loss, predict_mat = model.decode(end_batch, first_words, train=True)
                next_ids = first_words
                accum_loss += loss
                for w_ids in output_batch[1:]:
                    loss, predict_mat = model.decode(next_ids, w_ids, train=True)
                    next_ids = w_ids
                    accum_loss += loss

                # learn model
                model.cleargrads()
                accum_loss.backward()
                # accum_loss.unchain_backward()
                optimizer.update()
                total_loss += float(accum_loss.data)
                print('Epoch: ', num, 'Matrix_num: ', mat_index, 'Batch_num', batch_num,
                      'batch loss: {:.2f}'.format(float(accum_loss.data)))
                batch_num += 1
                accum_loss = 0

        # # for testing by 1 epoch
        # list_of_hypotheses = []
        # for i in range(0, testsize, batchsize):
        #
        #     # select test batch data
        #     input_batch = test_input_mat[:, i:i + batchsize]
        #     output_batch = test_output_mat[:, i:i + batchsize]
        #
        #     # Encode a sentence
        #     model.initialize()                     # initialize cell
        #     model.encode(input_batch, train=True)  # encode (output: hidden Variable)
        #
        #     # Decode from encoded context
        #     end_batch = xp.array([corpus.dic.token2id["<start>"] for _ in range(batchsize)])
        #     first_words = output_batch[0]
        #     loss, predict_mat = model.decode(end_batch, first_words, train=True)
        #     next_ids = xp.argmax(predict_mat.data, axis=1)
        #     test_loss += loss
        #     if args.gpu >= 0:
        #         hypotheses = [cuda.to_cpu(next_ids)]
        #     else:
        #         hypotheses = [next_ids]
        #     for w_ids in output_batch[1:]:
        #         loss, predict_mat = model.decode(next_ids, w_ids, train=True)
        #         next_ids = xp.argmax(predict_mat.data, axis=1)
        #         test_loss += loss
        #         if args.gpu >= 0:
        #             hypotheses.append(cuda.to_cpu(next_ids))
        #         else:
        #             hypotheses.append(next_ids)
        #
        #     # collect hypotheses for calculating BLEU score
        #     hypotheses = np.array(hypotheses).T
        #     for hypothesis in hypotheses:
        #         text_list = hypothesis.tolist()
        #         list_of_hypotheses.append([w_id for w_id in text_list if w_id != -1])
        #
        # # calculate BLEU score from test (develop) data
        # bleu_score = nltk.translate.bleu_score.corpus_bleu(list_of_references, list_of_hypotheses,
        #                                                    weights=(0.25, 0.25, 0.25, 0.25))
        # bleu_score_data.append(bleu_score)
        # print('Epoch: ', num, 'BLEU SCORE: ', bleu_score)
        #
        # # calculate WER score from test (develop) data
        # wer_score = 0
        # for index, references in enumerate(list_of_references):
        #     wer_score += wer(references[0], list_of_hypotheses[index])
        # wer_score /= len(list_of_references)
        # wer_score_data.append(wer_score)
        # print('Epoch: ', num, 'WER SCORE: ', wer_score)
        #
        # # evaluate a test loss
        # check_loss = test_loss_data[-10:]        # check out the last 10 loss data
        # end_flg = [j for j in range(len(check_loss) - 1) if check_loss[j] < check_loss[j + 1]]
        # if len(end_flg) > 9:
        #     print('It is probably over-fitting, so stop learning...')
        #     break

        # save model and optimizer
        if (epoch + 1) % 10 == 0:
            print('-----', epoch + 1, ' times -----')
            print('save the model and optimizer')
            serializers.save_hdf5('data/' + str(epoch) + '_rough.model', model)
            serializers.save_hdf5('data/' + str(epoch) + '_rough.state', optimizer)

        # display the on-going status
        # (test_loss stays a plain number while the evaluation block above is commented out)
        print('Epoch: ', num,
              'Train loss: {:.2f}'.format(total_loss),
              'Test loss: {:.2f}'.format(float(test_loss)))
        train_loss_data.append(float(total_loss / batch_num))
        test_loss_data.append(float(test_loss))

    # save loss data
    with open('./data/rough_loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
    with open('./data/rough_loss_test_data.pkl', 'wb') as f:
        pickle.dump(test_loss_data, f)
    with open('./data/rough_bleu_score_data.pkl', 'wb') as f:
        pickle.dump(bleu_score_data, f)
    with open('./data/rough_wer_score_data.pkl', 'wb') as f:
        pickle.dump(wer_score_data, f)
def main():
    ###########################
    #### create dictionary ####
    ###########################
    if os.path.exists('./data/corpus/dictionary.dict'):
        corpus = JaConvCorpus(file_path=None, batch_size=batchsize, size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        corpus = JaConvCorpus(file_path=data_file, batch_size=batchsize, size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))

    ##################################
    #### create model (copy data) ####
    ##################################
    rough_model = './data/199_rough.model'
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=feature_num,
                    hidden_num=hidden_num, batch_size=batchsize, gpu_flg=args.gpu)
    serializers.load_hdf5(rough_model, model)
    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(5))
    # optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################
    input_mat = []
    output_mat = []
    max_input_ren = max_output_ren = 0
    for input_text, output_text in zip(corpus.fine_posts, corpus.fine_cmnts):

        # convert to list
        input_text.reverse()                                   # encode words in a reverse order
        input_text.insert(0, corpus.dic.token2id["<eos>"])
        output_text.append(corpus.dic.token2id["<eos>"])

        # update max sentence length
        max_input_ren = max(max_input_ren, len(input_text))
        max_output_ren = max(max_output_ren, len(output_text))

        input_mat.append(input_text)
        output_mat.append(output_text)

    # padding
    for li in input_mat:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.insert(0, corpus.dic.token2id['<pad>'])
    for li in output_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])

    # create batch matrix
    input_mat = np.array(input_mat, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T

    # separate corpus into Train and Test
    perm = np.random.permutation(len(corpus.fine_posts))
    test_input_mat = input_mat[:, perm[0:0 + testsize]]
    test_output_mat = output_mat[:, perm[0:0 + testsize]]
    train_input_mat = input_mat[:, perm[testsize:]]
    train_output_mat = output_mat[:, perm[testsize:]]

    list_of_references = []
    for text_ndarray in test_output_mat.T:
        reference = text_ndarray.tolist()
        references = [[w_id for w_id in reference if w_id != -1]]
        list_of_references.append(references)

    #############################
    #### train seq2seq model ####
    #############################
    accum_loss = 0
    train_loss_data = []
    test_loss_data = []
    bleu_score_data = []
    wer_score_data = []
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = test_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.fine_posts) - testsize)

        # for training
        for i in range(0, len(corpus.fine_posts) - testsize, batchsize):

            # select batch data
            input_batch = train_input_mat[:, perm[i:i + batchsize]]
            output_batch = train_output_mat[:, perm[i:i + batchsize]]

            # Encode a sentence
            model.initialize()                     # initialize cell
            model.encode(input_batch, train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            end_batch = xp.array([corpus.dic.token2id["<start>"] for _ in range(batchsize)])
            first_words = output_batch[0]
            loss, predict_mat = model.decode(end_batch, first_words, train=True)
            next_ids = first_words
            accum_loss += loss
            for w_ids in output_batch[1:]:
                loss, predict_mat = model.decode(next_ids, w_ids, train=True)
                next_ids = w_ids
                accum_loss += loss

            # learn model
            model.cleargrads()                     # initialize all grad to zero
            accum_loss.backward()                  # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch: ', num, 'Batch_num', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0

        # for testing
        list_of_hypotheses = []
        for i in range(0, testsize, batchsize):

            # select test batch data
            input_batch = test_input_mat[:, i:i + batchsize]
            output_batch = test_output_mat[:, i:i + batchsize]

            # Encode a sentence
            model.initialize()                     # initialize cell
            model.encode(input_batch, train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            end_batch = xp.array([corpus.dic.token2id["<start>"] for _ in range(batchsize)])
            first_words = output_batch[0]
            loss, predict_mat = model.decode(end_batch, first_words, train=True)
            next_ids = xp.argmax(predict_mat.data, axis=1)
            test_loss += float(loss.data)          # accumulate the test loss as a plain float
            if args.gpu >= 0:
                hypotheses = [cuda.to_cpu(next_ids)]
            else:
                hypotheses = [next_ids]
            for w_ids in output_batch[1:]:
                loss, predict_mat = model.decode(next_ids, w_ids, train=True)
                next_ids = xp.argmax(predict_mat.data, axis=1)
                test_loss += float(loss.data)
                if args.gpu >= 0:
                    hypotheses.append(cuda.to_cpu(next_ids))
                else:
                    hypotheses.append(next_ids)

            # collect hypotheses for calculating BLEU score
            hypotheses = np.array(hypotheses).T
            for hypothesis in hypotheses:
                text_list = hypothesis.tolist()
                list_of_hypotheses.append([w_id for w_id in text_list if w_id != -1])

        # calculate BLEU score from test (develop) data
        bleu_score = nltk.translate.bleu_score.corpus_bleu(list_of_references, list_of_hypotheses,
                                                           weights=(0.25, 0.25, 0.25, 0.25))
        bleu_score_data.append(bleu_score)
        print('Epoch: ', num, 'BLEU SCORE: ', bleu_score)

        # calculate WER score from test (develop) data
        wer_score = 0
        for index, references in enumerate(list_of_references):
            wer_score += wer(references[0], list_of_hypotheses[index])
        wer_score /= len(list_of_references)
        wer_score_data.append(wer_score)
        print('Epoch: ', num, 'WER SCORE: ', wer_score)

        # save model and optimizer
        if (epoch + 1) % 10 == 0:
            print('-----', epoch + 1, ' times -----')
            print('save the model and optimizer')
            serializers.save_hdf5('data/' + str(epoch) + '_fine.model', model)
            serializers.save_hdf5('data/' + str(epoch) + '_fine.state', optimizer)

        # display the on-going status
        print('Epoch: ', num,
              'Train loss: {:.2f}'.format(total_loss),
              'Test loss: {:.2f}'.format(float(test_loss)))
        train_loss_data.append(float(total_loss / batch_num))
        test_loss_data.append(float(test_loss))

        # evaluate the test loss
        check_loss = test_loss_data[-10:]          # check out the last 10 loss data
        end_flg = [j for j in range(len(check_loss) - 1) if check_loss[j] < check_loss[j + 1]]
        if len(end_flg) > 9:
            print('It is probably over-fitting, so stop learning...')
            break

    # save loss data
    with open('./data/fine_loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
    with open('./data/fine_loss_test_data.pkl', 'wb') as f:
        pickle.dump(test_loss_data, f)
    with open('./data/fine_bleu_score_data.pkl', 'wb') as f:
        pickle.dump(bleu_score_data, f)
    with open('./data/fine_wer_score_data.pkl', 'wb') as f:
        pickle.dump(wer_score_data, f)
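# wer() is referenced above but not defined in this listing. Below is a minimal
# sketch, assuming the usual definition of word error rate: the Levenshtein distance
# between the reference and hypothesis token (or id) sequences, divided by the
# reference length. The project's own implementation may differ, hence the _sketch suffix.
def wer_sketch(reference, hypothesis):
    """Word error rate between two sequences of word ids (or tokens)."""
    # dynamic-programming edit distance table
    d = [[0] * (len(hypothesis) + 1) for _ in range(len(reference) + 1)]
    for i in range(len(reference) + 1):
        d[i][0] = i
    for j in range(len(hypothesis) + 1):
        d[0][j] = j
    for i in range(1, len(reference) + 1):
        for j in range(1, len(hypothesis) + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[len(reference)][len(hypothesis)] / max(len(reference), 1)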
def main():
    ###########################
    #### create dictionary ####
    ###########################
    if os.path.exists('./data/corpus/dictionary.dict'):
        corpus = JaConvCorpus(file_path=None, batch_size=batchsize, size_filter=True)
        corpus.load(load_dir='./data/corpus/')
    else:
        corpus = JaConvCorpus(file_path=data_file, batch_size=batchsize, size_filter=True)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))

    ##################################
    #### create model (copy data) ####
    ##################################
    rough_model = './data/199_rough.model'
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=feature_num,
                    hidden_num=hidden_num, batch_size=batchsize, gpu_flg=args.gpu)
    serializers.load_hdf5(rough_model, model)
    if args.gpu >= 0:
        model.to_gpu()

    ##########################
    #### create ID corpus ####
    ##########################
    input_mat = []
    output_mat = []
    max_input_ren = max_output_ren = 0
    for input_text, output_text in zip(corpus.fine_posts, corpus.fine_cmnts):

        # convert to list
        input_text.reverse()                                   # encode words in a reverse order
        input_text.insert(0, corpus.dic.token2id["<eos>"])
        output_text.append(corpus.dic.token2id["<eos>"])

        # update max sentence length
        max_input_ren = max(max_input_ren, len(input_text))
        max_output_ren = max(max_output_ren, len(output_text))

        input_mat.append(input_text)
        output_mat.append(output_text)

    # padding
    for li in input_mat:
        insert_num = max_input_ren - len(li)
        for _ in range(insert_num):
            li.insert(0, corpus.dic.token2id['<pad>'])
    for li in output_mat:
        insert_num = max_output_ren - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])

    # create batch matrix
    input_mat = np.array(input_mat, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T

    # use the whole corpus for training (no train/test split in this script)
    train_input_mat = input_mat
    train_output_mat = output_mat

    #############################
    #### train seq2seq model ####
    #############################
    accum_loss = 0
    train_loss_data = []
    for num, epoch in enumerate(range(n_epoch)):
        total_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.fine_posts) - testsize)

        # initialize optimizer
        optimizer = optimizers.Adam(alpha=0.001)
        optimizer.setup(model)
        # optimizer.add_hook(chainer.optimizer.GradientClipping(5))
        optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

        # for training
        for i in range(0, len(corpus.fine_posts) - testsize, batchsize):

            # select batch data
            input_batch = train_input_mat[:, perm[i:i + batchsize]]
            output_batch = train_output_mat[:, perm[i:i + batchsize]]

            # Encode a sentence
            model.initialize()                     # initialize cell
            model.encode(input_batch, train=True)  # encode (output: hidden Variable)

            # Decode from encoded context
            end_batch = xp.array([corpus.dic.token2id["<start>"] for _ in range(batchsize)])
            first_words = output_batch[0]
            loss, predict_mat = model.decode(end_batch, first_words, train=True)
            next_ids = first_words
            accum_loss += loss
            for w_ids in output_batch[1:]:
                loss, predict_mat = model.decode(next_ids, w_ids, train=True)
                next_ids = w_ids
                accum_loss += loss

            # learn model
            model.cleargrads()                     # initialize all grad to zero
            accum_loss.backward()                  # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1                         # count batches for the per-epoch average
            print('Epoch: ', num, 'Batch_num', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0
        train_loss_data.append(float(total_loss / batch_num))   # record the average training loss for this epoch

        # save model and optimizer
        if (epoch + 1) % 5 == 0:
            print('-----', epoch + 1, ' times -----')
            print('save the model and optimizer')
            serializers.save_hdf5('data/' + str(epoch) + '_fine.model', model)
            serializers.save_hdf5('data/' + str(epoch) + '_fine.state', optimizer)

    # save loss data
    with open('./data/fine_loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)