def test_run(data_path, model_path, n_show=10):
    """
    Test function.
    The input is training data; the output should reproduce the correct
    (teacher) sentence seen during the training phase.
    :return:
    """
    corpus = ConvCorpus(file_path=None)
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild the seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=args.feature_num,
                    hidden_num=args.hidden_num, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # run an interpreter over the training posts
    for num, input_sentence in enumerate(corpus.posts):
        id_sequence = input_sentence.copy()
        # input_sentence.reverse()
        # input_sentence.insert(0, corpus.dic.token2id["<eos>"])

        model.initialize()  # initialize the cell state
        sentence = model.generate(input_sentence, sentence_limit=len(input_sentence) + 30,
                                  word2id=corpus.dic.token2id, id2word=corpus.dic)
        print("teacher :", " ".join([corpus.dic[w_id] for w_id in id_sequence]))
        print("correct :", " ".join([corpus.dic[w_id] for w_id in corpus.cmnts[num]]))
        print("-> ", sentence)
        print('')

        if num == n_show:
            break
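# Usage sketch for test_run (the paths below are hypothetical examples, not
# files shipped with this repository; pass a corpus directory and a trained
# .model file saved by the training script):
#
#     test_run('./data/corpus/', './data/9.model', n_show=5)
#
# It replays the first few training posts through the model so the generated
# reply can be compared against the teacher sentence.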
def show_heatmap(data_path, model_path):
    import seaborn
    import matplotlib
    import matplotlib.pyplot as plt
    seaborn.set()
    matplotlib.rc('font', family='sans-serif')

    # call the dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    elif args.lang == 'ja':
        corpus = JaConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    else:
        print('You gave a wrong language argument. Check the language option.')
        raise ValueError
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild the seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=args.feature_num,
                    hidden_num=args.hidden_num, label_num=args.label_num,
                    label_embed_num=args.label_embed, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # sentiment label matrix in the decoder
    sentiment_mat = model.dec.le.W.data
    cmap = seaborn.diverging_palette(220, 10, as_cmap=True)  # generate a custom diverging colormap
    seaborn.heatmap(sentiment_mat, cmap=cmap, center=0, linewidths=.5,
                    xticklabels=False)  # square=True, cbar_kws={"orientation": "horizontal"})
    plt.xlabel("Dimension (=" + str(sentiment_mat.shape[1]) + ")")
    plt.ylabel("Sentiment")
    plt.savefig('./data/sentiment_matrix.png')

    # encoder word-embedding matrix
    # (use `.data`, draw on a fresh figure, and save under a different name so
    # the sentiment heatmap above is not overwritten)
    encode_mat = model.enc.xe.W.data
    plt.figure()
    seaborn.heatmap(encode_mat, cmap=cmap, center=0, linewidths=.5,
                    xticklabels=False)  # square=True, cbar_kws={"orientation": "horizontal"})
    plt.xlabel("Dimension (=" + str(encode_mat.shape[1]) + ")")
    plt.ylabel("Word ID")
    plt.savefig('./data/encoder_embedding_matrix.png')
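# A minimal, self-contained sketch of the heatmap pattern used above, with a
# random matrix standing in for the learned sentiment/embedding weights
# (illustration only; the shape and output path are assumptions):
def _heatmap_sketch(out_path='./data/heatmap_example.png'):
    import numpy
    import seaborn
    import matplotlib.pyplot as plt
    mat = numpy.random.randn(2, 128)  # e.g. 2 sentiment labels x 128 dimensions
    cmap = seaborn.diverging_palette(220, 10, as_cmap=True)
    plt.figure()
    seaborn.heatmap(mat, cmap=cmap, center=0, linewidths=.5, xticklabels=False)
    plt.xlabel("Dimension (=" + str(mat.shape[1]) + ")")
    plt.ylabel("Sentiment")
    plt.savefig(out_path)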
def interpreter(data_path, model_path):
    """
    Run this function to talk to the seq2seq model.
    Type "exit" to end the conversation.
    :param data_path: the path of the corpus the model was trained on
    :param model_path: the path of the trained model
    :return:
    """
    # call the dictionary class
    corpus = ConvCorpus(file_path=None)
    corpus.load(load_dir=data_path)
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild the seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=args.feature_num,
                    hidden_num=args.hidden_num, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # run the conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end the talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        # normalize and tokenize the input, reversing it for the encoder
        input_vocab = [unicodedata.normalize('NFKC', word.lower())
                       for word in word_tokenize(sentence)]
        input_vocab.reverse()
        input_vocab.insert(0, "<eos>")

        # convert words into IDs, skipping out-of-vocabulary words
        input_sentence = [corpus.dic.token2id[word] for word in input_vocab
                          if corpus.dic.token2id.get(word) is not None]

        model.initialize()  # initialize the cell state
        sentence = model.generate(input_sentence, sentence_limit=len(input_sentence) + 30,
                                  word2id=corpus.dic.token2id, id2word=corpus.dic)
        print("-> ", sentence)
        print('')
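# A minimal sketch of the encoder-side preprocessing done in interpreter(),
# using str.split() and a toy dictionary in place of nltk's word_tokenize and
# the corpus dictionary (both substitutions are assumptions, for illustration only):
def _encode_input_sketch(sentence, token2id):
    import unicodedata
    words = [unicodedata.normalize('NFKC', w.lower()) for w in sentence.split()]
    words.reverse()                # the encoder reads the input in reverse order
    words.insert(0, '<eos>')       # prepend the end-of-sentence marker
    return [token2id[w] for w in words if w in token2id]  # drop OOV words

# e.g. _encode_input_sketch('Hello world', {'<eos>': 0, 'hello': 1, 'world': 2})
# -> [0, 2, 1]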
def main():
    ###########################
    #### create dictionary ####
    ###########################
    if os.path.exists('./data/corpus/dictionary.dict'):
        corpus = ConvCorpus(file_path=None, batch_size=batchsize)
        corpus.load(load_dir='./data/corpus/')
    else:
        corpus = ConvCorpus(file_path=args.data, batch_size=batchsize)
        corpus.save(save_dir='./data/corpus/')
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))

    ######################
    #### create model ####
    ######################
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=feature_num,
                    hidden_num=hidden_num, batch_size=batchsize, gpu_flg=args.gpu)
    if args.gpu >= 0:
        model.to_gpu()
    optimizer = optimizers.Adam(alpha=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(5))
    # optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    ##########################
    #### create ID corpus ####
    ##########################
    input_mat = []
    output_mat = []
    max_input_len = max_output_len = 0
    for input_text, output_text in zip(corpus.posts, corpus.cmnts):
        # convert to an ID list
        input_text.reverse()  # encode words in reverse order
        input_text.insert(0, corpus.dic.token2id["<eos>"])
        output_text.append(corpus.dic.token2id["<eos>"])

        # update the maximum sentence lengths
        max_input_len = max(max_input_len, len(input_text))
        max_output_len = max(max_output_len, len(output_text))

        input_mat.append(input_text)
        output_mat.append(output_text)

    # padding: pad inputs at the front, outputs at the back
    for li in input_mat:
        insert_num = max_input_len - len(li)
        for _ in range(insert_num):
            li.insert(0, corpus.dic.token2id['<pad>'])
    for li in output_mat:
        insert_num = max_output_len - len(li)
        for _ in range(insert_num):
            li.append(corpus.dic.token2id['<pad>'])

    # create the batch matrices (time-major: shape = (max_len, corpus_size))
    input_mat = np.array(input_mat, dtype=np.int32).T
    output_mat = np.array(output_mat, dtype=np.int32).T

    # separate the corpus into train and test sets
    perm = np.random.permutation(len(corpus.posts))
    test_input_mat = input_mat[:, perm[0:batchsize]]
    test_output_mat = output_mat[:, perm[0:batchsize]]
    train_input_mat = input_mat[:, perm[batchsize:]]
    train_output_mat = output_mat[:, perm[batchsize:]]

    #############################
    #### train seq2seq model ####
    #############################
    accum_loss = 0
    train_loss_data = []
    test_loss_data = []
    for epoch in range(n_epoch):
        total_loss = test_loss = 0
        batch_num = 0
        perm = np.random.permutation(len(corpus.posts) - batchsize)
        for i in range(0, len(corpus.posts) - batchsize, batchsize):
            # select batch data
            input_batch = train_input_mat[:, perm[i:i + batchsize]]
            output_batch = train_output_mat[:, perm[i:i + batchsize]]

            # encode a sentence
            model.initialize()  # initialize the cell state
            model.encode(input_batch, train=True)  # encode (output: hidden Variable)

            # decode from the encoded context (teacher forcing: the gold word
            # is fed back as the next input)
            end_batch = xp.array([corpus.dic.token2id["<start>"] for _ in range(batchsize)])
            first_words = output_batch[0]
            loss, predict_mat = model.decode(end_batch, first_words, train=True)
            next_ids = first_words
            accum_loss += loss
            for w_ids in output_batch[1:]:
                loss, predict_mat = model.decode(next_ids, w_ids, train=True)
                next_ids = w_ids
                accum_loss += loss

            # update the model
            model.cleargrads()  # set all gradients to zero
            accum_loss.backward()  # back propagation
            optimizer.update()
            total_loss += float(accum_loss.data)
            batch_num += 1
            print('Epoch: ', epoch, 'Batch_num', batch_num,
                  'batch loss: {:.2f}'.format(float(accum_loss.data)))
            accum_loss = 0
        else:
            # this `else` belongs to the `for` loop: once all training batches
            # are done, evaluate on the held-out test batch
            input_batch = test_input_mat
            output_batch = test_output_mat

            # encode a sentence
            model.initialize()  # initialize the cell state
            model.encode(input_batch, train=True)  # encode (output: hidden Variable)

            # decode from the encoded context, feeding back the model's own predictions
            end_batch = xp.array([corpus.dic.token2id["<start>"] for _ in range(batchsize)])
            first_words = output_batch[0]
            loss, predict_mat = model.decode(end_batch, first_words, train=True)
            next_ids = xp.argmax(predict_mat.data, axis=1)
            test_loss += loss
            for w_ids in output_batch[1:]:
                loss, predict_mat = model.decode(next_ids, w_ids, train=True)
                next_ids = xp.argmax(predict_mat.data, axis=1)
                test_loss += loss

        # save the model and optimizer
        if (epoch + 1) % 10 == 0:
            print('-----', epoch + 1, ' times -----')
            print('save the model and optimizer')
            serializers.save_hdf5('data/' + str(epoch) + '.model', model)
            serializers.save_hdf5('data/' + str(epoch) + '.state', optimizer)

        # display the on-going status
        print('Epoch: ', epoch,
              'Train loss: {:.2f}'.format(total_loss),
              'Test loss: {:.2f}'.format(float(test_loss.data)))
        train_loss_data.append(float(total_loss / batch_num))
        test_loss_data.append(float(test_loss.data))

        # early stopping: check whether the test loss keeps rising
        check_loss = test_loss_data[-10:]  # look at the last 10 loss values
        end_flg = [j for j in range(len(check_loss) - 1)
                   if check_loss[j] < check_loss[j + 1]]
        if len(end_flg) > 7:
            print('The test loss keeps rising; the model is probably over-fitting, '
                  'so training stops here.')
            break

    # save loss data
    with open('./data/loss_train_data.pkl', 'wb') as f:
        pickle.dump(train_loss_data, f)
    with open('./data/loss_test_data.pkl', 'wb') as f:
        pickle.dump(test_loss_data, f)
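# A self-contained sketch of the padding and time-major batching performed in
# main(), with plain integers standing in for real token IDs (illustration
# only; the PAD value and example sentences are assumptions):
def _pad_and_transpose_sketch():
    import numpy as np
    PAD = 0
    inputs = [[5, 6], [7, 8, 9]]   # already reversed, <eos>-prefixed ID lists
    max_len = max(len(s) for s in inputs)
    # inputs are padded at the front so the last real word sits next to the decoder
    padded = [[PAD] * (max_len - len(s)) + s for s in inputs]
    # transpose to time-major form: row t holds the t-th token of every sentence
    batch = np.array(padded, dtype=np.int32).T
    return batch                   # shape = (max_len, batch_size) = (3, 2)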
def interpreter(data_path, model_path):
    """
    Run this function to talk to the seq2seq model.
    Type "exit" to end the conversation.
    :param data_path: the path of the corpus the model was trained on
    :param model_path: the path of the trained model
    :return:
    """
    # call the dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    elif args.lang == 'ja':
        corpus = JaConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    else:
        print('You gave a wrong language argument. Check the language option.')
        raise ValueError
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild the seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=args.feature_num,
                    hidden_num=args.hidden_num, label_num=args.label_num,
                    label_embed_num=args.label_embed, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # run the conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end the talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        # convert the input into a word list
        if args.lang == 'en':
            input_vocab = [unicodedata.normalize('NFKC', word.lower())
                           for word in word_tokenize(sentence)]
        elif args.lang == 'ja':
            input_vocab = parse_ja_text(sentence)
        else:
            print("Sorry, but your language is not supported...")
            raise ValueError

        # check the sentiment tag ("1" = negative, "2" = positive)
        label_id = -1
        if len(input_vocab) == 0:
            print('caution: you did not type any words!')
            continue
        elif input_vocab[-1] == '2':
            del input_vocab[-1]
            label_id = 1
        elif input_vocab[-1] == '1':
            del input_vocab[-1]
            label_id = 0
        else:
            print('caution: you did not set a sentiment tag!')
            break

        # input_vocab.reverse()
        # input_vocab.insert(0, "<eos>")

        # convert words into IDs, skipping out-of-vocabulary words
        input_sentence = [corpus.dic.token2id[word] for word in input_vocab
                          if corpus.dic.token2id.get(word) is not None]

        model.initialize()  # initialize the cell state
        sentence = model.generate(input_sentence, sentence_limit=len(input_sentence) + 30,
                                  word2id=corpus.dic.token2id, id2word=corpus.dic,
                                  label_id=label_id)
        print("-> ", sentence)
        print('')
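# A minimal sketch of the trailing sentiment-tag convention used in
# interpreter(): the user appends "1" (negative) or "2" (positive) to the
# utterance, and the tag is stripped off and mapped to a label ID
# (illustration only; the helper is hypothetical, not part of the pipeline):
def _split_sentiment_tag_sketch(words):
    if words and words[-1] == '2':
        return words[:-1], 1   # positive label
    if words and words[-1] == '1':
        return words[:-1], 0   # negative label
    return words, -1           # no tag given

# e.g. _split_sentiment_tag_sketch(['nice', 'to', 'meet', 'you', '2'])
# -> (['nice', 'to', 'meet', 'you'], 1)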
def calculate_embedding_vectors(data_path, model_path):
    # call the dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    elif args.lang == 'ja':
        corpus = JaConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    else:
        print('You gave a wrong language argument. Check the language option.')
        raise ValueError
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild the seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=args.feature_num,
                    hidden_num=args.hidden_num, label_num=args.label_num,
                    label_embed_num=args.label_embed, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # get the embedding vectors
    embed_mat = model.dec.ye.W.data
    sentiment_mat = model.dec.le.W.data
    neg_vec = np.array([sentiment_mat[0, :]])
    pos_vec = np.array([sentiment_mat[1, :]])

    # calculate the cosine similarity of every word to the two sentiment tags
    neg_sim_dic = {}
    pos_sim_dic = {}
    for i in range(embed_mat.shape[0]):
        word_vec = np.array([embed_mat[i, :]])
        neg_sim_dic[i] = cosine_similarity(word_vec, neg_vec)
        pos_sim_dic[i] = cosine_similarity(word_vec, pos_vec)
        # if cosine_similarity(word_vec, pos_vec) > cosine_similarity(word_vec, neg_vec):
        #     print('pos: ', corpus.dic[i])
        # elif cosine_similarity(word_vec, pos_vec) < cosine_similarity(word_vec, neg_vec):
        #     print('neg: ', corpus.dic[i])
        # else:
        #     print('???: ', corpus.dic[i])
        #     raise ValueError

    # sort in descending order of similarity
    neg_ordered = collections.OrderedDict(
        sorted(neg_sim_dic.items(), key=lambda x: x[1], reverse=True))
    pos_ordered = collections.OrderedDict(
        sorted(pos_sim_dic.items(), key=lambda x: x[1], reverse=True))

    # show the top 50 words for each tag
    print('------- The words most similar to the NEGATIVE tag --------')
    for index, w_index in enumerate(neg_ordered):
        print(corpus.dic[w_index], ': ', neg_ordered[w_index][0, 0])
        if index == 49:
            break
    print('------- The words most similar to the POSITIVE tag --------')
    for index, w_index in enumerate(pos_ordered):
        print(corpus.dic[w_index], ': ', pos_ordered[w_index][0, 0])
        if index == 49:
            break
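# A self-contained sketch of the ranking above: score every row of an embedding
# matrix against a tag vector by cosine similarity and return the top-n row
# indices. Plain numpy stands in for sklearn's cosine_similarity; the data in
# the usage note is random (illustration only):
def _rank_by_cosine_sketch(embed_mat, tag_vec, top_n=50):
    import numpy as np
    norms = np.linalg.norm(embed_mat, axis=1) * np.linalg.norm(tag_vec)
    sims = embed_mat.dot(tag_vec) / np.maximum(norms, 1e-8)  # cosine per row
    order = np.argsort(sims)[::-1]                           # descending similarity
    return [(int(i), float(sims[i])) for i in order[:top_n]]

# e.g.:
#   mat = numpy.random.randn(1000, 64)              # 1000 word vectors
#   _rank_by_cosine_sketch(mat, mat[3], top_n=5)    # row 3 ranks itself first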
def interpreter(data_path, model_path):
    """
    Run this function to talk to the seq2seq model.
    Type "exit" to end the conversation.
    :param data_path: the path of the corpus the model was trained on
    :param model_path: the path of the trained model
    :return:
    """
    # call the dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    elif args.lang == 'ja':
        corpus = JaConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    else:
        print('You gave a wrong language argument. Check the language option.')
        raise ValueError
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild the seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=args.feature_num,
                    hidden_num=args.hidden_num, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # run the conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end the talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        # convert the input into a word list, both in reading order and reversed
        if args.lang == 'en':
            input_vocab = [unicodedata.normalize('NFKC', word.lower())
                           for word in word_tokenize(sentence)]
        elif args.lang == 'ja':
            input_vocab = [unicodedata.normalize('NFKC', word.lower())
                           for word in parse_ja_text(sentence)]
        input_vocab_rev = input_vocab[::-1]

        # convert words into IDs, skipping out-of-vocabulary words
        input_sentence = [corpus.dic.token2id[word] for word in input_vocab
                          if corpus.dic.token2id.get(word) is not None]
        input_sentence_rev = [corpus.dic.token2id[word] for word in input_vocab_rev
                              if corpus.dic.token2id.get(word) is not None]

        model.initialize(batch_size=1)  # initialize the cell state
        sentence = model.generate(input_sentence, input_sentence_rev,
                                  sentence_limit=len(input_sentence) + 30,
                                  word2id=corpus.dic.token2id, id2word=corpus.dic)
        print("-> ", sentence)
        print('')
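# A minimal sketch of the bidirectional input preparation in interpreter():
# the same utterance is mapped to IDs twice, once in reading order and once
# reversed, so both encoder directions see it (toy dictionary; illustration
# only, the helper is not part of the pipeline):
def _bidirectional_ids_sketch(words, token2id):
    forward = [token2id[w] for w in words if w in token2id]
    backward = [token2id[w] for w in words[::-1] if w in token2id]
    return forward, backward

# e.g. _bidirectional_ids_sketch(['hello', 'world'], {'hello': 1, 'world': 2})
# -> ([1, 2], [2, 1])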