def prepare_data(config):
    train_path = os.path.join(config.train_dir, "chitchat.train")
    data_path_list = [train_path + ".answer", train_path + ".query"]
    vocab_path = os.path.join(config.train_dir, "vocab%d.all" % config.vocab_size)
    data_utils.create_vocabulary(vocab_path, data_path_list, config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    # Optional pickle caching of the bucketed sets:
    # if os.path.isfile(config.dev_set) and os.path.isfile(config.train_set):
    #     dev_set_file = open(config.dev_set, "rb")
    #     dev_set = pickle.load(dev_set_file)
    #     dev_set_file.close()
    #
    #     train_set_file = open(config.train_set, "rb")
    #     train_set = pickle.load(train_set_file)
    #     train_set_file.close()
    # else:
    print("Prepare Chitchat data in %s" % config.train_dir)
    train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data(
        config.train_dir, vocab, config.vocab_size)

    print("Reading development and training data (limit: %d)."
          % config.max_train_data_size)
    dev_set = read_data(config, dev_query, dev_answer)
    train_set = read_data(config, train_query, train_answer)

    # dev_set_file = open(config.dev_set, "wb")
    # pickle.dump(dev_set, dev_set_file)
    # dev_set_file.close()
    #
    # train_set_file = open(config.train_set, "wb")
    # pickle.dump(train_set, train_set_file)
    # train_set_file.close()

    return vocab, rev_vocab, dev_set, train_set
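# A minimal sketch of how prepare_data might be invoked. The SimpleNamespace
# config and its field values are illustrative assumptions, not part of the
# original script; only the fields that prepare_data reads directly are shown,
# and read_data may consult additional fields (e.g. bucket definitions).
from types import SimpleNamespace

config = SimpleNamespace(train_dir="data/chitchat",  # hypothetical paths/sizes
                         vocab_size=40000,
                         max_train_data_size=0)
vocab, rev_vocab, dev_set, train_set = prepare_data(config)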
def create_load_vocab(arg, file_name, out_file_name, pad=True, unk=True,
                      sos_eos=False):
    """Creates and loads the vocab file for a given corpus.

    Args:
        arg: The output of the parser.
        file_name: The name of the file containing the corpus.
        out_file_name: The file into which the vocab should be written.
        pad: A boolean indicating whether the pad token should be included
            in the vocabulary.
        unk: A boolean indicating whether the unknown token should be
            included in the vocabulary.
        sos_eos: A boolean indicating whether the SOS and EOS tokens should
            be included in the vocabulary.

    Returns:
        A dictionary mapping each vocabulary token to its corresponding
        index, along with a list of all the vocabulary tokens.
    """
    full_path = os.path.join('./top_data', arg.train_data_path, file_name)
    output_path = os.path.join(arg.vocab_path, out_file_name)
    create_vocabulary(full_path, output_path, pad, unk, sos_eos)
    vocab = load_vocabulary(output_path)
    return vocab
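# A minimal usage sketch for create_load_vocab. The argparse flags and the
# corpus/vocab file names below are illustrative assumptions, not part of the
# original code; the function only needs arg.train_data_path and arg.vocab_path.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--train_data_path', default='atis')   # hypothetical corpus dir
parser.add_argument('--vocab_path', default='./vocab')     # hypothetical output dir
arg = parser.parse_args([])

# Builds ./vocab/words_vocab.txt from ./top_data/atis/train.txt, then loads it.
vocab = create_load_vocab(arg, 'train.txt', 'words_vocab.txt', sos_eos=True)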
def do_word2vec():
    my_len = 15000000
    data_utils.create_vocabulary('data/topic/topic_index.vocal',
                                 'data/topic/topic_index.txt', my_len)
    data_utils.data_to_token_ids('data/topic/topic_index.txt',
                                 'data/topic/topic_index.vec',
                                 'data/topic/topic_index.vocal')
    data_utils.create_vocabulary('data/topic/topic_group.vocal',
                                 'data/topic/topic_group.txt', my_len)
    data_utils.data_to_token_ids('data/topic/topic_group.txt',
                                 'data/topic/topic_group.vec',
                                 'data/topic/topic_group.vocal')
def sample():
    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)

    test_sentence = "It was the best movie I have ever seen."
    test_sentence = get_tokens(clean_str(test_sentence))
    test_sentence, seq_len = data_to_token_ids([test_sentence], vocab_dict)
    test_sentence = test_sentence[0]
    # Pad the test sentence to the length of the longest training sentence.
    test_sentence = test_sentence + ([PAD_ID] * (
        max(len(sentence) for sentence in X) - len(test_sentence)))
    test_sentence = np.array(test_sentence).reshape([1, -1])
    FLAGS.max_sequence_length = len(test_sentence[0])

    with tf.Session() as sess:
        model = create_model(sess, FLAGS)
        probability = model.step(sess, batch_X=test_sentence,
                                 batch_seq_lens=np.array(seq_len),
                                 forward_only=True, sampling=True)
        print(probability)
        print(np.argmax(probability))
def main(_):
    if not FLAGS.data_dir:
        raise ValueError("Must set --data_dir to data directory")

    vocab_path = data_utils.create_vocabulary(
        os.path.join(FLAGS.data_dir, 'train'), FLAGS.data_dir)
    train_data = data_utils.read_data(os.path.join(FLAGS.data_dir, 'train'),
                                      vocab_path)
    valid_data = data_utils.read_data(os.path.join(FLAGS.data_dir, 'dev'),
                                      vocab_path)
    test_data = valid_data  # No separate test split; reuse the dev set.

    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = PTBModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = PTBModel(is_training=False, config=config)
            mtest = PTBModel(is_training=False, config=eval_config)

        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            m.saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            train_perplexity = run_epoch(session, m, train_data, m.train_op,
                                         verbose=True)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
            valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
            print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

        test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
        print("Test Perplexity: %.3f" % test_perplexity)
def main(_):
    train_path = FLAGS.train_path
    ids_path = FLAGS.ids_path
    vocab_path = FLAGS.vocab_path
    vocab_size = FLAGS.vocab_size
    tfrecords_path = FLAGS.tfrecords_path
    train_percent = FLAGS.train_percent
    val_percent = FLAGS.val_percent

    words_vocab = data_utils.create_vocabulary(
        train_path, os.path.join(vocab_path, 'words_vocab.txt'), vocab_size)
    datasets = data_utils.prepare_datasets(train_path, ids_path, vocab_path,
                                           words_vocab, train_percent,
                                           val_percent)
    train_word_ids_list, train_label_ids_list, \
        validation_word_ids_list, validation_label_ids_list, \
        test_word_ids_list, test_label_ids_list = datasets

    create_record(train_word_ids_list, train_label_ids_list,
                  os.path.join(tfrecords_path, 'train.tfrecords'))
    create_record(validation_word_ids_list, validation_label_ids_list,
                  os.path.join(tfrecords_path, 'validate.tfrecords'))
    create_record(test_word_ids_list, test_label_ids_list,
                  os.path.join(tfrecords_path, 'test.tfrecords'))
    print_all(os.path.join(tfrecords_path, 'test.tfrecords'))
def get_vocabulary(in_dataset, in_result_folder, in_config):
    MAX_VOCABULARY_SIZE = in_config['vocabulary_size']
    vocabulary_path = path.join(in_result_folder, 'vocab.txt')
    if not path.exists(vocabulary_path):
        logger.info('Creating vocabulary')
        vocabulary_list = create_vocabulary(
            in_dataset.values.flatten(),
            MAX_VOCABULARY_SIZE
        )
        vocabulary = {
            token: token_index
            for token_index, token in enumerate(vocabulary_list)
        }
        with getwriter('utf-8')(open(vocabulary_path, 'w')) as vocab_out:
            for word in vocabulary_list:
                vocab_out.write(word + '\n')
    else:
        with getreader('utf-8')(open(vocabulary_path)) as vocab_in:
            vocabulary = {}
            for line, line_index in zip(vocab_in, count()):
                vocabulary[line.strip()] = line_index
        logger.info(
            'Skipping vocabulary creation step: {}'.format(vocabulary_path))
    return vocabulary
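# A minimal usage sketch for get_vocabulary. The DataFrame contents and the
# result folder are illustrative assumptions, not part of the original code;
# the result folder must already exist, since the function writes vocab.txt
# into it on the first call and loads the cached file on subsequent calls.
import pandas as pd

dataset = pd.DataFrame({'query': ['hello world', 'how are you']})
vocabulary = get_vocabulary(dataset, './results', {'vocabulary_size': 20000})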
def test_decoder(config):
    train_path = os.path.join(config.train_dir, "chitchat.train")
    data_path_list = [train_path + ".answer", train_path + ".query"]
    vocab_path = os.path.join(config.train_dir, "vocab%d.all" % config.vocab_size)
    data_utils.create_vocabulary(vocab_path, data_path_list, config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    with tf.Session() as sess:
        if config.name_model in [gst_config.name_model, gcc_config.name_model,
                                 gbk_config.name_model]:
            model = create_st_model(sess, config, forward_only=True,
                                    name_scope=config.name_model)
        elif config.name_model in [grl_config.name_model,
                                   pre_grl_config.name_model]:
            model = create_rl_model(sess, config, forward_only=True,
                                    name_scope=config.name_model)

        model.batch_size = 1

        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab)
            print("token_id: ", token_ids)
            # Pick the smallest bucket the sentence fits in; for-else fires
            # when no bucket is large enough.
            bucket_id = len(config.buckets) - 1
            for i, bucket in enumerate(config.buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                print("Sentence truncated: %s" % sentence)

            encoder_inputs, decoder_inputs, target_weights, _, _ = model.get_batch(
                {bucket_id: [(token_ids, [1])]}, bucket_id)

            # st_model step
            if config.name_model in [gst_config.name_model, gcc_config.name_model,
                                     gbk_config.name_model]:
                output_logits, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                              target_weights, bucket_id, True)
                outputs = [int(np.argmax(logit, axis=1))
                           for logit in output_logits]
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                print(" ".join([str(rev_vocab[output]) for output in outputs]))

            # beam_search step
            elif config.name_model in [grl_config.name_model,
                                       pre_grl_config.name_model]:
                _, _, output_logits = model.step(sess, encoder_inputs,
                                                 decoder_inputs, target_weights,
                                                 reward=1, bucket_id=bucket_id,
                                                 forward_only=True)
                for i, output in enumerate(output_logits):
                    print("index: %d, answer tokens: %s" % (i, str(output)))
                    if data_utils.EOS_ID in output:
                        output = output[:output.index(data_utils.EOS_ID)]
                    print(" ".join([str(rev_vocab[out]) for out in output]))

            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import re

from six.moves import urllib
from tensorflow.python.platform import gfile
import tensorflow as tf

import data_utils

# Smoke-test the custom tokenizer on a few shell-like commands.
print(data_utils.custom_tokenizer(tf.compat.as_bytes("go 8 steps up")))
print(data_utils.custom_tokenizer(tf.compat.as_bytes("find webserver.js please")))
print(data_utils.custom_tokenizer(tf.compat.as_bytes("cd ../../../")))

data_utils.create_vocabulary('dummy/dummy_vocab.txt', 'data/data.txt', 400000)
data_utils.initialize_vocabulary('dummy/dummy_vocab.txt')
def main():
    opt = Options()
    vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, \
        vocabulary_index2label = create_vocabulary(
            "data/atec_nlp_sim_train2.csv", opt.vocab_size,
            name_scope=opt.name_scope, tokenize_style=opt.tokenize_style)
    vocab_size = len(vocabulary_word2index)
    print("vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)

    with open("./cache_SWEM_1/train_valid_test.pik", "rb") as f:
        train, valid, test, true_label_percent = pickle.load(f)
    train_q, train_a, _, train_lab = train
    print("train_nums:", len(train_q))
    val_q, val_a, _, val_lab = valid
    test_q, test_a, _, test_lab = test
    wordtoix = vocabulary_word2index
    ixtoword = vocabulary_index2word
    opt.n_words = len(ixtoword)

    # loadpath = "./data/snli.p"
    # x = cPickle.load(open(loadpath, "rb"))
    #
    # train, val, test = x[0], x[1], x[2]
    # wordtoix, ixtoword = x[4], x[5]
    #
    # train_q, train_a, train_lab = train[0], train[1], train[2]
    # val_q, val_a, val_lab = val[0], val[1], val[2]
    # test_q, test_a, test_lab = test[0], test[1], test[2]
    #
    # train_lab = np.array(train_lab, dtype='float32')
    # val_lab = np.array(val_lab, dtype='float32')
    # test_lab = np.array(test_lab, dtype='float32')
    #
    # opt = Options()
    # opt.n_words = len(ixtoword)
    #
    # del x

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    # When only part of the labeled data is used: with part_data set to True,
    # a random subset of the training set is kept, and `portion` is the
    # fraction retained (presumably for quick model-testing runs).
    if opt.part_data:
        np.random.seed(123)
        train_ind = np.random.choice(len(train_q),
                                     int(len(train_q) * opt.portion),
                                     replace=False)
        train_q = [train_q[t] for t in train_ind]
        train_a = [train_a[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    # Check that the vocabulary is aligned with the precomputed embedding file.
    try:
        params = np.load('./data/snli_emb.p')
        if params[0].shape == (opt.n_words, opt.embed_size):
            print('Use saved embedding.')
            # pdb.set_trace()
            opt.W_emb = np.array(params[0], dtype='float32')
        else:
            print('Emb Dimension mismatch: param_g.npz:' + str(params[0].shape)
                  + ' opt: ' + str((opt.n_words, opt.embed_size)))
            opt.fix_emb = False
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:0'):
        # The training data comes as sentence pairs, so the x placeholders
        # are defined in pairs as well.
        x_1_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_2_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_mask_1_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.maxlen])
        x_mask_2_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.maxlen])
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.category])
        keep_prob = tf.placeholder(tf.float32)
        # auto_encoder wraps the model definition and every tensor used while
        # the model runs -- a useful engineering pattern. It returns the
        # important tensors, which are later passed to sess.run.
        accuracy_, loss_, train_op_, W_emb, logits_ = auto_encoder(
            x_1_, x_2_, x_mask_1_, x_mask_2_, y_, keep_prob, opt)
        merged = tf.summary.merge_all()

    def do_eval(sess, train_q, train_a, train_lab):
        train_correct = 0.0
        # number_examples = len(train_q)
        # print("valid examples:", number_examples)
        eval_loss, eval_accc, eval_counter = 0.0, 0.0, 0
        eval_true_positive, eval_false_positive, eval_true_negative, \
            eval_false_negative = 0, 0, 0, 0
        # batch_size = 1
        weights_label = {}  # weights_label[label_index] = (number, correct)
        weights = np.ones((opt.batch_size))
        kf_train = get_minibatches_idx(len(train_q), opt.batch_size,
                                       shuffle=True)
        for _, train_index in kf_train:
            train_sents_1 = [train_q[t] for t in train_index]
            train_sents_2 = [train_a[t] for t in train_index]
            train_labels = [train_lab[t] for t in train_index]
            train_labels_array = np.array(train_labels)
            # print("train_labels", train_labels.shape)
            # train_labels = train_labels.reshape((len(train_labels), opt.category))
            train_labels = np.eye(opt.category)[train_labels_array]
            x_train_batch_1, x_train_mask_1 = prepare_data_for_emb(
                train_sents_1, opt)
            x_train_batch_2, x_train_mask_2 = prepare_data_for_emb(
                train_sents_2, opt)

            curr_eval_loss, curr_accc, logits = sess.run(
                [loss_, accuracy_, logits_],
                feed_dict={
                    x_1_: x_train_batch_1,
                    x_2_: x_train_batch_2,
                    x_mask_1_: x_train_mask_1,
                    x_mask_2_: x_train_mask_2,
                    y_: train_labels,
                    opt.weights_label: weights,
                    keep_prob: 1.0
                })
            # logits: [batch_size, label_size] --> logits[0]: [label_size]
            true_positive, false_positive, true_negative, false_negative = \
                compute_confuse_matrix(logits, train_labels)
            # write_predict_error_to_file(start, file_object, logits[0],
            #     evalY[start:end][0], vocabulary_index2word,
            #     evalX1[start:end], evalX2[start:end])
            # Note how loss and accuracy are computed: accumulate per batch,
            # then normalize by the batch count at the end.
            eval_loss, eval_accc, eval_counter = (eval_loss + curr_eval_loss,
                                                  eval_accc + curr_accc,
                                                  eval_counter + 1)
            weights_label = compute_labels_weights(weights_label, logits,
                                                   train_labels_array)
            eval_true_positive, eval_false_positive = (
                eval_true_positive + true_positive,
                eval_false_positive + false_positive)
            eval_true_negative, eval_false_negative = (
                eval_true_negative + true_negative,
                eval_false_negative + false_negative)

        print("true_positive:", eval_true_positive,
              ";false_positive:", eval_false_positive,
              ";true_negative:", eval_true_negative,
              ";false_negative:", eval_false_negative)
        p = float(eval_true_positive) / float(eval_true_positive +
                                              eval_false_positive)
        r = float(eval_true_positive) / float(eval_true_positive +
                                              eval_false_negative)
        f1_score = (2 * p * r) / (p + r)
        print("eval_counter:", eval_counter, ";eval_acc:", eval_accc)
        return (eval_loss / float(eval_counter),
                eval_accc / float(eval_counter), f1_score, p, r, weights_label)

    max_val_accuracy = 0.
    max_test_accuracy = 0.

    weights_dict = init_weights_dict(vocabulary_label2index)  # init weights dict

    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:  # Restore previously saved parameters.
            try:
                # pdb.set_trace()
                t_vars = tf.trainable_variables()
                # print([var.name[:-2] for var in t_vars])
                save_keys = tensors_key_in_file(opt.save_path)
                # pdb.set_trace()
                # print(save_keys.keys())
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                # pdb.set_trace()
                # Only restore variables with the correct shape.
                ss_right_shape = set(
                    [s for s in ss if cc[s].get_shape() == save_keys[s[:-2]]])
                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)
                print("Loading variables from '%s'." % opt.save_path)
                print("Loaded variables:" + str(ss))
            except:
                print("No saved session found, using random initialization")
                sess.run(tf.global_variables_initializer())

        try:
            best_acc = 0
            best_f1_score = 0
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                loss, acc, uidx = 0.0, 0.0, 0.0
                # Build random minibatches.
                kf = get_minibatches_idx(len(train_q), opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    # Use the indices to fetch the corresponding examples
                    # from the full dataset.
                    sents_1 = [train_q[t] for t in train_index]
                    sents_2 = [train_a[t] for t in train_index]
                    x_labels = [train_lab[t] for t in train_index]
                    x_labels_array = np.array(x_labels)
                    # print("x_labels:", x_labels.shape)
                    # A reshape of shape (len(x_labels),) into
                    # (len(x_labels), opt.category) would be wrong, so one-hot
                    # encode with np.eye instead:
                    # x_labels = x_labels.reshape((len(x_labels), opt.category))
                    x_labels = np.eye(opt.category)[x_labels_array]
                    # prepare_data_for_emb replaces each word in sents with its
                    # index, so embeddings can then be looked up by index.
                    x_batch_1, x_batch_mask_1 = prepare_data_for_emb(sents_1, opt)
                    x_batch_2, x_batch_mask_2 = prepare_data_for_emb(sents_2, opt)
                    weights = get_weights_for_current_batch(
                        list(x_labels_array), weights_dict)

                    _, curr_loss, curr_accuracy = sess.run(
                        [train_op_, loss_, accuracy_],
                        feed_dict={
                            x_1_: x_batch_1,
                            x_2_: x_batch_2,
                            x_mask_1_: x_batch_mask_1,
                            x_mask_2_: x_batch_mask_2,
                            y_: x_labels,
                            opt.weights_label: weights,
                            keep_prob: opt.dropout_ratio
                        })
                    loss, acc = loss + curr_loss, acc + curr_accuracy
                    if uidx % 100 == 0:
                        print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tAcc:%.3f\t"
                              % (epoch, uidx, loss / float(uidx),
                                 acc / float(uidx)))

                if epoch % 1 == 0:
                    # do_eval arguments may need revising.
                    eval_loss, eval_accc, f1_scoree, precision, recall, \
                        weights_label = do_eval(sess, train_q, train_a,
                                                train_lab)
                    weights_dict = get_weights_label_as_standard_dict(
                        weights_label)
                    # print("label accuracy (used for label weight):==========>>>>", weights_dict)
                    print("[Validation] Epoch %d\t Loss:%.3f\tAcc %.3f\t"
                          "F1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                          % (epoch, eval_loss, eval_accc, f1_scoree,
                             precision, recall))

                    # Save the model to a checkpoint if it improved.
                    if eval_accc > best_acc and f1_scoree > best_f1_score:
                        save_path = opt.ckpt_dir + "/model.ckpt"
                        print("going to save model. eval_f1_score:", f1_scoree,
                              ";previous best f1 score:", best_f1_score,
                              ";eval_acc", str(eval_accc),
                              ";previous best_acc:", str(best_acc))
                        saver.save(sess, save_path, global_step=epoch)
                        best_acc = eval_accc
                        best_f1_score = f1_scoree

                        test_loss, acc_t, f1_score_t, precision, recall, \
                            weights_label = do_eval(sess, test_q, test_a,
                                                    test_lab)
                        print("Test Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\t"
                              "Precision:%.3f\tRecall:%.3f:"
                              % (test_loss, acc_t, f1_score_t, precision,
                                 recall))

                # Every valid_freq minibatches, compute accuracy on the
                # training, validation, and test sets, and update the best
                # test accuracy:
                # if uidx % opt.valid_freq == 0:
                #     train_correct = 0.0
                #     kf_train = get_minibatches_idx(len(train_q), opt.batch_size, shuffle=True)
                #     for _, train_index in kf_train:
                #         train_sents_1 = [train_q[t] for t in train_index]
                #         train_sents_2 = [train_a[t] for t in train_index]
                #         train_labels = [train_lab[t] for t in train_index]
                #         train_labels = np.array(train_labels)
                #         # print("train_labels", train_labels.shape)
                #         # train_labels = train_labels.reshape((len(train_labels), opt.category))
                #         train_labels = np.eye(opt.category)[train_labels]
                #         x_train_batch_1, x_train_mask_1 = prepare_data_for_emb(train_sents_1, opt)
                #         x_train_batch_2, x_train_mask_2 = prepare_data_for_emb(train_sents_2, opt)
                #
                #         train_accuracy = sess.run(accuracy_,
                #             feed_dict={x_1_: x_train_batch_1, x_2_: x_train_batch_2,
                #                        x_mask_1_: x_train_mask_1, x_mask_2_: x_train_mask_2,
                #                        y_: train_labels, keep_prob: 1.0})
                #
                #         train_correct += train_accuracy * len(train_index)
                #
                #     train_accuracy = train_correct / len(train_q)
                #
                #     # print("Iteration %d: Training loss %f, dis loss %f, rec loss %f"
                #     #       % (uidx, loss, dis_loss, rec_loss))
                #     print("Train accuracy %f " % train_accuracy)
                #
                #     val_correct = 0.0
                #     is_train = True
                #     kf_val = get_minibatches_idx(len(val_q), opt.batch_size, shuffle=True)
                #     for _, val_index in kf_val:
                #         val_sents_1 = [val_q[t] for t in val_index]
                #         val_sents_2 = [val_a[t] for t in val_index]
                #         val_labels = [val_lab[t] for t in val_index]
                #         val_labels = np.array(val_labels)
                #         # val_labels = val_labels.reshape((len(val_labels), opt.category))
                #         val_labels = np.eye(opt.category)[val_labels]
                #         x_val_batch_1, x_val_mask_1 = prepare_data_for_emb(val_sents_1, opt)
                #         x_val_batch_2, x_val_mask_2 = prepare_data_for_emb(val_sents_2, opt)
                #
                #         val_accuracy = sess.run(accuracy_,
                #             feed_dict={x_1_: x_val_batch_1, x_2_: x_val_batch_2,
                #                        x_mask_1_: x_val_mask_1, x_mask_2_: x_val_mask_2,
                #                        y_: val_labels, keep_prob: 1.0})
                #
                #         val_correct += val_accuracy * len(val_index)
                #
                #     val_accuracy = val_correct / len(val_q)
                #     print("Validation accuracy %f " % val_accuracy)
                #
                #     if val_accuracy > max_val_accuracy:
                #         max_val_accuracy = val_accuracy
                #
                #         test_correct = 0.0
                #         kf_test = get_minibatches_idx(len(test_q), opt.batch_size, shuffle=True)
                #         for _, test_index in kf_test:
                #             test_sents_1 = [test_q[t] for t in test_index]
                #             test_sents_2 = [test_a[t] for t in test_index]
                #             test_labels = [test_lab[t] for t in test_index]
                #             test_labels = np.array(test_labels)
                #             # test_labels = test_labels.reshape((len(test_labels), opt.category))
                #             test_labels = np.eye(opt.category)[test_labels]
                #             x_test_batch_1, x_test_mask_1 = prepare_data_for_emb(test_sents_1, opt)
                #             x_test_batch_2, x_test_mask_2 = prepare_data_for_emb(test_sents_2, opt)
                #
                #             test_accuracy = sess.run(accuracy_,
                #                 feed_dict={x_1_: x_test_batch_1, x_2_: x_test_batch_2,
                #                            x_mask_1_: x_test_mask_1, x_mask_2_: x_test_mask_2,
                #                            y_: test_labels, keep_prob: 1.0})
                #
                #             test_correct += test_accuracy * len(test_index)
                #
                #         test_accuracy = test_correct / len(test_q)
                #         print("Test accuracy %f " % test_accuracy)
                #         max_test_accuracy = test_accuracy
                #
                #     print("Epoch %d: Max Test accuracy %f" % (epoch, max_test_accuracy))
                #
                # print("Max Test accuracy %f " % max_test_accuracy)
        except KeyboardInterrupt:
            print('Training interrupted')
            print("Max Test accuracy %f " % max_test_accuracy)
def train():
    data_path = FLAGS.data_path
    dev_data = FLAGS.dev_data
    vocab_path = FLAGS.vocab_path
    # Beam search is off during training and used at inference time.
    beam_search = False
    beam_size = 10
    attention = FLAGS.attention
    normalize_digits = True
    create_vocabulary(vocab_path, data_path, FLAGS.en_vocab_size)

    with tf.Session() as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, False, beam_search=beam_search,
                             beam_size=beam_size, attention=attention)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)."
              % FLAGS.max_train_data_size)
        train_set = read_chat_data(data_path, vocab_path,
                                   FLAGS.max_train_data_size)
        dev_set = read_chat_data(dev_data, vocab_path,
                                 FLAGS.max_train_data_size)
        print("Finished reading development and training data")
        train_bucket_sizes = [len(train_set[b]) for b in range(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll
        # use to select a bucket. The length of [scale[i], scale[i+1]] is
        # proportional to the size of the i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while True:
            # Choose a bucket according to the data distribution. We pick a
            # random number in [0, 1] and use the corresponding interval in
            # train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False,
                                         beam_search)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save a checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print("global step %d learning rate %.4f step-time %.2f "
                      "perplexity %.2f"
                      % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
                # Decrease learning rate if no improvement was seen over the
                # last 3 checkpoints.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # Save a checkpoint and zero the timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "chat_bot.ckpt")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0
                # Run evals on the development set and print their perplexity.
                for bucket_id in range(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        print("  eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        dev_set, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs,
                                                 decoder_inputs, target_weights,
                                                 bucket_id, True, beam_search)
                    eval_ppx = math.exp(eval_loss) if eval_loss < 300 \
                        else float('inf')
                    print("  eval: bucket %d perplexity %.2f"
                          % (bucket_id, eval_ppx))
                sys.stdout.flush()
def train():
    """Train a query->answer chitchat model on Weibo data."""
    # with tf.device("/gpu:0"):
    # Prepare the data.
    train_path = os.path.join(FLAGS.data_dir, "weibo")
    fixed_path = os.path.join(FLAGS.data_dir, "fixed")
    weibo_path = os.path.join(FLAGS.data_dir, "wb")
    qa_path = os.path.join(FLAGS.data_dir, "qa")

    voc_file_path = [
        train_path + ".answer", fixed_path + ".answer",
        weibo_path + ".answer", qa_path + ".answer",
        train_path + ".query", fixed_path + ".query",
        weibo_path + ".query", qa_path + ".query"
    ]

    vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.txt" % FLAGS.vocab_size)
    data_utils.create_vocabulary(vocab_path, voc_file_path, FLAGS.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
    print(len(vocab))

    print("Preparing Chitchat data in %s" % FLAGS.data_dir)
    train_query, train_answer, dev_query, dev_answer = \
        data_utils.prepare_chitchat_data(FLAGS.data_dir, vocab,
                                         FLAGS.vocab_size)

    print("Preparing Fixed data in %s" % FLAGS.fixed_set_path)
    fixed_path = os.path.join(FLAGS.fixed_set_path, "wb")
    fixed_query, fixed_answer = data_utils.prepare_defined_data(
        fixed_path, vocab, FLAGS.vocab_size)

    print("Preparing Weibo data in %s" % FLAGS.weibo_set_path)
    weibo_path = os.path.join(FLAGS.weibo_set_path, "wb")
    weibo_query, weibo_answer = data_utils.prepare_defined_data(
        weibo_path, vocab, FLAGS.vocab_size)

    print("Preparing QA data in %s" % FLAGS.qa_set_path)
    qa_path = os.path.join(FLAGS.qa_set_path, "wb")
    qa_query, qa_answer = data_utils.prepare_defined_data(
        qa_path, vocab, FLAGS.vocab_size)

    dummy_path = os.path.join(FLAGS.data_dir, "dummy")
    dummy_set = data_utils.get_dummy_set(dummy_path, vocab, FLAGS.vocab_size)
    print("Get Dummy Set : ", dummy_set)

    if FLAGS.reinforce_learning and not FLAGS.dual_learning:
        import data0_utils as du
        config = {}
        config['fill_word'] = du._PAD_
        config['embedding'] = du.embedding
        config['fold'] = 1
        config['model_file'] = "model_mp"
        config['log_file'] = "dis.log"
        config['train_iters'] = 50000
        config['model_tag'] = "mxnet"
        config['batch_size'] = 64
        config['data1_maxlen'] = 46
        config['data2_maxlen'] = 74
        config['data1_psize'] = 5
        config['data2_psize'] = 5
        from importlib import import_module
        mo = import_module(config['model_file'])
        disModel = mo.Model(config)
        disSess = tf.Session()
        disModel.init_step(disSess)
        if sys.argv[1] != "no":
            disModel.saver.restore(disSess, sys.argv[1])

    outputFile = open("RL_output.txt", "w")
    logFile = open("log.txt", "w")

    tfconfig = tf.ConfigProto()
    tfconfig.gpu_options.allow_growth = True
    with tf.Session(config=tfconfig) as sess:
        # with tf.device("/gpu:1"):
        # Create the model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, dummy_set, False, False)
        if FLAGS.dual_learning:
            du_model = create_model(sess, dummy_set, False, True)
        # sess.run(model.learning_rate_set_op)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)."
              % FLAGS.max_train_data_size)

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []

        en_dict_cover = {}
        fr_dict_cover = {}
        if model.global_step.eval() > FLAGS.steps_per_checkpoint:
            try:
                with open(FLAGS.en_cover_dict_path, "rb") as ef:
                    en_dict_cover = pickle.load(ef)
                # for line in ef.readlines():
                #     line = line.strip()
                #     key, value = line.split(",")
                #     en_dict_cover[int(key)] = int(value)
            except Exception:
                print("query_cover_file not found")
            try:
                with open(FLAGS.ff_cover_dict_path, "rb") as ff:
                    fr_dict_cover = pickle.load(ff)
                # for line in ff.readlines():
                #     line = line.strip()
                #     key, value = line.split(",")
                #     fr_dict_cover[int(key)] = int(value)
            except Exception:
                print("answer_cover_file not found")

        step_loss_summary = tf.Summary()
        # merge = tf.merge_all_summaries()
        writer = tf.summary.FileWriter("./logs/", sess.graph)

        while True:
            # Choose a bucket according to the data distribution. We pick a
            # random number in [0, 1] and use the corresponding interval in
            # train_buckets_scale.
            for ind in range(30):
                dev_set = read_data(dev_query, dev_answer, 0, 3000000)
                train_set = read_data(train_query, train_answer,
                                      ind * 100000, (ind + 1) * 100000)
                fixed_set = read_data(fixed_query, fixed_answer,
                                      FLAGS.max_train_data_size)
                weibo_set = read_data(weibo_query, weibo_answer,
                                      FLAGS.max_train_data_size)
                qa_set = read_data(qa_query, qa_answer,
                                   FLAGS.max_train_data_size)

                train_bucket_sizes = [
                    len(train_set[b]) for b in xrange(len(_buckets))
                ]
                train_total_size = float(sum(train_bucket_sizes))
                train_buckets_scale = [
                    sum(train_bucket_sizes[:i + 1]) / train_total_size
                    for i in xrange(len(train_bucket_sizes))
                ]

                for kk in range(500):
                    random_number_01 = np.random.random_sample()
                    bucket_id = min([
                        i for i in xrange(len(train_buckets_scale))
                        if train_buckets_scale[i] > random_number_01
                    ])

                    # Get a batch and make a step.
                    start_time = time.time()
                    (encoder_inputs, decoder_inputs, target_weights,
                     batch_source_encoder, batch_source_decoder) = model.get_batch(
                        train_set, bucket_id, 0, fixed_set, weibo_set, qa_set)
                    (inv_encoder_inputs, inv_decoder_inputs, inv_target_weights,
                     inv_batch_source_encoder, inv_batch_source_decoder) = model.inverse(
                        batch_source_encoder, batch_source_decoder, bucket_id)

                    if FLAGS.reinforce_learning:
                        if FLAGS.dual_learning:
                            _, step_loss1, _ = model.step_dual(
                                sess, _buckets, encoder_inputs, decoder_inputs,
                                target_weights, batch_source_encoder,
                                batch_source_decoder, bucket_id, du_model,
                                rev_vocab=rev_vocab)
                            _, step_loss2, _ = du_model.step_dual(
                                sess, _buckets, inv_encoder_inputs,
                                inv_decoder_inputs, inv_target_weights,
                                inv_batch_source_encoder,
                                inv_batch_source_decoder, bucket_id, model,
                                rev_vocab=rev_vocab)
                            step_loss = []
                            for ii in range(len(step_loss1)):
                                step_loss.append(step_loss1[ii] + step_loss2[ii])
                        else:
                            _, step_loss, _ = model.step_rl(
                                sess, _buckets, encoder_inputs, decoder_inputs,
                                target_weights, batch_source_encoder,
                                batch_source_decoder, bucket_id,
                                rev_vocab=rev_vocab, disSession=disSess,
                                disModel=disModel)
                    else:
                        _, step_loss, _ = model.step(sess, encoder_inputs,
                                                     decoder_inputs,
                                                     target_weights, bucket_id,
                                                     forward_only=False,
                                                     force_dec_input=True)

                    lossmean = 0.
                    for ii in step_loss:
                        lossmean = lossmean + ii
                    lossmean = lossmean / len(step_loss)
                    loss += lossmean / FLAGS.steps_per_checkpoint
                    step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
                    current_step += 1

                    # Track how often each vocabulary id has been seen.
                    query_size, answer_size = _buckets[bucket_id]
                    for batch_index in xrange(FLAGS.batch_size):
                        for query_index in xrange(query_size):
                            query_word = encoder_inputs[query_index][batch_index]
                            if query_word in en_dict_cover:
                                en_dict_cover[query_word] += 1
                            else:
                                en_dict_cover[query_word] = 0
                        for answer_index in xrange(answer_size):
                            answer_word = decoder_inputs[answer_index][batch_index]
                            if answer_word in fr_dict_cover:
                                fr_dict_cover[answer_word] += 1
                            else:
                                fr_dict_cover[answer_word] = 0

                    # Once in a while, we save a checkpoint, print statistics,
                    # and run evals.
                    if current_step % FLAGS.steps_per_checkpoint == 0:
                        outputFile = open(
                            "OpenSubData/RL_" + str(model.global_step.eval())
                            + ".txt", "w")
                        bucket_value = step_loss_summary.value.add()
                        bucket_value.tag = "loss"
                        bucket_value.simple_value = float(loss)
                        writer.add_summary(step_loss_summary, current_step)

                        print("query_dict_cover_num: %s"
                              % (str(en_dict_cover.__len__())))
                        print("answer_dict_cover_num: %s"
                              % (str(fr_dict_cover.__len__())))

                        ef = open(FLAGS.en_cover_dict_path, "wb")
                        pickle.dump(en_dict_cover, ef)
                        ff = open(FLAGS.ff_cover_dict_path, "wb")
                        pickle.dump(fr_dict_cover, ff)

                        num = 0
                        pick = 0.
                        mmm = 1
                        eval_loss = 0
                        dictt = {}
                        dictt_b = {}
                        for idd in range(2):
                            bucket_id = idd + 2
                            batch_num = 1 + int(
                                len(dev_set[bucket_id]) / FLAGS.batch_size)
                            for mm in range(batch_num):
                                (encoder_inputs, decoder_inputs, target_weights,
                                 batch_source_encoder, batch_source_decoder) = \
                                    model.get_batch_dev(
                                        dev_set, bucket_id,
                                        mm * FLAGS.batch_size, fixed_set,
                                        weibo_set, qa_set)
                                _, eval_loss_per, output_logits = model.step(
                                    sess, encoder_inputs, decoder_inputs,
                                    target_weights, bucket_id,
                                    forward_only=True, force_dec_input=False)
                                # _, eval_loss_per, _ = model.step(
                                #     sess, encoder_inputs, decoder_inputs,
                                #     target_weights, bucket_id,
                                #     forward_only=False, force_dec_input=True)
                                eval_loss += np.mean(eval_loss_per)
                                resp_tokens = model.remove_type(
                                    output_logits, model.buckets[bucket_id],
                                    type=1)
                                # prob = model.calprob(
                                #     sess, _buckets, encoder_inputs,
                                #     decoder_inputs, target_weights,
                                #     batch_source_encoder, batch_source_decoder,
                                #     bucket_id, rev_vocab=rev_vocab)
                                resp_c = model.ids2tokens(resp_tokens, rev_vocab)
                                resp_b = model.ids2tokens(batch_source_decoder,
                                                          rev_vocab)
                                resp_a = model.ids2tokens(batch_source_encoder,
                                                          rev_vocab)
                                for ii in range(len(resp_a)):
                                    aa = ""
                                    for ww in resp_a[ii]:
                                        aa = aa + " " + ww
                                    bb = ""
                                    for ww in resp_b[ii]:
                                        bb = bb + " " + ww
                                    cc = ""
                                    pre = ""
                                    # Collect unigram/bigram counts for the
                                    # distinct-1/distinct-2 metrics.
                                    for ww in resp_c[ii]:
                                        cc = cc + " " + ww
                                        if ww not in dictt:
                                            dictt[ww] = 0
                                        if pre + ww not in dictt_b:
                                            dictt_b[pre + ww] = 0
                                        dictt[ww] += 1
                                        dictt_b[pre + ww] += 1
                                        pre = ww
                                    # print("Q:", aa)
                                    # print("A1:", bb)
                                    # print("A2:", cc)
                                    # print("\n")
                                    outputFile.write("%s\n%s\n%s \n\n"
                                                     % (aa, bb, cc))
                                    outputFile.flush()
                                    BLEUscore = nltk.translate.bleu_score.sentence_bleu(
                                        [resp_c[ii]], resp_b[ii])
                                    print(BLEUscore)
                                    # eval_loss += BLEUscore
                                mmm += 1
                                # dummy = model.caldummy(
                                #     sess, _buckets, encoder_inputs,
                                #     decoder_inputs, target_weights,
                                #     batch_source_encoder, batch_source_decoder,
                                #     bucket_id, rev_vocab=rev_vocab)
                                # print(dummy)
                                # eval_loss += dummy
                        eval_loss = eval_loss / mmm

                        # Print statistics for the previous epoch.
                        perplexity = math.exp(loss) if loss < 300 \
                            else float('inf')
                        print("global step %d learning rate %.4f step-time %.2f "
                              "loss %.2f"
                              % (model.global_step.eval(),
                                 model.learning_rate.eval(), step_time, loss))

                        # Decrease learning rate if no improvement was seen
                        # over the last 3 checkpoints.
                        if len(previous_losses) > 2 and loss > max(
                                previous_losses[-3:]):
                            sess.run(model.learning_rate_decay_op)
                            sess.run(du_model.learning_rate_decay_op)
                        previous_losses.append(loss)

                        # Save checkpoints and zero the timer and loss.
                        checkpoint_path = os.path.join(FLAGS.train_dir,
                                                       "weibo.model")
                        model.saver.save(sess, checkpoint_path,
                                         global_step=model.global_step)
                        checkpoint_path2 = os.path.join(FLAGS.train_dir2,
                                                        "weibo.du_model")
                        du_model.saver.save(sess, checkpoint_path2,
                                            global_step=model.global_step)

                        eval_ppx = math.exp(eval_loss) if eval_loss < 300 \
                            else float('inf')
                        summ = [dictt[w] for w in dictt]
                        summ = 1.0 * sum(summ)
                        print("  eval: %.5f bucket %d distinct-1 %.5f "
                              "distinct-2 %.5f "
                              % (eval_loss, bucket_id, len(dictt) / summ,
                                 len(dictt_b) / summ))
                        logFile.write("%.2f %.2f\n" % (loss, eval_loss))
                        logFile.flush()
                        step_time, loss = 0.0, 0.0

                        # Run evals on the development set and print their perplexity.
                        # for bucket_id in xrange(len(_buckets)):
                        #     encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        #         dev_set, bucket_id)
                        #     _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                        #                                  target_weights, bucket_id, True)
                        #     eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                        #     print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                        sys.stdout.flush()
def train():
    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)
    train_X, train_y, train_seq_lens, valid_X, valid_y, valid_seq_lens = \
        split_data(X, y, seq_lens)
    FLAGS.max_sequence_length = len(train_X[0])

    with tf.Session() as sess:
        # Load an old model or create a new one.
        model = create_model(sess, FLAGS)

        # Training results.
        for epoch_num, epoch in enumerate(
                generate_epoch(train_X, train_y, train_seq_lens,
                               FLAGS.num_epochs, FLAGS.batch_size)):
            print("EPOCH:", epoch_num)

            # Decay the learning rate each epoch.
            sess.run(tf.assign(
                model.lr,
                FLAGS.learning_rate *
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            train_loss = []
            train_accuracy = []
            for batch_num, (batch_X, batch_y, batch_seq_lens) in enumerate(epoch):
                _, loss, accuracy = model.step(
                    sess, batch_X, batch_seq_lens, batch_y,
                    dropout_keep_prob=FLAGS.dropout_keep_prob,
                    forward_only=False, sampling=False)
                train_loss.append(loss)
                train_accuracy.append(accuracy)

            print()
            print("EPOCH %i SUMMARY" % epoch_num)
            print("Training loss %.3f" % np.mean(train_loss))
            print("Training accuracy %.3f" % np.mean(train_accuracy))
            print("----------------------")

            # Validation results.
            for valid_epoch_num, valid_epoch in enumerate(
                    generate_epoch(valid_X, valid_y, valid_seq_lens,
                                   num_epochs=1, batch_size=FLAGS.batch_size)):
                valid_loss = []
                valid_accuracy = []
                for valid_batch_num, \
                        (valid_batch_X, valid_batch_y, valid_batch_seq_lens) in \
                        enumerate(valid_epoch):
                    loss, accuracy = model.step(
                        sess, valid_batch_X, valid_batch_seq_lens,
                        valid_batch_y, dropout_keep_prob=1.0,
                        forward_only=True, sampling=False)
                    valid_loss.append(loss)
                    valid_accuracy.append(accuracy)

            print("Validation loss %.3f" % np.mean(valid_loss))
            print("Validation accuracy %.3f" % np.mean(valid_accuracy))
            print("----------------------")

            # Save a checkpoint every epoch.
            if not os.path.isdir(FLAGS.ckpt_dir):
                os.makedirs(FLAGS.ckpt_dir)
            checkpoint_path = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
            print("Saving the model.")
            model.saver.save(sess, checkpoint_path,
                             global_step=model.global_step)
def build_vocab():
    create_vocabulary(MODERN_VOCAB_PATH, MODERN_PATH, MODERN_VOCAB_MAX,
                      tokenizer=tokenizer)
    create_vocabulary(ORIGINAL_VOCAB_PATH, ORIGINAL_PATH, ORIGINAL_VOCAB_MAX,
                      tokenizer=tokenizer)
    # Report the vocabulary sizes (line counts of the vocab files).
    print(subprocess.check_output(['wc', '-l', MODERN_VOCAB_PATH]))
    print(subprocess.check_output(['wc', '-l', ORIGINAL_VOCAB_PATH]))
def train():
    """Train a query->answer chitchat model."""
    # with tf.device("/gpu:0"):
    # Prepare the data.
    train_path = os.path.join(FLAGS.data_dir, "chitchat.train")
    fixed_path = os.path.join(FLAGS.data_dir, "chitchat.fixed")
    weibo_path = os.path.join(FLAGS.data_dir, "chitchat.weibo")
    qa_path = os.path.join(FLAGS.data_dir, "chitchat.qa")

    voc_file_path = [
        train_path + ".answer", fixed_path + ".answer",
        weibo_path + ".answer", qa_path + ".answer",
        train_path + ".query", fixed_path + ".query",
        weibo_path + ".query", qa_path + ".query"
    ]

    vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.all" % FLAGS.vocab_size)
    data_utils.create_vocabulary(vocab_path, voc_file_path, FLAGS.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    print("Preparing Chitchat data in %s" % FLAGS.data_dir)
    train_query, train_answer, dev_query, dev_answer = \
        data_utils.prepare_chitchat_data(FLAGS.data_dir, vocab,
                                         FLAGS.vocab_size)

    print("Preparing Fixed data in %s" % FLAGS.fixed_set_path)
    fixed_path = os.path.join(FLAGS.fixed_set_path, "chitchat.fixed")
    fixed_query, fixed_answer = data_utils.prepare_defined_data(
        fixed_path, vocab, FLAGS.vocab_size)

    print("Preparing Weibo data in %s" % FLAGS.weibo_set_path)
    weibo_path = os.path.join(FLAGS.weibo_set_path, "chitchat.weibo")
    weibo_query, weibo_answer = data_utils.prepare_defined_data(
        weibo_path, vocab, FLAGS.vocab_size)

    print("Preparing QA data in %s" % FLAGS.qa_set_path)
    qa_path = os.path.join(FLAGS.qa_set_path, "chitchat.qa")
    qa_query, qa_answer = data_utils.prepare_defined_data(
        qa_path, vocab, FLAGS.vocab_size)

    dummy_path = os.path.join(FLAGS.data_dir, "chitchat.dummy")
    dummy_set = data_utils.get_dummy_set(dummy_path, vocab, FLAGS.vocab_size)
    print("Get Dummy Set : ", dummy_set)

    with tf.Session() as sess:
        # with tf.device("/gpu:1"):
        # Create the model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, dummy_set, False)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)."
              % FLAGS.max_train_data_size)
        dev_set = read_data(dev_query, dev_answer)
        train_set = read_data(train_query, train_answer,
                              FLAGS.max_train_data_size)
        fixed_set = read_data(fixed_query, fixed_answer,
                              FLAGS.max_train_data_size)
        weibo_set = read_data(weibo_query, weibo_answer,
                              FLAGS.max_train_data_size)
        qa_set = read_data(qa_query, qa_answer, FLAGS.max_train_data_size)

        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []

        en_dict_cover = {}
        fr_dict_cover = {}
        if model.global_step.eval() > FLAGS.steps_per_checkpoint:
            try:
                with open(FLAGS.en_cover_dict_path, "rb") as ef:
                    en_dict_cover = pickle.load(ef)
                # for line in ef.readlines():
                #     line = line.strip()
                #     key, value = line.split(",")
                #     en_dict_cover[int(key)] = int(value)
            except Exception:
                print("query_cover_file not found")
            try:
                with open(FLAGS.ff_cover_dict_path, "rb") as ff:
                    fr_dict_cover = pickle.load(ff)
                # for line in ff.readlines():
                #     line = line.strip()
                #     key, value = line.split(",")
                #     fr_dict_cover[int(key)] = int(value)
            except Exception:
                print("answer_cover_file not found")

        step_loss_summary = tf.Summary()
        # merge = tf.merge_all_summaries()
        writer = tf.summary.FileWriter("../logs/", sess.graph)

        while True:
            # Choose a bucket according to the data distribution. We pick a
            # random number in [0, 1] and use the corresponding interval in
            # train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            (encoder_inputs, decoder_inputs, target_weights,
             batch_source_encoder, batch_source_decoder) = model.get_batch(
                train_set, bucket_id, 0, fixed_set, weibo_set, qa_set)

            if FLAGS.reinforce_learning:
                _, step_loss, _ = model.step_rl(sess, _buckets, encoder_inputs,
                                                decoder_inputs, target_weights,
                                                batch_source_encoder,
                                                batch_source_decoder, bucket_id)
            else:
                _, step_loss, _ = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, forward_only=False,
                                             force_dec_input=True)

            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Track how often each vocabulary id has been seen.
            query_size, answer_size = _buckets[bucket_id]
            for batch_index in xrange(FLAGS.batch_size):
                for query_index in xrange(query_size):
                    query_word = encoder_inputs[query_index][batch_index]
                    if query_word in en_dict_cover:
                        en_dict_cover[query_word] += 1
                    else:
                        en_dict_cover[query_word] = 0
                for answer_index in xrange(answer_size):
                    answer_word = decoder_inputs[answer_index][batch_index]
                    if answer_word in fr_dict_cover:
                        fr_dict_cover[answer_word] += 1
                    else:
                        fr_dict_cover[answer_word] = 0

            # Once in a while, we save a checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                bucket_value = step_loss_summary.value.add()
                bucket_value.tag = "loss"
                bucket_value.simple_value = float(loss)
                writer.add_summary(step_loss_summary, current_step)

                print("query_dict_cover_num: %s" % (str(en_dict_cover.__len__())))
                print("answer_dict_cover_num: %s" % (str(fr_dict_cover.__len__())))

                ef = open(FLAGS.en_cover_dict_path, "wb")
                pickle.dump(en_dict_cover, ef)
                ff = open(FLAGS.ff_cover_dict_path, "wb")
                pickle.dump(fr_dict_cover, ff)

                # Print statistics for the previous epoch.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print("global step %d learning rate %.4f step-time %.2f "
                      "perplexity %.2f"
                      % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))

                # Decrease learning rate if no improvement was seen over the
                # last 3 checkpoints.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)

                # Save a checkpoint and zero the timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "chitchat.model")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0

                # Run evals on the development set and print their perplexity.
                # for bucket_id in xrange(len(_buckets)):
                #     encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                #         dev_set, bucket_id)
                #     _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                #                                  target_weights, bucket_id, True)
                #     eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                #     print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                sys.stdout.flush()
                reply.append(data_utils.token_ids_to_sentence(out, rev_vocab))
            print(reply)
            sys.stdout.write("> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
    # Training the model.
    else:
        print("Training started")
        data_utils.create_vocabulary(vocab_path, data_path, FLAGS.vocab_size)
        train_set, train_bucket_lengths, _ = read_conversation_data(
            data_path, vocab_path)
        dev_set, dev_bucket_lengths, _ = read_conversation_data(
            dev_data, vocab_path)
        print("train_bucket_lengths")
        print(train_bucket_lengths)
        print("dev_bucket_lengths")
        print(dev_bucket_lengths)
        tf.reset_default_graph()
        with tf.Session() as session:
import os

import data_utils
import translate

data_dir = "poems"
in_vocabulary_size = 20000
out_vocabulary_size = 20000
train_path = "{0}/l5.train.txt".format(data_dir)
dev_path = "{0}/l5.valid.txt".format(data_dir)

# Create vocabularies of the appropriate sizes.
'''in_vocab_path = os.path.join(data_dir, "vocab%d.in" % in_vocabulary_size)
out_vocab_path = os.path.join(data_dir, "vocab%d.out" % out_vocabulary_size)
data_utils.create_vocabulary(in_vocab_path, train_path + ".in", in_vocabulary_size)
data_utils.create_vocabulary(out_vocab_path, train_path + ".out", out_vocabulary_size)

in_train_ids_path = train_path + (".ids%d.in" % in_vocabulary_size)
out_train_ids_path = train_path + (".ids%d.out" % out_vocabulary_size)
data_utils.data_to_token_ids(train_path + ".in", in_train_ids_path, in_vocab_path)
data_utils.data_to_token_ids(train_path + ".out", out_train_ids_path, out_vocab_path)

prepare_poem_data(data_dir, in_vocabulary_size, out_vocabulary_size, tokenizer=None)'''

in_train, out_train, in_dev, out_dev, _, _ = data_utils.prepare_poem_data(
    data_dir, in_vocabulary_size, out_vocabulary_size,
    line_based=True, skip_thought=True)
train_set = translate.read_data(in_train, out_train)
print(train_set)