if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    # Resume from the latest checkpoint; recover the step counter from the
    # checkpoint file name (e.g. "chatbot.ckpt-1000")
    saver.restore(sess, ckpt.model_checkpoint_path)
    current_step = int(
        os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
    print(current_step)
else:
    print('Created new model parameters..')
    current_step = 0

total_loss = 0
total_bleu = 0
total_perpl = 0
summary_writer = tf.summary.FileWriter(config.LOGS, graph=sess.graph)
for e in range(config.EPOCHS):
    print("----- Epoch {}/{} -----".format(e + 1, config.EPOCHS))
    batches = getBatches(trainingSamples, config.BATCH_SIZE)
    # Save a checkpoint at the start of every epoch after the first
    if e != 0:
        checkpoint_path = os.path.join(config.MODEL_DIR, config.MODEL_NAME)
        saver.save(sess, checkpoint_path, global_step=e)
    for nextBatch in tqdm(batches, desc="Training"):
        if current_step == 150000:
            break  # stop updating after 150000 steps (breaks the inner batch loop)
        loss, summary, pred, target = model.train(sess, nextBatch)
        bleu = get_bleu(target, pred)
        total_loss += loss
        total_perpl += 2 ** float(loss)  # per-batch perplexity (base 2 here; the later snippets use math.exp)
        total_bleu += bleu
        current_step += 1
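The loop above relies on a get_bleu(target, pred) helper that is not shown in this excerpt. A minimal sketch, assuming batches of token sequences and using NLTK's sentence-level BLEU (the helper's name comes from the snippet; the implementation here is an assumption, not the project's actual code):

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def get_bleu(target, pred):
    # Assumed implementation: average sentence-level BLEU over a batch of
    # token sequences, using one reference per hypothesis.
    smooth = SmoothingFunction().method1  # avoids zero scores on short outputs
    scores = [sentence_bleu([list(ref)], list(hyp), smoothing_function=smooth)
              for ref, hyp in zip(target, pred)]
    return sum(scores) / max(len(scores), 1)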
tf.app.flags.DEFINE_string('model_name', 'chatbot.ckpt',
                           'File name used for model checkpoints')
FLAGS = tf.app.flags.FLAGS

data_path = '/Users/shengwan/Desktop/seq2seq_chatbot_new-master/data/dataset-cornell-length10-filter1-vocabSize40000.pkl'
word2id, id2word, trainingSamples = loadDataset(data_path)

with tf.Session() as sess:
    model = Seq2SeqModel(FLAGS.rnn_size, FLAGS.num_layers, FLAGS.embedding_size,
                         FLAGS.learning_rate, word2id, mode='train',
                         use_attention=True, beam_search=False, beam_size=5,
                         max_gradient_norm=5.0)
    ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print('Reloading model parameters from', ckpt.model_checkpoint_path)
        sess = model.restore_last_session()
    else:
        print('Created new model parameters..')
        sess.run(tf.global_variables_initializer())
    current_step = 0
    summary_writer = tf.summary.FileWriter(FLAGS.model_dir, graph=sess.graph)
    for e in range(FLAGS.numEpochs):
        print("----- Epoch {}/{} -----".format(e + 1, FLAGS.numEpochs))
        batches = getBatches(trainingSamples, FLAGS.batch_size)
        for nextBatch in tqdm(batches, desc="Training"):
            loss, summary = model.train(sess, nextBatch)
            current_step += 1
            # Every steps_per_checkpoint updates: log progress and save
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Guard against overflow: exp(300) already exceeds float range
                perplexity = math.exp(float(loss)) if loss < 300 else float('inf')
                tqdm.write("----- Step %d -- Loss %.2f -- Perplexity %.2f"
                           % (current_step, loss, perplexity))
                summary_writer.add_summary(summary, current_step)
                checkpoint_path = os.path.join(FLAGS.model_dir, FLAGS.model_name)
                model.saver.save(sess, checkpoint_path, global_step=current_step)
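Only model_name is defined above, but the script also reads rnn_size, num_layers, embedding_size, learning_rate, model_dir, numEpochs, batch_size, and steps_per_checkpoint. One plausible set of definitions (the values below are illustrative assumptions, not the project's actual defaults):

tf.app.flags.DEFINE_integer('rnn_size', 1024, 'Number of hidden units per RNN layer')
tf.app.flags.DEFINE_integer('num_layers', 2, 'Number of RNN layers')
tf.app.flags.DEFINE_integer('embedding_size', 1024, 'Dimensionality of word embeddings')
tf.app.flags.DEFINE_float('learning_rate', 0.0001, 'Initial learning rate')
tf.app.flags.DEFINE_string('model_dir', 'model/', 'Directory for checkpoints and summaries')
tf.app.flags.DEFINE_integer('numEpochs', 30, 'Number of training epochs')
tf.app.flags.DEFINE_integer('batch_size', 128, 'Training batch size')
tf.app.flags.DEFINE_integer('steps_per_checkpoint', 100, 'Steps between logging and checkpointing')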
# continue_training is an assumed flag (its definition is not in this excerpt):
# either resume from a saved checkpoint or train from scratch
if continue_training:
    ckpt = tf.train.get_checkpoint_state(model_dir)
    # If a saved model exists
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print('Reloading model parameters..')
        # Restore the variables with saver.restore()
        model.saver.restore(sess, ckpt.model_checkpoint_path)
    else:  # no saved model found
        raise ValueError('No such file:[{}]'.format(model_dir))  # raise an error
else:
    # Train the model from scratch
    sess.run(tf.global_variables_initializer())

for e in range(epochs):  # for every epoch
    # Print the epoch's progress
    print("----- Epoch {}/{} -----".format(e + 1, epochs))
    # Split this epoch's data into batches; see data_helpers.py for details
    batches = getBatches(sources_data, targets_data, batch_size)
    step = 0  # counts how many batches have been trained
    for nextBatch in batches:  # iterate over every batch
        # Feed this batch to the network for one training step
        loss, summary = model.train(nextBatch)
        if step % display == 0:  # print training info every `display` batches
            # math.exp(x) returns e to the power x; inf is positive infinity
            # Compute the perplexity
            perplexity = math.exp(
                float(loss)) if loss < 300 else float('inf')
            # Print the loss and perplexity
            print("----- Loss %.2f -- Perplexity %.2f" % (loss, perplexity))
        step += 1  # count how many batches have been trained
    # Save the current model parameters
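Here getBatches takes parallel source/target lists and lives in data_helpers.py, which is not reproduced in this excerpt. A rough sketch of what such a function typically does (shuffle the pairs, then slice them into batch_size chunks; the exact batch object that model.train() expects is an assumption):

import random

def getBatches(sources_data, targets_data, batch_size):
    # Sketch only: shuffle the parallel corpus, then yield
    # (sources, targets) slices of batch_size pairs at a time.
    pairs = list(zip(sources_data, targets_data))
    random.shuffle(pairs)
    for i in range(0, len(pairs), batch_size):
        chunk = pairs[i:i + batch_size]
        sources, targets = zip(*chunk)
        yield list(sources), list(targets)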