def test_read_train_dev_test():
    """Smoke-test the PTB readers by printing bucket boundaries and sizes.

    Reads the train/valid/test files under ``<root_dir>/data/ptb`` through
    ``data_util.read_train_dev`` and ``data_util.read_test`` (creating the
    cache directory first if it is missing) and prints, for each split, the
    bucket boundaries followed by the number of entries in every bucket.

    Nothing is asserted and nothing is returned; the output is meant to be
    inspected by eye.
    """
    data_dir = os.path.join(root_dir, "data/ptb")
    train_path = os.path.join(data_dir, "train")
    dev_path = os.path.join(data_dir, "valid")
    test_path = os.path.join(data_dir, "test")
    cache_dir = os.path.join(root_dir, "data/ptb/cache")
    vocab_size = 20000

    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)

    # NOTE(review): the trailing 100 and 10 sit in the positions where train()
    # passes FLAGS.L and FLAGS.n_bucket -- confirm against data_util.
    train_data_bucket, dev_data_bucket, _buckets, vocab_path = data_util.read_train_dev(
        cache_dir, train_path, dev_path, vocab_size, 100, 10)
    test_data_bucket, _buckets_test = data_util.read_test(
        cache_dir, test_path, vocab_path, vocab_size, 100, 10)

    def print_bucket_data(data):
        """Print the number of entries in each bucket of *data*."""
        print([len(bucket) for bucket in data])

    # Single-argument print(...) calls behave identically on Python 2 and 3,
    # unlike the original print statements which only parse on Python 2.
    print("_buckets: {}\n".format(_buckets))
    print_bucket_data(train_data_bucket)
    print_bucket_data(dev_data_bucket)

    print("_buckets_test: {}\n".format(_buckets_test))
    print_bucket_data(test_data_bucket)
def train():
    """Train the model with periodic dev evaluation and patience-based stopping.

    Workflow, as implemented below:

    1. Read the bucketed train/dev data via ``read_train_dev`` and store the
       bucket boundaries and real vocabulary size on ``FLAGS``.
    2. Log a report of dataset sizes and the derived step counts.
    3. Inside a ``tf.Session``, build the model with ``create_model`` and draw
       batches from a ``DataIterator``.
    4. Every ``steps_per_report`` steps, log throughput. If ``FLAGS.profile``
       is set, dump a Chrome trace to ``timeline.json`` and exit right after
       the first report.
    5. Every half epoch, log train/dev perplexity, optionally save a
       checkpoint, and save the "best" model when dev perplexity improves.
       Otherwise run the model's learning-rate decay op and decrement the
       patience counter; training stops once patience reaches zero.

    Reads its whole configuration from the module-level ``FLAGS`` object and
    returns nothing.
    """
    # 1. Read the train and dev data.
    mylog_section('READ DATA')
    train_data_bucket, dev_data_bucket, _buckets, vocab_path = read_train_dev(
        FLAGS.data_cache_dir, FLAGS.train_path, FLAGS.dev_path,
        FLAGS.vocab_size, FLAGS.L, FLAGS.n_bucket)

    # ---------- values needed for the report: start ----------
    real_vocab_size = get_real_vocab_size(vocab_path)
    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    # Total number of tokens to process across all training buckets.
    train_n_tokens = np.sum([
        np.sum([len(sentence) for sentence in bucket])
        for bucket in train_data_bucket
    ])

    # Number of sentences per training bucket.
    # `range` (not `xrange`) keeps this runnable on both Python 2 and 3.
    train_bucket_sizes = [
        len(train_data_bucket[index]) for index in range(len(_buckets))
    ]
    train_total_size = float(sum(train_bucket_sizes))

    # Cumulative fraction of sentences up to and including each bucket,
    # e.g. [0.1, 0.3, 0.5, 0.8, 1]. Per the original author's note, the data
    # iterator draws a random number in [0, 1) and picks the bucket whose
    # interval contains it, so larger buckets are sampled more often.
    train_buckets_scale = [
        sum(train_bucket_sizes[:i + 1]) / train_total_size
        for i in range(len(train_bucket_sizes))
    ]

    dev_bucket_sizes = [
        len(dev_data_bucket[index]) for index in range(len(_buckets))
    ]
    dev_total_size = int(sum(dev_bucket_sizes))

    mylog_section("REPORT")

    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    steps_per_epoch = int(train_total_size / batch_size)
    # Evaluate on dev every half epoch. Clamp to 1 so that an epoch made of a
    # single step does not yield 0 and a ZeroDivisionError in the loop below.
    steps_per_checkpoint = max(1, steps_per_epoch // 2)
    total_steps = steps_per_epoch * n_epoch

    # reports
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))
    # ---------- values needed for the report: end ----------

    mylog_section("IN TENSORFLOW")

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:
        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL/SUMMARY/WRITER")
        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, run_options, run_metadata)

        mylog_section("All Variables")
        show_all_variables()

        # Data Iterators
        mylog_section("Data Iterators")
        dite = DataIterator(model, train_data_bucket,
                            len(train_buckets_scale), batch_size,
                            train_buckets_scale)

        # Hard-coded switch between random (0) and sequential (1) batching.
        iteType = 0
        if iteType == 0:
            mylog("Itetype: withRandom")
            ite = dite.next_random()
        elif iteType == 1:
            mylog("Itetype: withSequence")
            ite = dite.next_sequence()

        # statistics during training
        loss = 0.0
        current_step = 0
        low_ppx = float("inf")
        steps_per_report = 30
        n_targets_report = 0
        report_time = 0
        n_valid_words = 0
        patience = FLAGS.patience

        mylog_section("TRAIN")

        while current_step < total_steps:
            # start
            start_time = time.time()

            # Fetch one training batch and run a training step.
            # next(ite) works on Python 2 and 3; ite.next() is Python 2 only.
            inputs, outputs, weights, bucket_id = next(ite)
            step_loss = model.step(sess, inputs, outputs, weights, bucket_id)

            loss += step_loss
            current_step += 1

            # Per the original author's note, len(weights) is the sentence
            # length and len(weights[0]) is the batch size.
            batch_targets = np.sum(weights)
            n_valid_words += batch_targets

            # for report
            report_time += (time.time() - start_time)
            n_targets_report += batch_targets

            # Periodic throughput report.
            if current_step % steps_per_report == 0:
                sect_name = "STEP {}".format(current_step)
                msg = "StepTime: {:.2f} sec Speed: {:.2f} targets/s Total_targets: {}".format(
                    report_time / steps_per_report,
                    n_targets_report * 1.0 / report_time, train_n_tokens)
                mylog_line(sect_name, msg)

                report_time = 0
                n_targets_report = 0

                # Create the Timeline object, and write it to a json
                if FLAGS.profile:
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                    # Profiling runs stop after the first report.
                    exit()

            # Half an epoch reached: compute perplexity on train and dev.
            if current_step % steps_per_checkpoint == 0:
                i_checkpoint = current_step // steps_per_checkpoint

                # train_ppx: accumulated loss averaged over target words seen
                # since the last checkpoint; the 300 cut-off avoids overflow
                # in math.exp.
                loss = loss / n_valid_words
                train_ppx = math.exp(
                    float(loss)) if loss < 300 else float("inf")
                learning_rate = model.learning_rate.eval()

                # dev_ppx (the loss returned alongside it is not used here)
                _, dev_ppx = evaluate(sess, model, dev_data_bucket)

                # report
                sect_name = "CHECKPOINT {} STEP {}".format(
                    i_checkpoint, current_step)
                msg = "Learning_rate: {:.4f} Dev_ppx: {:.2f} Train_ppx: {:.2f}".format(
                    learning_rate, dev_ppx, train_ppx)
                mylog_line(sect_name, msg)

                # save model per checkpoint
                if FLAGS.saveCheckpoint:
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir,
                                                   "model")
                    s = time.time()
                    model.saver.save(sess, checkpoint_path,
                                     global_step=i_checkpoint,
                                     write_meta_graph=False)
                    msg = "Model saved using {:.2f} sec at {}".format(
                        time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)

                # save best model
                if dev_ppx < low_ppx:
                    # Improvement on dev: reset patience and keep this model.
                    patience = FLAGS.patience
                    low_ppx = dev_ppx
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir,
                                                   "best")
                    s = time.time()
                    model.best_saver.save(sess, checkpoint_path,
                                          global_step=0,
                                          write_meta_graph=False)
                    msg = "Model saved using {:.2f} sec at {}".format(
                        time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)
                else:
                    patience -= 1
                    # No improvement on dev: run the model's decay op. The
                    # original comment says this halves the learning rate;
                    # the actual factor is defined by the model.
                    sess.run(model.learning_rate_decay_op)
                    msg = 'dev_ppx:{}, low_ppx:{}'.format(
                        str(dev_ppx), str(low_ppx))
                    mylog_line(sect_name, msg)
                    msg = 'dev_ppx >= low_ppx,patience ={}, learning_reate ={}'.format(
                        str(patience), str(model.learning_rate.eval()))
                    mylog_line(sect_name, msg)

                    if patience <= 0:
                        mylog("Training finished. Running out of patience.")
                        break

                # Reset the per-checkpoint accumulators.
                loss, n_valid_words = 0.0, 0
def train():
    """Train the model, logging summaries, with patience-based early stopping.

    Workflow, as implemented below:

    1. Read the bucketed train/dev data via ``read_train_dev`` and store the
       bucket boundaries and real vocabulary size on ``FLAGS``.
    2. Log a report of dataset sizes and the derived step counts.
    3. Inside a ``tf.Session``, build the model, a ``ModelSummary`` and a
       ``tf.summary.FileWriter``, then draw batches from a ``DataIterator``.
    4. Every ``steps_per_report`` steps, log throughput. If ``FLAGS.profile``
       is set, dump a Chrome trace to ``timeline.json`` and exit right after
       the first report.
    5. Every half epoch, log train/dev perplexity, write summaries,
       optionally save a checkpoint, and save the "best" model when dev
       perplexity improves; otherwise decrement the patience counter and
       stop once it reaches zero.

    The summary writer is closed on every exit path. Reads its whole
    configuration from the module-level ``FLAGS`` object and returns nothing.
    """
    # Read Data
    mylog_section("READ DATA")

    train_data_bucket, dev_data_bucket, _buckets, vocab_path = read_train_dev(
        FLAGS.data_cache_dir, FLAGS.train_path, FLAGS.dev_path,
        FLAGS.vocab_size, FLAGS.L, FLAGS.n_bucket)

    # Per the original author's notes: at this point train_data_bucket,
    # dev_data_bucket and _buckets have the same length; both data lists look
    # like [b1, b2, ..., bn] where each bi holds sentences already converted
    # to numbers; _buckets (e.g. [2, 4, 5]) holds the sentence-length
    # boundaries used for the split.
    real_vocab_size = get_real_vocab_size(vocab_path)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    # Total number of tokens to process across all training buckets.
    train_n_tokens = np.sum(
        [np.sum([len(items) for items in x]) for x in train_data_bucket])

    # Number of sentences per training bucket.
    train_bucket_sizes = [
        len(train_data_bucket[b]) for b in range(len(_buckets))
    ]
    train_total_size = float(sum(train_bucket_sizes))

    # Cumulative fraction of sentences up to and including each bucket.
    train_buckets_scale = [
        sum(train_bucket_sizes[:i + 1]) / train_total_size
        for i in range(len(train_bucket_sizes))
    ]

    dev_bucket_sizes = [len(dev_data_bucket[b]) for b in range(len(_buckets))]
    dev_total_size = int(sum(dev_bucket_sizes))

    mylog_section("REPORT")

    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    steps_per_epoch = int(train_total_size / batch_size)
    # Evaluate on dev every half epoch. Clamp to 1 so that an epoch made of a
    # single step does not yield 0 and a ZeroDivisionError in the loop below.
    steps_per_checkpoint = max(1, steps_per_epoch // 2)
    total_steps = steps_per_epoch * n_epoch

    # reports
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))

    mylog_section("IN TENSORFLOW")

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:
        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL/SUMMARY/WRITER")

        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, run_options, run_metadata)

        mylog("Creating ModelSummary")
        modelSummary = ModelSummary()

        mylog("Creating tf.summary.FileWriter")
        summaryWriter = tf.summary.FileWriter(
            os.path.join(FLAGS.summary_dir, "train.summary"), sess.graph)

        # Everything below runs under try/finally so the writer is closed
        # on normal completion, early stopping, the profiling exit() and
        # on errors alike.
        try:
            mylog_section("All Variables")
            show_all_variables()

            # Data Iterators
            mylog_section("Data Iterators")
            dite = DataIterator(model, train_data_bucket,
                                len(train_buckets_scale), batch_size,
                                train_buckets_scale)

            # Hard-coded switch between random (0) and sequential (1)
            # batching.
            iteType = 0
            if iteType == 0:
                mylog("Itetype: withRandom")
                ite = dite.next_random()
            elif iteType == 1:
                mylog("Itetype: withSequence")
                ite = dite.next_sequence()

            # statistics during training
            loss = 0.0
            current_step = 0
            low_ppx = float("inf")
            steps_per_report = 30
            n_targets_report = 0
            report_time = 0
            n_valid_words = 0
            patience = FLAGS.patience

            mylog_section("TRAIN")

            while current_step < total_steps:
                # start
                start_time = time.time()

                # data and train
                inputs, outputs, weights, bucket_id = next(ite)
                step_loss = model.step(sess, inputs, outputs, weights,
                                       bucket_id)

                loss += step_loss
                current_step += 1

                # Per the original author's note, len(weights) is the
                # sentence length and len(weights[0]) is the batch size.
                batch_targets = np.sum(weights)
                n_valid_words += batch_targets

                # for report
                report_time += (time.time() - start_time)
                n_targets_report += batch_targets

                if current_step % steps_per_report == 0:
                    sect_name = "STEP {}".format(current_step)
                    msg = "StepTime: {:.2f} sec Speed: {:.2f} targets/s Total_targets: {}".format(
                        report_time / steps_per_report,
                        n_targets_report * 1.0 / report_time, train_n_tokens)
                    mylog_line(sect_name, msg)

                    report_time = 0
                    n_targets_report = 0

                    # Create the Timeline object, and write it to a json
                    if FLAGS.profile:
                        tl = timeline.Timeline(run_metadata.step_stats)
                        ctf = tl.generate_chrome_trace_format()
                        with open('timeline.json', 'w') as f:
                            f.write(ctf)
                        # Profiling runs stop after the first report.
                        exit()

                if current_step % steps_per_checkpoint == 0:
                    i_checkpoint = current_step // steps_per_checkpoint

                    # train_ppx: accumulated loss averaged over the target
                    # words seen since the last checkpoint; the 300 cut-off
                    # avoids overflow in math.exp.
                    loss = loss / n_valid_words
                    train_ppx = math.exp(
                        float(loss)) if loss < 300 else float("inf")
                    learning_rate = model.learning_rate.eval()

                    # dev_ppx (the loss returned alongside it is unused)
                    _, dev_ppx = evaluate(sess, model, dev_data_bucket)

                    # report
                    sect_name = "CHECKPOINT {} STEP {}".format(
                        i_checkpoint, current_step)
                    msg = "Learning_rate: {:.4f} Dev_ppx: {:.2f} Train_ppx: {:.2f}".format(
                        learning_rate, dev_ppx, train_ppx)
                    mylog_line(sect_name, msg)

                    # save summary
                    _summaries = modelSummary.step_record(
                        sess, train_ppx, dev_ppx)
                    for _summary in _summaries:
                        summaryWriter.add_summary(_summary, i_checkpoint)

                    # save model per checkpoint
                    if FLAGS.saveCheckpoint:
                        checkpoint_path = os.path.join(
                            FLAGS.saved_model_dir, "model")
                        s = time.time()
                        model.saver.save(sess, checkpoint_path,
                                         global_step=i_checkpoint,
                                         write_meta_graph=False)
                        msg = "Model saved using {:.2f} sec at {}".format(
                            time.time() - s, checkpoint_path)
                        mylog_line(sect_name, msg)

                    # save best model
                    if dev_ppx < low_ppx:
                        # Improvement on dev: reset patience, keep the model.
                        patience = FLAGS.patience
                        low_ppx = dev_ppx
                        checkpoint_path = os.path.join(
                            FLAGS.saved_model_dir, "best")
                        s = time.time()
                        model.best_saver.save(sess, checkpoint_path,
                                              global_step=0,
                                              write_meta_graph=False)
                        msg = "Model saved using {:.2f} sec at {}".format(
                            time.time() - s, checkpoint_path)
                        mylog_line(sect_name, msg)
                    else:
                        patience -= 1

                        if patience <= 0:
                            mylog(
                                "Training finished. Running out of patience.")
                            break

                    # Reset the per-checkpoint accumulators.
                    loss, n_valid_words = 0.0, 0
        finally:
            summaryWriter.close()