def run_epoch(sess, model, data):
    epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
    start_time = time.time()
    # accumulated counts
    costs = 0.0
    iters = 0
    # initial RNN state
    state = model.initial_state.eval()
    for step, (x, y) in enumerate(
            ptb_reader.ptb_iterator(data, model.batch_size, model.num_steps)):
        cost, state, _ = sess.run(
            [model.cost, model.final_state, model.train_op],
            feed_dict={
                model.input_data: x,
                model.targets: y,
                model.initial_state: state
            })
        costs += cost
        iters += model.num_steps
        perplexity = np.exp(costs / iters)
        # cap the run at 100 steps; without the step > 0 guard the
        # original `step % 100 == 0` exited on the very first batch
        if step > 0 and step % 100 == 0:
            break
    return (costs / iters), perplexity
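# Perplexity throughout these snippets is np.exp(costs / iters): each batch's
# cost accumulates cross-entropy (in nats) summed over its num_steps positions
# and averaged across batch entries, and iters counts positions seen, so
# costs / iters is the average per-word cross-entropy and its exponential the
# per-word perplexity. A quick numeric check:
#     >>> np.exp(4.6)   # average cross-entropy of 4.6 nats
#     99.484...         # i.e. perplexity of roughly 100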
def test(test_data, verbose=0):
    model = load_model('weights/my_model.h5')
    # model = get_model()
    # model.load_weights('my_model_weights.h5')
    acc = 0.0
    siz = 0
    perplexity = []
    for step, (x, y) in enumerate(
            ptb_reader.ptb_iterator(test_data, dataset_size, num_steps)):
        x1, y1 = one_hot(x, y[:, -1])
        output = model.predict(x1, verbose=verbose)
        score, accuracy = model.evaluate(x1, y1, verbose=1, batch_size=10)
        perplexity.append(np.power(accuracy, 2))
        siz += 1
        print('')
        print('Step: ', step + 1, end='')
        print(', Test accuracy:', accuracy)
        acc += accuracy
    print('Average Accuracy: ', acc / siz)
    return np.mean(perplexity)  # the original returned `perp_np`, which is never defined
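# Both test() above and train() below call a one_hot() helper that is not
# shown in these snippets. A plausible sketch, reconstructed from the
# encoding loops commented out in the original train(); the names and the
# feat_len default (assumed to be the vocabulary size) are guesses from the
# call sites, not the original implementation:
def one_hot(x, y_last, feat_len=10000):
    """One-hot encode a (batch, num_steps) int batch and its final-step targets."""
    x1 = np.zeros(x.shape + (feat_len,))
    rows, cols = np.indices(x.shape)
    x1[rows, cols, x] = 1              # x1[i, j, x[i, j]] = 1
    y1 = np.zeros((len(y_last), feat_len))
    y1[np.arange(len(y_last)), y_last] = 1  # y1[i, y_last[i]] = 1
    return x1, y1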
def run_epoch(sess, model, data, verbose=False):
    epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
    start_time = time.time()
    # accumulated counts
    costs = 0.0
    iters = 0
    # initial RNN state
    state = model.initial_state.eval()
    for step, (x, y) in enumerate(
            ptb_reader.ptb_iterator(data, model.batch_size, model.num_steps)):
        cost, state, _ = sess.run(
            [model.cost, model.final_state, model.train_op],
            feed_dict={
                model.input_data: x,
                model.targets: y,
                model.initial_state: state
            })
        costs += cost
        iters += model.num_steps
        perplexity = np.exp(costs / iters)
        if verbose and step % 10 == 0:
            progress = (step / epoch_size) * 100
            wps = iters * model.batch_size / (time.time() - start_time)
            print("%.1f%% Perplexity: %.3f (Cost: %.3f) Speed: %.0f wps" %
                  (progress, perplexity, cost, wps))
    return (costs / iters), perplexity
def train(train_data, verbose=0, model=None):
    if model is None:
        model = get_model()
    if verbose > 0:
        print('Train...')
    for step, (x, y) in enumerate(
            ptb_reader.ptb_iterator(train_data, dataset_size, num_steps)):
        # one-hot encode the inputs and final-step targets
        # (see the one_hot sketch after test() above)
        x1, y1 = one_hot(x, y[:, -1])
        model.fit(x1, y1, epochs=10, verbose=verbose, batch_size=10)
        if step % 100 == 0 and verbose > 0:
            print(step + 1, end=' ')
        break  # the original trains on a single batch per call
    if not os.path.isdir('weights'):
        os.mkdir('weights')
    model.save('weights/my_model.h5')
def run_epoch(session, model, data, eval_op=None, verbose=False):
    epoch_size = ((len(data) // model.batch_size) - 1) // model.seq_length
    start_time = time.time()
    costs = 0.0
    iters = 0
    state = session.run(model.initial_state)
    if eval_op is None:
        eval_op = tf.no_op()  # eval_op defaults to None but is always fetched below
    for step, (x, y) in enumerate(
            reader.ptb_iterator(data, model.batch_size, model.seq_length)):
        fetches = [model.cost, model.final_state, eval_op]
        feed_dict = {model.input_data: x, model.targets: y}
        # the state is a tuple of latent-variable triples, not LSTM (c, h) pairs
        for i, (z, z_mean, z_log_sigma_sq) in enumerate(model.initial_state):
            feed_dict[z] = state[i].z
            feed_dict[z_mean] = state[i].z_mean
            feed_dict[z_log_sigma_sq] = state[i].z_log_sigma_sq
        cost, state, _ = session.run(fetches, feed_dict)
        costs += cost
        iters += model.seq_length
        if verbose and step % (epoch_size // 10) == 10:
            print('Progress: %.3f; Perplexity: %.3f; Speed: %.0f wps' %
                  (step * 1.0 / epoch_size, np.exp(costs / iters),
                   iters * model.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)
def run_epoch(sess, model, data):
    epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
    saver = tf.train.Saver()
    # initialize the model state
    state = sess.run(model.initial_state)
    total_cost = 0
    iterations = 0
    for step, (x, y) in enumerate(
            ptb_reader.ptb_iterator(data, model.batch_size, model.num_steps)):
        cost, state, _ = sess.run(
            [model.cost, model.final_state, model.train_op],
            feed_dict={
                model.input_data: x,
                model.targets: y,
                model.initial_state: state
            })
        total_cost += cost
        iterations += model.num_steps
        perplexity = np.exp(total_cost / iterations)
        if step % 100 == 0:
            progress = (step * 1.0 / epoch_size) * 100
            print("%.1f%% Perplexity: %.3f (Cost: %.3f) " %
                  (progress, perplexity, cost))
            # checkpoint every 100 steps
            save_path = saver.save(sess, "./saved_model_rnn/lstm-model.ckpt")
    return (total_cost / iterations), perplexity
def train(data):
    # saver = tf.train.import_meta_graph('weights/ptb_lstm_model.meta')
    # saver.restore(sess, 'weights/ptb_lstm_model')
    saver = tf.train.Saver()  # saver is used below, so it must be created here
    epoch_size = ((len(data) // batch_size) - 1) // lstm_steps
    loss1 = 0.0
    iters = 0
    test_perplexity_prev = 10000000.0
    tf.global_variables_initializer().run()
    for ep in range(epochs):
        for step, (x, y) in enumerate(
                ptb_reader.ptb_iterator(data, batch_size, lstm_steps)):
            loss_temp, _ = sess.run([loss, train_step],
                                    feed_dict={inputs: x, targets: y})
            loss1 += loss_temp
            iters += lstm_steps
            perplexity = np.exp(loss1 / iters)
            # if step % 10 == 0:
            progress = (step / float(epoch_size)) * 100.0
            print("%d %.1f%% Perplexity: %.3f (Loss: %.3f)" %
                  (ep, progress, perplexity, loss1 / iters))
        # evaluate on the test set after each epoch and stop early once
        # test perplexity stops improving
        # test_perplexity = test(test_data)
        loss_test = 0.0
        iters_test = 0
        for step, (x, y) in enumerate(
                ptb_reader.ptb_iterator(test_data, batch_size, lstm_steps)):
            print(step)
            loss_temp = sess.run(loss, feed_dict={inputs: x, targets: y})
            loss_test += loss_temp
            iters_test += lstm_steps
        test_perplexity = np.exp(loss_test / iters_test)
        if test_perplexity > test_perplexity_prev:
            break
        print("Test Perplexity: %.3f" % test_perplexity)
        test_perplexity_prev = test_perplexity
        saver.save(sess, 'weights/ptb_lstm_model')
        print('Trained model saved')
    return perplexity, loss1 / iters
def run_epoch(session, m, data, eval_op, verbose=False):
    """Runs the model on the given data."""
    # epoch_size equals the number of iterations per epoch
    epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
    start_time = time.time()
    costs = 0.0
    iters = 0
    state = m._initial_state.eval()
    for step, (x, y) in enumerate(
            ptb_reader.ptb_iterator(data, m.batch_size, m.num_steps)):
        cost, state, inputs, output, outputs, _ = session.run(
            [m.cost, m.final_state, m.inputs, m.output, m.outputs, eval_op], {
                m.input_data: x,
                m.targets: y,
                m._initial_state: state
            })
        costs += cost
        iters += m.num_steps
        if verbose and step % (epoch_size // 10) == 10:
            print("%.3f perplexity: %.3f speed: %.0f wps" %
                  (step * 1.0 / epoch_size, np.exp(costs / iters),
                   iters * m.batch_size / (time.time() - start_time)))
    tvars = tf.trainable_variables()
    print("printing all trainable variables for num_steps =", m.num_steps)
    for tvar in tvars:
        print(tvar.name, tvar.initialized_value())
    return np.exp(costs / iters)
def run_epoch(session, model, data, train_op, output_log):
    total_costs = 0.0
    iters = 0
    state = session.run(model.initial_state)
    for step, (x, y) in enumerate(
            ptb_reader.ptb_iterator(data, model.batch_size, model.num_steps)):
        cost, state, _ = session.run(
            [model.cost, model.final_state, train_op], {
                model.input_data: x,
                model.targets: y,
                model.initial_state: state
            })
        total_costs += cost
        iters += model.num_steps
        if output_log and step % 100 == 0:
            print('After %d steps, perplexity is %.3f' %
                  (step, np.exp(total_costs / iters)))
    return np.exp(total_costs / iters)
def test(data):
    saver = tf.train.import_meta_graph('weights/ptb_lstm_model.meta')
    saver.restore(sess, 'weights/ptb_lstm_model')
    epoch_size = ((len(data) // batch_size) - 1) // lstm_steps
    loss1 = 0.0
    iters = 0
    # tf.global_variables_initializer().run()
    for step, (x, y) in enumerate(
            ptb_reader.ptb_iterator(data, batch_size, lstm_steps)):
        loss_temp = sess.run(loss, feed_dict={inputs: x, targets: y})
        loss1 += loss_temp
        iters += lstm_steps
    perplexity = np.exp(loss1 / iters)
    return perplexity
def test_epoch(sess, model, data):
    saver = tf.train.Saver()
    saver.restore(sess, "./saved_model_rnn/lstm-model.ckpt")
    state = sess.run(model.initial_state)
    total_cost = 0
    iterations = 0
    epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
    for step, (x, y) in enumerate(
            ptb_reader.ptb_iterator(data, model.batch_size, model.num_steps)):
        cost, state = sess.run(
            [model.cost, model.final_state],
            feed_dict={
                model.input_data: x,
                model.targets: y,
                model.initial_state: state
            })
        total_cost += cost
        iterations += model.num_steps
    perplexity = np.exp(total_cost / iterations)
    return (total_cost / iterations), perplexity
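# test_epoch restores the same "./saved_model_rnn/lstm-model.ckpt" that the
# checkpoint-saving run_epoch above writes, so the two pair naturally. A
# minimal driver sketch (this wrapper and num_epochs are hypothetical; the
# model object is assumed to come from the surrounding project):
def train_and_test(sess, model, train_data, test_data, num_epochs=13):
    sess.run(tf.global_variables_initializer())
    for epoch in range(num_epochs):
        # run_epoch trains and checkpoints periodically
        train_loss, train_ppl = run_epoch(sess, model, train_data)
        print("Epoch %d: train perplexity %.3f" % (epoch + 1, train_ppl))
    # test_epoch restores the latest checkpoint before evaluating
    test_loss, test_ppl = test_epoch(sess, model, test_data)
    print("Test perplexity: %.3f" % test_ppl)
    return test_ppl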
def run_epoch(session, m, data, eval_op, verbose=False, vocabulary=None):
    """Runs the model on the given data.

    :param session: session for computation
    :param m: model object
    :param data: input data
    :param eval_op: op to run alongside the loss (e.g. the train op)
    :param verbose: print progress if True
    :param vocabulary: optional word-to-id mapping for decoding predictions
    """
    epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
    start_time = time.time()
    costs = 0.0
    iters = 0
    state = m.initial_state.eval()
    for step, (x, y) in enumerate(
            ptb_reader.ptb_iterator(data, m.batch_size, m.num_steps)):
        cost, state, probs, logits, _ = session.run(
            [m.cost, m.final_state, m.probabilities, m.logits, eval_op], {
                m.input_data: x,
                m.targets: y,
                m.initial_state: state
            })
        costs += cost
        iters += m.num_steps
        if verbose and step % (epoch_size // 10) == 10:
            print("%.3f perplexity: %.3f speed: %.0f wps" %
                  (step * 1.0 / epoch_size, np.exp(costs / iters),
                   iters * m.batch_size / (time.time() - start_time)))
            chosen_word = np.argmax(probs, 1)
            print("Probabilities shape: %s, Logits shape: %s" %
                  (probs.shape, logits.shape))
            print(chosen_word)
            if vocabulary is not None:
                next_word_id = chosen_word[-1]
                for word_, word_id_ in vocabulary.items():  # iteritems() is Python 2 only
                    if word_id_ == next_word_id:
                        print(word_)
    print("Batch size: %s, Num steps: %s" % (m.batch_size, m.num_steps))
    return np.exp(costs / iters)
def run_epoch(session, m, data, eval_op, verbose=False):
    """Runs the model on the given data."""
    epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
    start_time = time.time()
    costs = 0.0
    iters = 0
    state = m.initial_state.eval()
    for step, (x, y) in enumerate(
            reader.ptb_iterator(data, m.batch_size, m.num_steps)):
        cost, state, _ = session.run(
            [m.cost, m.final_state, eval_op], {
                m.input_data: x,
                m.targets: y,
                m.initial_state: state
            })
        costs += cost
        iters += m.num_steps
        if verbose and step % (epoch_size // 10) == 10:
            print("%.3f perplexity: %.3f speed: %.0f wps" %
                  (step * 1.0 / epoch_size, np.exp(costs / iters),
                   iters * m.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)
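# The eval_op argument is what distinguishes a training pass from an
# evaluation pass in the function above. A sketch of the usual calling
# convention from the classic PTB tutorial (the model objects m_train and
# m_valid and the train_op attribute are hypothetical names):
def run_all_epochs(session, m_train, m_valid, train_data, valid_data,
                   num_epochs=13):
    for epoch in range(num_epochs):
        # training: run the optimizer alongside the loss
        train_ppl = run_epoch(session, m_train, train_data,
                              m_train.train_op, verbose=True)
        # evaluation: tf.no_op() does nothing, so no weights change
        valid_ppl = run_epoch(session, m_valid, valid_data, tf.no_op())
        print("Epoch %d: train ppl %.3f, valid ppl %.3f" %
              (epoch + 1, train_ppl, valid_ppl))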
def runepoch(sess, data, modeldict, fetches, epoch_no, verbose):
    lr_decay = decay**max(epoch_no - 4, 0.0)
    sess.run(tf.assign(modeldict['lr'], learning_rate * lr_decay))
    state = sess.run(modeldict['initial_state'])
    losses = 0.0
    itercnt = 0
    if verbose:
        print('Running New Epoch')
    for curr, (x, y) in enumerate(
            ptb_reader.ptb_iterator(data, flags.batchsize, flags.numsteps)):
        feed_dict = {
            modeldict['X']: x,
            modeldict['Y']: y,
            modeldict['initial_state']: state
        }
        vals = sess.run(fetches, feed_dict)
        losses += vals['loss'] * flags.numsteps
        state = vals['final_state']
        itercnt += flags.numsteps
        if curr % 100 == 0 and verbose:
            print('Curr: ', curr, ' | Perplexity: ', np.exp(losses / itercnt))
    if verbose:
        print('Epoch Complete')
    return np.exp(losses / itercnt)
import ptb_reader as pr

source = "C:\\ptb\\ptb\\data"
train_data, valid_data, test_data, word_to_id, id_to_word = pr.ptb_raw_data(
    source)
for step, (x, y) in enumerate(pr.ptb_iterator(train_data, 40, 20)):
    print("step:", step)
    print("x:", x)
    print("y:", y)
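# Every snippet here leans on ptb_reader.ptb_iterator. A minimal sketch of
# its usual contract, modelled on the classic TensorFlow PTB tutorial reader
# (not necessarily the exact ptb_reader module used above): it yields (x, y)
# pairs of shape (batch_size, num_steps), where y is x shifted one position
# to the right.
def ptb_iterator_sketch(raw_data, batch_size, num_steps):
    raw_data = np.array(raw_data, dtype=np.int32)
    batch_len = len(raw_data) // batch_size
    data = raw_data[:batch_size * batch_len].reshape(batch_size, batch_len)
    epoch_size = (batch_len - 1) // num_steps  # same formula as the snippets
    for i in range(epoch_size):
        x = data[:, i * num_steps:(i + 1) * num_steps]
        y = data[:, i * num_steps + 1:(i + 1) * num_steps + 1]
        yield x, y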
def _main(_):
    # Data
    batch_size = config.batch_size
    memory_size = config.memory_size
    terminating_learning_rate = config.terminating_learning_rate

    data = prepare_data(FLAGS.data_path)
    vocab_size = data["vocab_size"]
    print('vocab_size = {}'.format(vocab_size))

    inputs = tf.placeholder(tf.int32, [None, memory_size], name="inputs")
    targets = tf.placeholder(tf.int32, [None], name="targets")

    # Model architecture
    initializer = tf.random_normal_initializer(
        stddev=config.initialize_stddev)
    with tf.variable_scope("model", initializer=initializer):
        memnet = tx.modules.MemNetRNNLike(raw_memory_dim=vocab_size,
                                          hparams=config.memnet)
        queries = tf.fill([tf.shape(inputs)[0], config.dim],
                          config.query_constant)
        logits = memnet(inputs, queries)

    # Losses & train ops
    mle_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
                                                              logits=logits)
    mle_loss = tf.reduce_sum(mle_loss)

    # Use global_step to pass epoch, for lr decay
    lr = config.opt["optimizer"]["kwargs"]["learning_rate"]
    learning_rate = tf.placeholder(tf.float32, [], name="learning_rate")
    global_step = tf.Variable(0, dtype=tf.int32, name="global_step")
    increment_global_step = tf.assign_add(global_step, 1)
    train_op = tx.core.get_train_op(mle_loss,
                                    learning_rate=learning_rate,
                                    global_step=global_step,
                                    increment_global_step=False,
                                    hparams=config.opt)

    def _run_epoch(sess, data_iter, epoch, is_train=False):
        loss = 0.
        iters = 0
        fetches = {"mle_loss": mle_loss}
        if is_train:
            fetches["train_op"] = train_op
        mode = (tf.estimator.ModeKeys.TRAIN
                if is_train
                else tf.estimator.ModeKeys.EVAL)
        for _, (x, y) in enumerate(data_iter):
            batch_size = x.shape[0]
            feed_dict = {
                inputs: x,
                targets: y,
                learning_rate: lr,
                tx.global_mode(): mode,
            }
            rets = sess.run(fetches, feed_dict)
            loss += rets["mle_loss"]
            iters += batch_size
        ppl = np.exp(loss / iters)
        return ppl

    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        try:
            saver.restore(sess, "ckpt/model.ckpt")
            print('restored checkpoint.')
        except Exception:
            print('restore checkpoint failed.')

        last_valid_ppl = None
        heuristic_lr_decay = (hasattr(config, 'heuristic_lr_decay')
                              and config.heuristic_lr_decay)

        while True:
            if lr < terminating_learning_rate:
                break
            epoch = sess.run(global_step)
            if epoch >= config.num_epochs:
                print('Too many epochs!')
                break
            print('epoch: {} learning_rate: {:.6f}'.format(epoch, lr))

            # Train
            train_data_iter = ptb_iterator(data["train_text_id"],
                                           batch_size, memory_size)
            train_ppl = _run_epoch(sess, train_data_iter, epoch,
                                   is_train=True)
            print("Train Perplexity: {:.3f}".format(train_ppl))
            sess.run(increment_global_step)

            # checkpoint
            if epoch % 5 == 0:
                try:
                    saver.save(sess, "ckpt/model.ckpt")
                    print("saved checkpoint.")
                except Exception:
                    print("save checkpoint failed.")

            # Valid
            valid_data_iter = ptb_iterator(data["valid_text_id"],
                                           batch_size, memory_size)
            valid_ppl = _run_epoch(sess, valid_data_iter, epoch)
            print("Valid Perplexity: {:.3f}".format(valid_ppl))

            # Learning rate decay
            if last_valid_ppl:
                if heuristic_lr_decay:
                    if valid_ppl > last_valid_ppl * config.heuristic_threshold:
                        lr /= 1. + (valid_ppl / last_valid_ppl
                                    - config.heuristic_threshold) \
                            * config.heuristic_rate
                    last_valid_ppl = last_valid_ppl \
                        * (1 - config.heuristic_smooth_rate) \
                        + valid_ppl * config.heuristic_smooth_rate
                else:
                    if valid_ppl > last_valid_ppl:
                        lr /= config.learning_rate_anneal_factor
                    last_valid_ppl = valid_ppl
            else:
                last_valid_ppl = valid_ppl
            print("last_valid_ppl: {:.6f}".format(last_valid_ppl))

        epoch = sess.run(global_step)
        print('Terminate after epoch ', epoch)

        # Test
        test_data_iter = ptb_iterator(data["test_text_id"], 1, memory_size)
        test_ppl = _run_epoch(sess, test_data_iter, 0)
        print("Test Perplexity: {:.3f}".format(test_ppl))
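# The heuristic decay branch above, isolated as a pure function for clarity
# (a sketch; the parameter names mirror the config fields used above): the
# learning rate shrinks in proportion to how much validation perplexity
# regressed past the threshold, and the reference perplexity is tracked as
# an exponential moving average rather than the raw last value.
def heuristic_decay(lr, valid_ppl, last_valid_ppl,
                    threshold, rate, smooth_rate):
    if valid_ppl > last_valid_ppl * threshold:
        lr /= 1. + (valid_ppl / last_valid_ppl - threshold) * rate
    last_valid_ppl = (last_valid_ppl * (1 - smooth_rate)
                      + valid_ppl * smooth_rate)
    return lr, last_valid_ppl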
def _main(_):
    # Data
    batch_size = config.batch_size
    num_steps = config.num_steps
    data = prepare_data(FLAGS.data_path)
    vocab_size = data["vocab_size"]

    inputs = tf.placeholder(tf.int32, [batch_size, num_steps])
    targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # Model architecture
    initializer = tf.random_uniform_initializer(-config.init_scale,
                                                config.init_scale)
    with tf.variable_scope("model", initializer=initializer):
        embedder = tx.modules.WordEmbedder(vocab_size=vocab_size,
                                           hparams=config.emb)
        emb_inputs = embedder(inputs)
        if config.keep_prob < 1:
            emb_inputs = tf.nn.dropout(
                emb_inputs, tx.utils.switch_dropout(config.keep_prob))
        decoder = tx.modules.BasicRNNDecoder(
            vocab_size=vocab_size, hparams={"rnn_cell": config.cell})
        initial_state = decoder.zero_state(batch_size, tf.float32)
        outputs, final_state, seq_lengths = decoder(
            decoding_strategy="train_greedy",
            impute_finished=True,
            inputs=emb_inputs,
            sequence_length=[num_steps] * batch_size,
            initial_state=initial_state)

    # Losses & train ops
    mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy(
        labels=targets,
        logits=outputs.logits,
        sequence_length=seq_lengths)

    # Use global_step to pass epoch, for lr decay
    global_step = tf.placeholder(tf.int32)
    train_op = tx.core.get_train_op(mle_loss,
                                    global_step=global_step,
                                    increment_global_step=False,
                                    hparams=config.opt)

    def _run_epoch(sess, data_iter, epoch, is_train=False, verbose=False):
        start_time = time.time()
        loss = 0.
        iters = 0
        state = sess.run(initial_state)

        fetches = {
            "mle_loss": mle_loss,
            "final_state": final_state,
        }
        if is_train:
            fetches["train_op"] = train_op
            epoch_size = ((len(data["train_text_id"]) // batch_size - 1)
                          // num_steps)

        mode = (tf.estimator.ModeKeys.TRAIN
                if is_train
                else tf.estimator.ModeKeys.EVAL)

        for step, (x, y) in enumerate(data_iter):
            feed_dict = {
                inputs: x,
                targets: y,
                global_step: epoch,
                tx.global_mode(): mode,
            }
            for i, (c, h) in enumerate(initial_state):
                feed_dict[c] = state[i].c
                feed_dict[h] = state[i].h

            rets = sess.run(fetches, feed_dict)
            loss += rets["mle_loss"]
            state = rets["final_state"]
            iters += num_steps

            ppl = np.exp(loss / iters)
            if verbose and is_train and step % (epoch_size // 10) == 10:
                print("%.3f perplexity: %.3f speed: %.0f wps" %
                      ((step + 1) * 1.0 / epoch_size, ppl,
                       iters * batch_size / (time.time() - start_time)))

        ppl = np.exp(loss / iters)
        return ppl

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        for epoch in range(config.num_epochs):
            # Train
            train_data_iter = ptb_iterator(data["train_text_id"],
                                           config.batch_size, num_steps)
            train_ppl = _run_epoch(sess, train_data_iter, epoch,
                                   is_train=True, verbose=True)
            print("Epoch: %d Train Perplexity: %.3f" % (epoch, train_ppl))
            # Valid
            valid_data_iter = ptb_iterator(data["valid_text_id"],
                                           config.batch_size, num_steps)
            valid_ppl = _run_epoch(sess, valid_data_iter, epoch)
            print("Epoch: %d Valid Perplexity: %.3f" % (epoch, valid_ppl))
        # Test
        test_data_iter = ptb_iterator(data["test_text_id"],
                                      batch_size, num_steps)
        test_ppl = _run_epoch(sess, test_data_iter, 0)
        print("Test Perplexity: %.3f" % (test_ppl))
def _main(_):
    # Data
    tf.logging.set_verbosity(tf.logging.INFO)

    # 1. initialize Horovod
    hvd.init()

    batch_size = config.batch_size
    num_steps = config.num_steps

    data = prepare_data(FLAGS.data_path)
    vocab_size = data["vocab_size"]

    inputs = tf.placeholder(tf.int32, [None, num_steps], name='inputs')
    targets = tf.placeholder(tf.int32, [None, num_steps], name='targets')

    # Model architecture
    initializer = tf.random_uniform_initializer(-config.init_scale,
                                                config.init_scale)
    with tf.variable_scope("model", initializer=initializer):
        embedder = tx.modules.WordEmbedder(vocab_size=vocab_size,
                                           hparams=config.emb)
        emb_inputs = embedder(inputs)
        if config.keep_prob < 1:
            emb_inputs = tf.nn.dropout(
                emb_inputs, tx.utils.switch_dropout(config.keep_prob))
        decoder = tx.modules.BasicRNNDecoder(
            vocab_size=vocab_size, hparams={"rnn_cell": config.cell})

        # _batch_size equals batch_size // hvd.size() in distributed
        # training, because the mini-batch is split across multiple GPUs
        _batch_size = tf.shape(inputs)[0]
        initial_state = decoder.zero_state(_batch_size, tf.float32)
        seq_length = tf.broadcast_to([num_steps], (_batch_size,))
        outputs, final_state, seq_lengths = decoder(
            decoding_strategy="train_greedy",
            impute_finished=True,
            inputs=emb_inputs,
            sequence_length=seq_length,
            initial_state=initial_state)

    # Losses & train ops
    mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy(
        labels=targets,
        logits=outputs.logits,
        sequence_length=seq_lengths)

    # Use global_step to pass epoch, for lr decay
    global_step = tf.placeholder(tf.int32)
    opt = tx.core.get_optimizer(global_step=global_step, hparams=config.opt)

    # 2. wrap the optimizer
    opt = hvd.DistributedOptimizer(opt)

    train_op = tx.core.get_train_op(loss=mle_loss,
                                    optimizer=opt,
                                    global_step=global_step,
                                    learning_rate=None,
                                    increment_global_step=False,
                                    hparams=config.opt)

    def _run_epoch(sess, data_iter, epoch, is_train=False, verbose=False):
        start_time = time.time()
        loss = 0.
        iters = 0

        fetches = {
            "mle_loss": mle_loss,
            "final_state": final_state,
        }
        if is_train:
            fetches["train_op"] = train_op
            epoch_size = ((len(data["train_text_id"]) // batch_size - 1)
                          // num_steps)

        mode = (tf.estimator.ModeKeys.TRAIN
                if is_train
                else tf.estimator.ModeKeys.EVAL)

        for step, (x, y) in enumerate(data_iter):
            if step == 0:
                state = sess.run(initial_state, feed_dict={inputs: x})

            feed_dict = {
                inputs: x,
                targets: y,
                global_step: epoch,
                tx.global_mode(): mode,
            }
            for i, (c, h) in enumerate(initial_state):
                feed_dict[c] = state[i].c
                feed_dict[h] = state[i].h

            rets = sess.run(fetches, feed_dict)
            loss += rets["mle_loss"]
            state = rets["final_state"]
            iters += num_steps

            ppl = np.exp(loss / iters)
            if verbose and is_train and hvd.rank() == 0 \
                    and (step + 1) % (epoch_size // 10) == 0:
                tf.logging.info(
                    "%.3f perplexity: %.3f speed: %.0f wps" %
                    ((step + 1) * 1.0 / epoch_size, ppl,
                     iters * batch_size / (time.time() - start_time)))

        _elapsed_time = time.time() - start_time
        tf.logging.info("epoch time elapsed: %f" % _elapsed_time)
        ppl = np.exp(loss / iters)
        return ppl, _elapsed_time

    # 3. broadcast global variables from the rank-0 process
    bcast = hvd.broadcast_global_variables(0)

    # 4. set visible GPU
    session_config = tf.ConfigProto()
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        # 5. run the broadcast_global_variables node before training
        bcast.run()

        _times = []
        for epoch in range(config.num_epochs):
            # Train
            train_data_iter = ptb_iterator(data["train_text_id"],
                                           config.batch_size, num_steps,
                                           is_train=True)
            train_ppl, train_time = _run_epoch(sess, train_data_iter, epoch,
                                               is_train=True, verbose=True)
            _times.append(train_time)
            tf.logging.info("Epoch: %d Train Perplexity: %.3f" %
                            (epoch, train_ppl))
            # Valid in the main process
            if hvd.rank() == 0:
                valid_data_iter = ptb_iterator(data["valid_text_id"],
                                               config.batch_size, num_steps)
                valid_ppl, _ = _run_epoch(sess, valid_data_iter, epoch)
                tf.logging.info("Epoch: %d Valid Perplexity: %.3f" %
                                (epoch, valid_ppl))

        tf.logging.info('train times: %s' % _times)
        tf.logging.info('average train time/epoch %f' %
                        np.mean(np.array(_times)))

        # Test in the main process
        if hvd.rank() == 0:
            test_data_iter = ptb_iterator(data["test_text_id"],
                                          batch_size, num_steps)
            test_ppl, _ = _run_epoch(sess, test_data_iter, 0)
            tf.logging.info("Test Perplexity: %.3f" % test_ppl)