def evaluate():
  # Read vocabulary.
  vocab, rev_vocab = _load_vocabulary(FLAGS.vocab_fname)
  with tf.Graph().as_default() as g:
    # Enqueue data for evaluation.
    num_examples_per_epoch, tower_img_embedding, tower_context_length, \
        tower_caption_length, tower_context_id, tower_caption_id, \
        tower_answer_id, tower_context_mask, \
        tower_caption_mask = enqueue(True)
    tower_argmax = []
    # Build the model for each tower.
    with tf.variable_scope(tf.get_variable_scope()) as scope:
      for i in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
          with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
            inputs = [
                tower_img_embedding[i],
                tower_context_length[i],
                tower_caption_length[i],
                tower_context_id[i],
                tower_caption_id[i],
                tower_answer_id[i],
                tower_context_mask[i],
                tower_caption_mask[i]
            ]
            net = CSMN(inputs, ModelConfig(FLAGS, True), is_training=False)
            argmax = net.argmax
            # Reuse variables for the next tower.
            tf.get_variable_scope().reuse_variables()
            # Keep track of the argmax predictions across all towers.
            tower_argmax.append(argmax)

    argmaxs = tf.concat(tower_argmax, 0)
    answer_ids = tf.concat(tower_answer_id, 0)
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)

    # Don't evaluate again for the same checkpoint.
    b_g_s = "0"
    while True:
      c_g_s = _eval_once(
          saver, summary_writer, argmaxs, answer_ids, vocab, rev_vocab,
          num_examples_per_epoch, b_g_s
      )
      b_g_s = c_g_s
      if FLAGS.run_once:
        break
      time.sleep(FLAGS.eval_interval_secs)
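
# A hypothetical helper (not part of the original file) sketching how the
# concatenated `argmaxs` and `answer_ids` tensors built in evaluate() could be
# reduced to a single accuracy value. The actual comparison and bookkeeping
# happen inside _eval_once(), whose implementation is not shown in this section.
def _accuracy_op_sketch(argmaxs, answer_ids):
  """Fraction of predictions that match the ground-truth answer ids."""
  correct = tf.equal(tf.cast(argmaxs, tf.int64), tf.cast(answer_ids, tf.int64))
  return tf.reduce_mean(tf.cast(correct, tf.float32))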
def train():
  print('training...')
  colorlog.basicConfig(
      filename=None,
      level=logging.INFO,
      format="%(log_color)s[%(levelname)s:%(asctime)s]%(reset)s %(message)s",
      datefmt="%Y-%m-%d %H:%M:%S")
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95)
  gpu_options.allow_growth = True
  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False,
                                        gpu_options=gpu_options)) as sess:
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)
    num_examples_per_epoch, tower_img_embedding, tower_context_length, \
        tower_caption_length, tower_context_id, tower_caption_id, \
        tower_answer_id, tower_context_mask, \
        tower_caption_mask = enqueue(False)

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (num_examples_per_epoch /
                             FLAGS.batch_size / FLAGS.num_gpus)
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(FLAGS.init_lr,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)

    # Create an optimizer that performs gradient descent.
    opt = tf.train.AdamOptimizer(lr)

    # Calculate the gradients for each model tower.
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope()) as scope:
      for i in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
          with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
            # Calculate the loss for one tower of the CSMN model. This
            # constructs the entire model but shares the variables across
            # all towers.
            inputs = [
                tower_img_embedding[i],
                tower_context_length[i],
                tower_caption_length[i],
                tower_context_id[i],
                tower_caption_id[i],
                tower_answer_id[i],
                tower_context_mask[i],
                tower_caption_mask[i],
            ]
            loss = _tower_loss(inputs, scope)

            # Reuse variables for the next tower.
            tf.get_variable_scope().reuse_variables()

            # Retain the summaries from the final tower.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

            # Calculate the gradients for the batch of data on this tower.
            # Returns a list of (gradient, variable) pairs; the variable is
            # always present, but the gradient can be None.
            grads = opt.compute_gradients(loss)

            # Keep track of the gradients across all towers.
            tower_grads.append(grads)

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = _average_gradients(tower_grads)

    # Add a summary to track the learning rate.
    summaries.append(tf.summary.scalar('learning_rate', lr))

    # Clip gradients by norm to avoid exploding updates.
    clipped_grads_and_vars = [(tf.clip_by_norm(gv[0], FLAGS.max_grad_norm),
                               gv[1]) for gv in grads]

    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(clipped_grads_and_vars,
                                            global_step=global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

    # Build the summary operation from the last tower summaries.
    summary_op = tf.summary.merge(summaries)

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()
    print('initing...')
    sess.run(init)

    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    if ckpt and ckpt.model_checkpoint_path:
      # Restore from checkpoint.
      print('Restoring from checkpoint...')
      saver.restore(sess, ckpt.model_checkpoint_path)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([apply_gradient_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      # Current value of the (possibly restored) global step, used for
      # logging, summaries, and checkpoint naming below.
      c_g_step = int(global_step.eval(session=sess))

      if (step + 1) % 10 == 0:
        num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = duration / FLAGS.num_gpus

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), c_g_step, loss_value,
                            examples_per_sec, sec_per_batch))

      if (step + 1) % 25 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, c_g_step)

      # Save the model checkpoint periodically.
      if (step + 1) % 500 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=c_g_step)
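
# A minimal sketch of the tower-gradient averaging that train() relies on,
# following the standard TensorFlow multi-tower pattern. The repo's actual
# _average_gradients() is defined elsewhere and may differ; this version is
# only illustrative and uses a distinct name to avoid shadowing it.
def _average_gradients_sketch(tower_grads):
  """Averages (gradient, variable) pairs across towers.

  Args:
    tower_grads: list over towers, each a list of (gradient, variable) pairs
      as returned by opt.compute_gradients().
  Returns:
    A single list of (gradient, variable) pairs averaged over all towers.
  """
  average_grads = []
  for grad_and_vars in zip(*tower_grads):
    # grad_and_vars looks like ((grad0_gpu0, var0), ..., (grad0_gpuN, var0)).
    grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
    if not grads:
      continue
    # Average the per-tower gradients along the stacked tower dimension.
    grad = tf.reduce_mean(tf.concat(grads, 0), 0)
    # Variables are shared across towers, so the first tower's reference suffices.
    average_grads.append((grad, grad_and_vars[0][1]))
  return average_grads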