def resnet_main(flags, model_function):
    """
    The function for training the network using the given model function.

    :param flags: the arguments given from the command line
    :param model_function: the model to run
    """
    # replicate the model function for multiple gpus
    if flags.multi_gpu:
        validate_batch_size_multi_gpu(flags.batch_size)
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function, loss_reduction=tf.losses.Reduction.MEAN)

    # initialize the estimator
    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags.model_dir,
        params={
            'resnet_size': flags.resnet_size,
            'data_format': flags.data_format,
            'multi_gpu': flags.multi_gpu
        })

    print("Starting a training cycle")
    # train the network
    classifier.train(input_fn=lambda: input_data.get_input_data(
        flags.batch_size, flags.shuffle_buffer, True,
        input_data.parse_record_fn, flags.train_epochs))

    print("Starting evaluation")
    # evaluate the network
    eval_results = classifier.evaluate(
        input_fn=lambda: input_data.get_input_data(
            flags.batch_size, flags.shuffle_buffer, False,
            input_data.parse_record_fn, flags.epochs_per_eval))
    print(eval_results)
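# A minimal, hypothetical driver for resnet_main, assuming a model_fn such as
# `cifar10_model_fn` is defined elsewhere in the repo. The argparse flag names
# simply mirror the attributes resnet_main reads from `flags`; the defaults are
# placeholders, not values taken from the original code.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', default='/tmp/resnet_model')
    parser.add_argument('--resnet_size', type=int, default=32)
    parser.add_argument('--data_format', default='channels_last')
    parser.add_argument('--multi_gpu', action='store_true')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--shuffle_buffer', type=int, default=10000)
    parser.add_argument('--train_epochs', type=int, default=10)
    parser.add_argument('--epochs_per_eval', type=int, default=1)
    flags = parser.parse_args()
    resnet_main(flags, cifar10_model_fn)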
def run_training(restore_chkpt=None):
    global net
    net = imp.load_source('net', FLAGS.net_module)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Boolean variable that switches the towers between training and
        # validation inputs.
        train_phase = tf.Variable(True, trainable=False, name='train_phase',
                                  dtype=tf.bool, collections=[])
        inp_data = input_data.get_input_data(FLAGS)

        t_image, t_label = (inp_data['train']['image_input'],
                            inp_data['train']['label_input'])
        t_image = net.aug_train(t_image, inp_data['aux'])

        v_image, v_label = (inp_data['validation']['image_input'],
                            inp_data['validation']['label_input'])
        v_image = net.aug_eval(v_image, inp_data['aux'])
        v_images, v_labels = batch(v_image, v_label,
                                   FLAGS.batch_size * FLAGS.num_gpus,
                                   'eval_batch')
        v_images_split = tf.split(v_images, FLAGS.num_gpus)
        v_labels_split = tf.split(v_labels, FLAGS.num_gpus)

        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        epoch_steps = inp_data['train']['images'].shape[0] / FLAGS.batch_size
        decay_steps = int(net.opts['num_epochs_per_decay'] * epoch_steps)
        lr = tf.train.exponential_decay(net.opts['initial_learning_rate'],
                                        global_step, decay_steps,
                                        net.opts['learning_rate_decay_factor'],
                                        staircase=True)
        opt = tf.train.MomentumOptimizer(lr, 0.9)

        # Build one tower (loss + evaluation) per GPU.
        tower_grads = []
        tower_evals = []
        tower_losses = []
        cpu_variables = FLAGS.num_gpus > 1
        for i in range(FLAGS.num_gpus):
            reuse = i > 0
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('tower_%d' % i) as scope:
                    t_images, t_labels = batch(t_image, t_label,
                                               FLAGS.batch_size, 'train_batch')
                    images, labels = tf.cond(
                        train_phase,
                        lambda: (t_images, t_labels),
                        lambda: (v_images_split[i], v_labels_split[i]))
                    loss, evaluation = tower_loss_and_eval(
                        images, labels, train_phase, reuse, cpu_variables)
                    tower_losses.append(loss)
                    tower_evals.append(evaluation)
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
                    grads = opt.compute_gradients(loss)
                    tower_grads.append(grads)

        # Average the gradients over all towers (synchronization point).
        grads = average_gradients(tower_grads)

        summaries.append(tf.summary.scalar('learning_rate', lr))
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram('gradients/' + var.op.name, grad))

        apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)
        with tf.control_dependencies([apply_gradients_op]):
            # apply_gradients() increments global_step by one; add num_gpus - 1
            # so that global_step counts one step per processed batch.
            normalize_gs = global_step.assign_add(FLAGS.num_gpus - 1)

        for var in tf.trainable_variables():
            summaries.append(
                tf.summary.histogram('variables/' + var.op.name, var))

        # Exponentially smoothed training loss / precision, shown in TensorBoard.
        train_loss = tf.Variable(5.0, trainable=False, name='train_loss',
                                 dtype=tf.float32)
        train_precision = tf.Variable(0.0, trainable=False,
                                      name='train_precision', dtype=tf.float32)
        train_lp_decay = 0.9
        train_lp_updates = []
        for i in range(FLAGS.num_gpus):
            train_lp_updates.append(
                train_loss.assign_sub(
                    (1.0 - train_lp_decay) * (train_loss - tower_losses[i])))
            new_precision = tf.reduce_mean(tf.cast(tower_evals[i], tf.float32))
            train_lp_updates.append(
                train_precision.assign_sub(
                    (1.0 - train_lp_decay) * (train_precision - new_precision)))
        train_lp_update = tf.group(*train_lp_updates)
        summaries.append(tf.summary.scalar('loss/train', train_loss))
        summaries.append(tf.summary.scalar('precision/train', train_precision))

        # Validation metrics are computed in Python and fed back via a placeholder.
        validation_loss = tf.Variable(0.0, trainable=False, dtype=tf.float32)
        validation_precision = tf.Variable(0.0, trainable=False, dtype=tf.float32)
        assign_ph = tf.placeholder(tf.float32, shape=[])
        vl_assign_op = validation_loss.assign(assign_ph)
        vp_assign_op = validation_precision.assign(assign_ph)
        summaries.append(tf.summary.scalar('loss/validation', validation_loss))
        summaries.append(
            tf.summary.scalar('precision/validation', validation_precision))

        # Keep an exponential moving average of the trainable variables; the EMA
        # weights are the ones used during evaluation.
        variable_averages = tf.train.ExponentialMovingAverage(0.999, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

        train_op = tf.group(apply_gradients_op, normalize_gs,
                            variables_averages_op, train_lp_update)

        qrunners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        for qr in qrunners:
            summaries.append(
                tf.summary.scalar('queues/size/' + qr.name, qr.queue.size()))

        saver = tf.train.Saver(tf.all_variables())
        ema_saver = tf.train.Saver(variable_averages.variables_to_restore())
        summary_op = tf.summary.merge(summaries)
        init = tf.initialize_all_variables()

        switch_train = train_phase.assign(True)
        switch_eval = train_phase.assign(False)

        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        # Initialize constant variables with the dataset.
        sess.run(inp_data['initializer'], feed_dict=inp_data['init_feed'])
        sess.run(init)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        # Restore the convolutional and batch-norm variables from a pretrained
        # checkpoint, mapping every tower onto the tower_0 names.
        all_vars = tf.all_variables()
        pretrained_re = ['.*conv.*'] + ['.*bn%d.*' % x for x in range(1, 4)]
        is_pretrained = lambda x: np.any(
            [re.search(pattern, x.name) is not None for pattern in pretrained_re])
        pretrained_vars = list(filter(is_pretrained, all_vars))
        pretrained_dict = {}
        for x in pretrained_vars:
            nm = re.sub(r'tower_\d*', 'tower_0', x.op.name)
            pretrained_dict[nm] = x
        print('Using pretrained variables')
        print('\n'.join(['%s --> %s' % (v[0], v[1].name)
                         for v in pretrained_dict.items()]))
        pretrained_saver = tf.train.Saver(pretrained_dict)
        pretrained_saver.restore(sess, FLAGS.pretrained_ckpt)

        sys.stdout.write('\n\n')
        epoch_steps = int(inp_data['train']['images'].shape[0] /
                          FLAGS.batch_size + 0.5)
        start_epoch = 0
        if restore_chkpt is not None:
            saver.restore(sess, restore_chkpt)
            sys.stdout.write(
                'Previously started training session restored from "%s".\n'
                % restore_chkpt)
            start_epoch = int(sess.run(global_step)) // epoch_steps
        sys.stdout.write('Starting with epoch #%d.\n' % (start_epoch + 1))

        for epoch in range(start_epoch, FLAGS.max_epochs):
            # Training phase.
            sys.stdout.write('\n')
            _ = sess.run(switch_train)
            sys.stdout.write('Epoch #%d. [Train]\n' % (epoch + 1))
            sys.stdout.flush()
            cum_t = 0.0
            step = 0
            log_steps = FLAGS.log_steps
            fmt_str = ('Epoch #%d [%s]. Step %d/%d (%d%%). '
                       'Speed = %.2f sec/b, %.2f img/sec. '
                       'Batch_loss = %.2f. Batch_precision = %.2f')
            while step < epoch_steps:
                start_time = time.time()
                _, loss_value, eval_value = sess.run([train_op, loss, evaluation])
                duration = time.time() - start_time
                step += FLAGS.num_gpus
                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
                cum_t += duration
                sec_per_batch = duration / FLAGS.num_gpus
                img_per_sec = FLAGS.num_gpus * FLAGS.batch_size / duration
                if cum_t > 2.0:
                    cum_t = 0.0
                    sys.stdout.write('\r')
                    sys.stdout.write(fmt_str % (
                        epoch + 1, 'Train', step + 1, epoch_steps,
                        int(100.0 * (step + 1) / epoch_steps),
                        sec_per_batch, img_per_sec, loss_value,
                        np.mean(eval_value) * 100.0))
                    sys.stdout.flush()
                log_steps -= FLAGS.num_gpus
                if log_steps < 0:
                    log_steps = FLAGS.log_steps
                    summary_str = sess.run(summary_op)
                    glob_step = epoch * epoch_steps + step
                    summary_writer.add_summary(summary_str, glob_step)
            sys.stdout.write('\r')
            sys.stdout.write(fmt_str % (
                epoch + 1, 'Train', epoch_steps, epoch_steps, 100,
                sec_per_batch, img_per_sec, loss_value,
                np.mean(eval_value) * 100.0))
            sys.stdout.write('\n')

            train_loss_val, train_precision_val = sess.run(
                [train_loss, train_precision])
            sys.stdout.write(
                'Epoch #%d. Train loss = %.2f. Train precision = %.2f.\n'
                % (epoch + 1, train_loss_val, train_precision_val * 100.0))

            checkpoint_path = os.path.join(FLAGS.log_dir, 'model.ckpt')
            chkpt = saver.save(sess, checkpoint_path, global_step=global_step)
            sys.stdout.write('Checkpoint "%s" saved.\n\n' % chkpt)

            # Evaluation phase: switch the towers to validation data and load
            # the EMA weights from the checkpoint that was just written.
            sess.run(switch_eval)
            sys.stdout.write('Epoch #%d. [Evaluation]\n' % (epoch + 1))
            ema_saver.restore(sess, chkpt)
            sys.stdout.write('EMA variables restored.\n')
            eval_cnt = inp_data['validation']['images'].shape[0]
            eval_steps = (eval_cnt + FLAGS.batch_size - 1) // FLAGS.batch_size
            eval_correct = 0
            eval_loss = 0.0
            cum_t = 0.0
            while eval_cnt > 0:
                start_time = time.time()
                eval_values_and_losses = sess.run(tower_evals + tower_losses)
                duration = time.time() - start_time
                eval_values = eval_values_and_losses[:FLAGS.num_gpus]
                eval_values = np.concatenate(eval_values, axis=0)
                eval_losses = eval_values_and_losses[-FLAGS.num_gpus:]
                cnt = min(eval_values.shape[0], eval_cnt)
                eval_correct += np.sum(eval_values[:cnt])
                eval_loss += np.sum(eval_losses) * FLAGS.batch_size
                eval_cnt -= cnt
                cur_step = (eval_steps -
                            (eval_cnt + FLAGS.batch_size - 1) // FLAGS.batch_size)
                sec_per_batch = duration / FLAGS.num_gpus
                img_per_sec = FLAGS.num_gpus * FLAGS.batch_size / duration
                cum_t += duration
                if cum_t > 0.5:
                    cum_t = 0.0
                    sys.stdout.write('\r')
                    sys.stdout.write(fmt_str % (
                        epoch + 1, 'Evaluation', cur_step, eval_steps,
                        int(100.0 * cur_step / eval_steps),
                        sec_per_batch, img_per_sec, eval_losses[-1],
                        np.mean(eval_values) * 100.0))
                    sys.stdout.flush()
            sys.stdout.write('\r')
            sys.stdout.write(fmt_str % (
                epoch + 1, 'Evaluation', eval_steps, eval_steps, int(100.0),
                sec_per_batch, img_per_sec, eval_losses[-1],
                np.mean(eval_values) * 100.0))
            sys.stdout.write('\n')
            sys.stdout.flush()

            eval_precision = eval_correct / inp_data['validation']['images'].shape[0]
            eval_loss = eval_loss / inp_data['validation']['images'].shape[0]
            sys.stdout.write(
                'Epoch #%d. Validation loss = %.2f. Validation precision = %.2f.\n'
                % (epoch + 1, eval_loss, eval_precision * 100.0))

            # Restore the non-averaged weights before training continues.
            saver.restore(sess, chkpt)
            sys.stdout.write('Variables restored.\n\n')
            sess.run(vl_assign_op, feed_dict={assign_ph: eval_loss})
            sess.run(vp_assign_op, feed_dict={assign_ph: eval_precision})

            if sys.version_info[0] < 3:
                w = 80
            else:
                w = os.get_terminal_size().columns
            sys.stdout.write(('=' * w + '\n') * 2)

        coord.request_stop()
        coord.join(threads)
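# run_training depends on helpers defined elsewhere in the repo (`batch`,
# `tower_loss_and_eval`, `average_gradients`). For reference, gradient averaging
# across towers usually follows the standard TensorFlow multi-tower pattern; the
# sketch below is a minimal version under that assumption and is not necessarily
# the repo's exact implementation.
def average_gradients_sketch(tower_grads):
    """Average the (gradient, variable) pairs produced by each GPU tower."""
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars holds one (grad, var) pair per tower for a single variable.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
        if not grads:
            average_grads.append((None, grad_and_vars[0][1]))
            continue
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        # Variables are shared across towers, so the first tower's handle suffices.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads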
def run_training(restore_chkpt=None):
    global net
    net = imp.load_source('net', FLAGS.net_module)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Boolean variable that switches the towers between training and
        # validation inputs.
        train_phase = tf.Variable(True, trainable=False, name='train_phase',
                                  dtype=tf.bool, collections=[])
        inp_data = input_data.get_input_data(FLAGS)

        t_image, t_label = (inp_data['train']['image_input'],
                            inp_data['train']['label_input'])
        t_image = input_data.aug_train(t_image, inp_data['aux'])

        v_image, v_label = (inp_data['validation']['image_input'],
                            inp_data['validation']['label_input'])
        v_image = input_data.aug_eval(v_image, inp_data['aux'])
        v_images, v_labels = batch(v_image, v_label,
                                   FLAGS.batch_size * FLAGS.num_gpus,
                                   'eval_batch')
        v_images_split = tf.split(v_images, FLAGS.num_gpus)
        v_labels_split = tf.split(v_labels, FLAGS.num_gpus)

        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        epoch_steps = inp_data['train']['images'].shape[0] / FLAGS.batch_size
        decay_steps = int(FLAGS.num_epochs_per_decay * epoch_steps)
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step, decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)
        # boundaries = [int(epoch_steps * epoch) for epoch in learning_rate_decay_boundary]
        # values = [FLAGS.initial_learning_rate * decay for decay in learning_rate_decay_value]
        # lr = tf.train.piecewise_constant(tf.cast(global_step, tf.int32), boundaries, values)
        opt = tf.train.MomentumOptimizer(learning_rate=lr,
                                         momentum=FLAGS.MOMENTUM,
                                         name='optimizer',
                                         use_nesterov=True)
        # opt = tf.train.AdamOptimizer(lr, name='optimizer')

        # Build one tower (loss + evaluation) per GPU.
        tower_grads = []
        tower_evals = []
        tower_losses = []
        cpu_variables = FLAGS.num_gpus > 1
        for i in range(FLAGS.num_gpus):
            reuse = i > 0
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('tower_%d' % i) as scope:
                    t_images, t_labels = batch(t_image, t_label,
                                               FLAGS.batch_size, 'train_batch')
                    images, labels = tf.cond(
                        train_phase,
                        lambda: (t_images, t_labels),
                        lambda: (v_images_split[i], v_labels_split[i]))
                    loss, evaluation = tower_loss_and_eval(
                        images, labels, train_phase, reuse, cpu_variables)
                    tower_losses.append(loss)
                    tower_evals.append(evaluation)
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
                    grads = opt.compute_gradients(loss)
                    tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        summaries.append(tf.summary.scalar('learning_rate', lr))
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram('gradients/' + var.op.name, grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)
        with tf.control_dependencies([apply_gradients_op]):
            # apply_gradients() increments global_step by one; add num_gpus - 1
            # so that global_step counts one step per processed batch.
            normalize_gs = global_step.assign_add(FLAGS.num_gpus - 1)

        for var in tf.trainable_variables():
            summaries.append(
                tf.summary.histogram('variables/' + var.op.name, var))

        # Exponentially smoothed training loss / precision, shown in TensorBoard.
        train_loss = tf.Variable(5.0, trainable=False, name='train_loss',
                                 dtype=tf.float32)
        train_precision = tf.Variable(0.0, trainable=False,
                                      name='train_precision', dtype=tf.float32)
        train_lp_decay = 0.9
        train_lp_updates = []
        for i in range(FLAGS.num_gpus):
            train_lp_updates.append(
                train_loss.assign_sub(
                    (1.0 - train_lp_decay) * (train_loss - tower_losses[i])))
            new_precision = tf.reduce_mean(tf.cast(tower_evals[i], tf.float32))
            train_lp_updates.append(
                train_precision.assign_sub(
                    (1.0 - train_lp_decay) * (train_precision - new_precision)))
        train_lp_update = tf.group(*train_lp_updates)
        summaries.append(tf.summary.scalar('loss/train', train_loss))
        summaries.append(tf.summary.scalar('precision/train', train_precision))

        # Validation metrics are computed in Python and fed back via a placeholder.
        validation_loss = tf.Variable(0.0, trainable=False, dtype=tf.float32)
        validation_precision = tf.Variable(0.0, trainable=False, dtype=tf.float32)
        assign_ph = tf.placeholder(tf.float32, shape=[])
        vl_assign_op = validation_loss.assign(assign_ph)
        vp_assign_op = validation_precision.assign(assign_ph)
        summaries.append(tf.summary.scalar('loss/validation', validation_loss))
        summaries.append(
            tf.summary.scalar('precision/validation', validation_precision))

        # Keep an exponential moving average of the trainable variables; the EMA
        # weights are the ones used during evaluation.
        variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step,
                                                              zero_debias=True)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

        train_op = tf.group(apply_gradients_op, normalize_gs,
                            variables_averages_op, train_lp_update)

        qrunners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        for qr in qrunners:
            summaries.append(
                tf.summary.scalar('queues/size/' + qr.name, qr.queue.size()))

        saver = tf.train.Saver(tf.global_variables())
        ema_saver = tf.train.Saver(variable_averages.variables_to_restore())
        summary_op = tf.summary.merge(summaries)
        init = tf.global_variables_initializer()

        switch_train = train_phase.assign(True)
        switch_eval = train_phase.assign(False)

        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        # Initialize constant variables with the dataset.
        sess.run(inp_data['initializer'], feed_dict=inp_data['init_feed'])
        sess.run(init)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        sys.stdout.write('\n\n')
        epoch_steps = int(inp_data['train']['images'].shape[0] /
                          FLAGS.batch_size + 0.5)
        start_epoch = 0
        if restore_chkpt is not None:
            saver.restore(sess, restore_chkpt)
            sys.stdout.write(
                'Previously started training session restored from "%s".\n'
                % restore_chkpt)
            start_epoch = int(sess.run(global_step)) // epoch_steps
        print_hyper_parameters()
        sys.stdout.write('Starting with epoch #%d.\n' % (start_epoch + 1))

        bestValidationPrecision = 0.0
        for epoch in range(start_epoch, FLAGS.max_epochs):
            # Training phase.
            sys.stdout.write('\n')
            _ = sess.run(switch_train)
            # Read the current (decayed) learning rate from the optimizer's
            # internal tensor.
            lr_val = sess.run(opt._learning_rate)
            sys.stdout.write('Epoch #%d. [Train], learning rate: %.2e\n'
                             % (epoch + 1, lr_val))
            sys.stdout.flush()
            cum_t = 0.0
            step = 0
            log_steps = FLAGS.log_steps
            fmt_str = ('Epoch #%d [%s]. Step %d/%d (%d%%). '
                       'Speed = %.2f sec/b, %.2f img/sec. '
                       'Batch_loss = %.3f. Batch_precision = %.3f')
            while step < epoch_steps:
                start_time = time.time()
                _, loss_value, eval_value = sess.run([train_op, loss, evaluation])
                duration = time.time() - start_time
                step += FLAGS.num_gpus
                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
                cum_t += duration
                sec_per_batch = duration / FLAGS.num_gpus
                img_per_sec = FLAGS.num_gpus * FLAGS.batch_size / duration
                if cum_t > 2.0:
                    cum_t = 0.0
                    sys.stdout.write('\r')
                    sys.stdout.write(fmt_str % (
                        epoch + 1, 'Train', step + 1, epoch_steps,
                        int(100.0 * (step + 1) / epoch_steps),
                        sec_per_batch, img_per_sec, loss_value,
                        np.mean(eval_value) * 100.0))
                    sys.stdout.flush()
                log_steps -= FLAGS.num_gpus
                if log_steps < 0:
                    log_steps = FLAGS.log_steps
                    summary_str = sess.run(summary_op)
                    glob_step = epoch * epoch_steps + step
                    summary_writer.add_summary(summary_str, glob_step)
            sys.stdout.write('\r')
            sys.stdout.write(fmt_str % (
                epoch + 1, 'Train', epoch_steps, epoch_steps, 100,
                sec_per_batch, img_per_sec, loss_value,
                np.mean(eval_value) * 100.0))
            sys.stdout.write('\n')

            train_loss_val, train_precision_val = sess.run(
                [train_loss, train_precision])
            sys.stdout.write(
                'Epoch #%d. Train loss = %.3f. Train precision = %.3f.\n'
                % (epoch + 1, train_loss_val, train_precision_val * 100.0))

            checkpoint_path = os.path.join(FLAGS.log_dir, 'model.ckpt')
            chkpt = saver.save(sess, checkpoint_path, global_step=global_step)
            sys.stdout.write('Checkpoint "%s" saved.\n\n' % chkpt)

            # Evaluation phase: switch the towers to validation data and load
            # the EMA weights from the checkpoint that was just written.
            sess.run(switch_eval)
            sys.stdout.write('Epoch #%d. [Evaluation]\n' % (epoch + 1))
            ema_saver.restore(sess, chkpt)
            sys.stdout.write('EMA variables restored.\n')
            eval_cnt = inp_data['validation']['images'].shape[0]
            eval_steps = (eval_cnt + FLAGS.batch_size - 1) // FLAGS.batch_size
            eval_correct = 0
            eval_loss = 0.0
            cum_t = 0.0
            while eval_cnt > 0:
                start_time = time.time()
                eval_values_and_losses = sess.run(tower_evals + tower_losses)
                duration = time.time() - start_time
                eval_values = eval_values_and_losses[:FLAGS.num_gpus]
                eval_values = np.concatenate(eval_values, axis=0)
                eval_losses = eval_values_and_losses[-FLAGS.num_gpus:]
                cnt = min(eval_values.shape[0], eval_cnt)
                eval_correct += np.sum(eval_values[:cnt])
                eval_loss += np.sum(eval_losses) * FLAGS.batch_size
                eval_cnt -= cnt
                cur_step = (eval_steps -
                            (eval_cnt + FLAGS.batch_size - 1) // FLAGS.batch_size)
                sec_per_batch = duration / FLAGS.num_gpus
                img_per_sec = FLAGS.num_gpus * FLAGS.batch_size / duration
                cum_t += duration
                if cum_t > 0.5:
                    cum_t = 0.0
                    sys.stdout.write('\r')
                    sys.stdout.write(fmt_str % (
                        epoch + 1, 'Evaluation', cur_step, eval_steps,
                        int(100.0 * cur_step / eval_steps),
                        sec_per_batch, img_per_sec, eval_losses[-1],
                        np.mean(eval_values) * 100.0))
                    sys.stdout.flush()
            sys.stdout.write('\r')
            sys.stdout.write(fmt_str % (
                epoch + 1, 'Evaluation', eval_steps, eval_steps, int(100.0),
                sec_per_batch, img_per_sec, eval_losses[-1],
                np.mean(eval_values) * 100.0))
            sys.stdout.write('\n')
            sys.stdout.flush()

            eval_precision = eval_correct / inp_data['validation']['images'].shape[0]
            eval_loss = eval_loss / inp_data['validation']['images'].shape[0]
            if eval_precision > bestValidationPrecision:
                bestValidationPrecision = eval_precision
            sys.stdout.write(
                'Epoch #%d. Validation loss = %.3f. Validation precision = %.3f. '
                'Best precision = %.3f\n'
                % (epoch + 1, eval_loss, eval_precision * 100.0,
                   bestValidationPrecision * 100))

            # Restore the non-averaged weights before training continues.
            saver.restore(sess, chkpt)
            sys.stdout.write('Variables restored.\n\n')
            sess.run(vl_assign_op, feed_dict={assign_ph: eval_loss})
            sess.run(vp_assign_op, feed_dict={assign_ph: eval_precision})

            # w = os.get_terminal_size().columns
            w = 40
            sys.stdout.write(('=' * w + '\n') * 2)

            # Record the best validation precision reached so far.
            bestFile = open(os.path.join(FLAGS.log_dir, 'best.txt'), 'w')
            bestFile.write('Best precision = %.4f\n' % bestValidationPrecision)
            bestFile.close()

        coord.request_stop()
        coord.join(threads)
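# The function above reads its configuration from a module-level FLAGS object.
# A hypothetical launcher using tf.app.flags could look like the following; the
# flag names mirror the attributes accessed in run_training, while the defaults
# are placeholders and are not taken from the original repo.
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('net_module', 'net.py', 'Python module defining the network.')
tf.app.flags.DEFINE_string('log_dir', '/tmp/train_log', 'Directory for checkpoints and summaries.')
tf.app.flags.DEFINE_integer('batch_size', 128, 'Per-GPU batch size.')
tf.app.flags.DEFINE_integer('num_gpus', 1, 'Number of GPU towers.')
tf.app.flags.DEFINE_integer('max_epochs', 100, 'Number of training epochs.')
tf.app.flags.DEFINE_integer('log_steps', 100, 'Steps between summary writes.')
tf.app.flags.DEFINE_float('initial_learning_rate', 0.1, 'Initial learning rate.')
tf.app.flags.DEFINE_float('learning_rate_decay_factor', 0.1, 'Multiplicative LR decay factor.')
tf.app.flags.DEFINE_float('num_epochs_per_decay', 30.0, 'Epochs between LR decays.')
tf.app.flags.DEFINE_float('MOMENTUM', 0.9, 'Momentum for the optimizer.')
tf.app.flags.DEFINE_boolean('log_device_placement', False, 'Log op placement.')


def main(_):
    run_training()


if __name__ == '__main__':
    tf.app.run()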