def loss_and_accuracy_per_gpu(phase_train, scope='gpu_i'):
    # train/test inputs
    train_image_batch, train_label_batch = m.make_train_batch(
        FLAGS.train_tf_path, FLAGS.train_batch_size)
    val_image_batch, val_label_batch = m.make_validation_batch(
        FLAGS.val_tf_path, FLAGS.val_batch_size)
    image_batch, label_batch = control_flow_ops.cond(
        phase_train,
        lambda: (train_image_batch, train_label_batch),
        lambda: (val_image_batch, val_label_batch))
    # model outputs
    logits = m.residual_net(image_batch, FLAGS.residual_net_n, 10, phase_train)
    # total loss: m.loss adds this tower's losses to the 'losses' collection;
    # sum only the ones registered under this tower's scope
    m.loss(logits, label_batch)
    loss = tf.add_n(tf.get_collection('losses', scope), name='total_loss')
    accuracy = m.accuracy(logits, label_batch)
    tf.scalar_summary('train_loss/' + scope, loss)
    tf.scalar_summary('train_accuracy/' + scope, accuracy)
    return loss, accuracy, logits
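# The per-GPU helper above is written for a multi-tower setup: each tower
# builds the model under its own name scope and sums only the losses
# collected under that scope. A minimal sketch of the intended call site,
# assuming two GPUs and shared variables (the tower loop below is
# illustrative, not part of this file; gradient averaging across towers
# is omitted):
def build_towers(phase_train, n_gpus=2):
    tower_losses = []
    for i in xrange(n_gpus):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('gpu_%d' % i) as scope:
                loss, accuracy, logits = loss_and_accuracy_per_gpu(
                    phase_train, scope=scope)
                tower_losses.append(loss)
                # reuse model variables for the next tower
                tf.get_variable_scope().reuse_variables()
    return tower_losses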
def train_and_val():
    with tf.Graph().as_default():
        # train/test phase indicator
        phase_train = tf.placeholder(tf.bool, name='phase_train')
        # learning rate is manually set
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        # global step
        global_step = tf.Variable(0, trainable=False, name='global_step')
        # train/test inputs
        train_image_batch, train_label_batch = m.make_train_batch(
            FLAGS.train_tf_path, FLAGS.train_batch_size)
        val_image_batch, val_label_batch = m.make_validation_batch(
            FLAGS.val_tf_path, FLAGS.val_batch_size)
        image_batch, label_batch = control_flow_ops.cond(
            phase_train,
            lambda: (train_image_batch, train_label_batch),
            lambda: (val_image_batch, val_label_batch))
        # model outputs
        logits = m.residual_net(image_batch, FLAGS.residual_net_n, 10,
                                phase_train)
        # total loss
        loss = m.loss(logits, label_batch)
        accuracy = m.accuracy(logits, label_batch)
        tf.scalar_summary('train_loss', loss)
        tf.scalar_summary('train_accuracy', accuracy)
        # parameter histograms must be registered before merging summaries,
        # otherwise they are silently left out of summary_op
        for var in tf.trainable_variables():
            tf.histogram_summary('params/' + var.op.name, var)
        # train one step
        train_op = m.train_op(loss, global_step, learning_rate)
        # saver
        saver = tf.train.Saver(tf.all_variables())
        # start session
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
        # summary
        summary_op = tf.merge_all_summaries()
        summary_writer = tf.train.SummaryWriter(FLAGS.log_dir,
                                                graph_def=sess.graph_def)
        # initialization (TODO: or load)
        init_op = tf.initialize_all_variables()
        print('Initializing...')
        sess.run(init_op, {phase_train.name: True})
        # train loop
        tf.train.start_queue_runners(sess=sess)
        curr_lr = 0.0
        lr_scale = 1.0
        for step in xrange(FLAGS.max_steps):
            # set learning rate manually
            if step <= 32000:
                _lr = lr_scale * 1e-1
            elif step <= 48000:
                _lr = lr_scale * 1e-2
            else:
                _lr = lr_scale * 1e-3
            if curr_lr != _lr:
                curr_lr = _lr
                print('Learning rate set to %f' % curr_lr)
            fetches = [train_op, loss]
            if step % FLAGS.summary_interval == 0:
                fetches += [accuracy, summary_op]
            sess_outputs = sess.run(
                fetches, {phase_train.name: True, learning_rate.name: curr_lr})
            if step % FLAGS.summary_interval == 0:
                train_loss_value, train_acc_value, summary_str = sess_outputs[1:]
                print('[%s] Iteration %d, train loss = %f, train accuracy = %f' %
                      (datetime.now(), step, train_loss_value, train_acc_value))
                summary_writer.add_summary(summary_str, step)
            if step > 0 and step % FLAGS.val_interval == 0:
                print('Evaluating...')
                n_val_samples = 10000
                val_batch_size = FLAGS.val_batch_size
                n_val_batch = int(n_val_samples / val_batch_size)
                val_logits = np.zeros((n_val_samples, 10), dtype=np.float32)
                val_labels = np.zeros((n_val_samples), dtype=np.int64)
                val_losses = []
                for i in xrange(n_val_batch):
                    fetches = [logits, label_batch, loss]
                    session_outputs = sess.run(fetches, {phase_train.name: False})
                    val_logits[i * val_batch_size:(i + 1) * val_batch_size, :] = \
                        session_outputs[0]
                    val_labels[i * val_batch_size:(i + 1) * val_batch_size] = \
                        session_outputs[1]
                    val_losses.append(session_outputs[2])
                pred_labels = np.argmax(val_logits, axis=1)
                # divide by a float so accuracy is not truncated to 0 under
                # Python 2 integer division
                val_accuracy = (np.count_nonzero(pred_labels == val_labels) /
                                float(n_val_samples))
                val_loss = float(np.mean(np.asarray(val_losses)))
                print('Test accuracy = %f' % val_accuracy)
                val_summary = tf.Summary()
                val_summary.value.add(tag='val_accuracy',
                                      simple_value=val_accuracy)
                val_summary.value.add(tag='val_loss', simple_value=val_loss)
                summary_writer.add_summary(val_summary, step)
            if step > 0 and step % FLAGS.save_interval == 0:
                checkpoint_path = os.path.join(FLAGS.log_dir, 'checkpoint')
                saver.save(sess, checkpoint_path, global_step=step)
                print('Checkpoint saved at %s' % checkpoint_path)
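# `m.train_op` lives in the model module and is not shown in this file. A
# minimal sketch of an implementation consistent with how it is called here
# (loss, global_step, learning-rate placeholder); the choice of SGD with
# momentum 0.9 is an assumption, not confirmed by this file:
def train_op(total_loss, global_step, learning_rate):
    optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
    # global_step is incremented once per applied update
    return optimizer.minimize(total_loss, global_step=global_step)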
def train_and_val():
    with tf.Graph().as_default():
        # train/test phase indicator
        phase_train = tf.placeholder(tf.bool, name='phase_train')
        # learning rate is manually set
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        tf.scalar_summary('learning_rate', learning_rate)
        # global step
        global_step = tf.Variable(0, trainable=False, name='global_step')
        # train/test inputs
        train_image_batch, train_label_batch = m.make_train_batch(
            FLAGS.train_tf_path, FLAGS.train_batch_size)
        val_image_batch, val_label_batch = m.make_validation_batch(
            FLAGS.val_tf_path, FLAGS.val_batch_size)
        image_batch, label_batch = control_flow_ops.cond(
            phase_train,
            lambda: (train_image_batch, train_label_batch),
            lambda: (val_image_batch, val_label_batch))
        # model outputs
        logits = m.residual_net(image_batch, FLAGS.residual_net_n, 10,
                                phase_train)
        # total loss
        loss = m.loss(logits, label_batch)
        accuracy = m.accuracy(logits, label_batch)
        tf.scalar_summary('train_loss', loss)
        tf.scalar_summary('train_accuracy', accuracy)
        # train one step
        train_op = m.train_op(loss, global_step, learning_rate)
        # saver
        saver = tf.train.Saver(tf.all_variables())
        # start session
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
        # summary writer
        summary_op = tf.merge_all_summaries()
        summary_writer = tf.train.SummaryWriter(FLAGS.log_dir,
                                                graph=sess.graph)
        # initialize parameters or load from a checkpoint
        if FLAGS.load_dir != '':
            # load from checkpoint; check the result before touching its
            # attributes, since get_checkpoint_state may return None
            checkpoint = tf.train.get_checkpoint_state(FLAGS.load_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(sess, checkpoint.model_checkpoint_path)
                print('Model restored from %s' %
                      checkpoint.model_checkpoint_path)
            else:
                raise IOError('Load directory provided but no checkpoint found')
        else:
            init_op = tf.initialize_all_variables()
            print('Initializing...')
            sess.run(init_op, {phase_train.name: True})
        print('Start training...')
        # train loop
        tf.train.start_queue_runners(sess=sess)
        curr_lr = 0.0
        for step in xrange(FLAGS.max_steps):
            # # set learning rate manually
            # if step <= 5000:
            #     _lr = 1e-2
            # elif step <= 32000:
            #     _lr = 1e-1
            # elif step <= 48000:
            #     _lr = 1e-2
            # else:
            #     _lr = 1e-3
            # set learning rate manually
            if step <= 48000:
                _lr = 1e-2
            else:
                _lr = 1e-3
            if curr_lr != _lr:
                curr_lr = _lr
                print('Learning rate set to %f' % curr_lr)
            # train
            fetches = [train_op, loss]
            if step > 0 and step % FLAGS.summary_interval == 0:
                fetches += [accuracy, summary_op]
            sess_outputs = sess.run(fetches, {
                phase_train.name: True,
                learning_rate.name: curr_lr
            })
            # summary
            if step > 0 and step % FLAGS.summary_interval == 0:
                train_loss_value, train_acc_value, summary_str = sess_outputs[1:]
                print('[%s] Iteration %d, train loss = %f, train accuracy = %f' %
                      (datetime.now(), step, train_loss_value, train_acc_value))
                summary_writer.add_summary(summary_str, step)
            # validation
            if step > 0 and step % FLAGS.val_interval == 0:
                print('Evaluating...')
                n_val_samples = 10000
                val_batch_size = FLAGS.val_batch_size
                n_val_batch = int(n_val_samples / val_batch_size)
                val_logits = np.zeros((n_val_samples, 10), dtype=np.float32)
                val_labels = np.zeros((n_val_samples), dtype=np.int64)
                val_losses = []
                for i in xrange(n_val_batch):
                    fetches = [logits, label_batch, loss]
                    session_outputs = sess.run(fetches, {phase_train.name: False})
                    val_logits[i * val_batch_size:(i + 1) * val_batch_size, :] = \
                        session_outputs[0]
                    val_labels[i * val_batch_size:(i + 1) * val_batch_size] = \
                        session_outputs[1]
                    val_losses.append(session_outputs[2])
                pred_labels = np.argmax(val_logits, axis=1)
                # divide by a float so accuracy is not truncated to 0 under
                # Python 2 integer division
                val_accuracy = (np.count_nonzero(pred_labels == val_labels) /
                                float(n_val_samples))
                val_loss = float(np.mean(np.asarray(val_losses)))
                print('Test accuracy = %f' % val_accuracy)
                val_summary = tf.Summary()
                val_summary.value.add(tag='val_accuracy',
                                      simple_value=val_accuracy)
                val_summary.value.add(tag='val_loss', simple_value=val_loss)
                summary_writer.add_summary(val_summary, step)
            # save variables
            if step > 0 and step % FLAGS.save_interval == 0:
                checkpoint_path = os.path.join(FLAGS.log_dir, 'checkpoint')
                saver.save(sess, checkpoint_path, global_step=step)
                print('Checkpoint saved at %s' % checkpoint_path)
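# The validation metrics above are computed in Python rather than in the
# graph, so they are written with a hand-built tf.Summary protobuf instead
# of going through `summary_op`. The same pattern, factored into a reusable
# helper (the helper name is illustrative, not part of this file):
def write_scalar_summaries(summary_writer, step, **scalars):
    summary = tf.Summary()
    for tag, value in scalars.items():
        summary.value.add(tag=tag, simple_value=float(value))
    summary_writer.add_summary(summary, step)

# e.g. write_scalar_summaries(summary_writer, step,
#                             val_accuracy=val_accuracy, val_loss=val_loss)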
def train_and_val():
    with tf.Graph().as_default():
        # train/test phase indicator
        phase_train = tf.placeholder(tf.bool, name='phase_train')
        # learning rate is manually set
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        # global step
        global_step = tf.Variable(0, trainable=False, name='global_step')
        # train/test inputs
        train_image_batch, train_label_batch = m.make_train_batch(
            FLAGS.train_tf_path, FLAGS.train_batch_size)
        val_image_batch, val_label_batch = m.make_validation_batch(
            FLAGS.val_tf_path, FLAGS.val_batch_size)
        image_batch, label_batch = control_flow_ops.cond(
            phase_train,
            lambda: (train_image_batch, train_label_batch),
            lambda: (val_image_batch, val_label_batch))
        # model outputs
        logits = m.residual_net(image_batch, FLAGS.residual_net_n, 10,
                                phase_train)
        # total loss
        m.loss(logits, label_batch)
        loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
        m.summary_losses()
        accuracy = m.accuracy(logits, label_batch)
        tf.scalar_summary('train_loss', loss)
        tf.scalar_summary('train_accuracy', accuracy)
        # saver
        saver = tf.train.Saver(tf.all_variables())
        # start session
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
        # summary
        for var in tf.trainable_variables():
            tf.histogram_summary('params/' + var.op.name, var)
        init_op = tf.initialize_all_variables()
        if FLAGS.restore_path is None:
            # initialization
            print('Initializing...')
            sess.run(init_op, {phase_train.name: True})
        else:
            # restore from previous checkpoint
            sess.run(init_op, {phase_train.name: True})
            print('Restoring variables from %s' % FLAGS.restore_path)
            saver.restore(sess, FLAGS.restore_path)
        # validation loop (this variant only evaluates; no training step)
        tf.train.start_queue_runners(sess=sess)
        n_samples = 10000
        batch_size = FLAGS.val_batch_size
        n_iter = int(np.floor(n_samples / batch_size))
        accuracies = []
        losses = []
        for step in xrange(n_iter):
            fetches = [loss, accuracy]
            val_loss, val_acc = sess.run(fetches, {phase_train.name: False})
            losses.append(val_loss)
            accuracies.append(val_acc)
            print('[%s] Iteration %d, val loss = %f, val accuracy = %f' %
                  (datetime.now(), step, val_loss, val_acc))
        val_acc = np.mean(accuracies)
        val_loss = np.mean(losses)
        print('val loss is %f, accuracy is %f' % (val_loss, val_acc))
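# All of the functions above assume a common preamble: the model module `m`,
# the FLAGS object, and a handful of imports. A sketch of what that preamble
# likely looks like; the flag default values are illustrative guesses, not
# taken from this file:
import os
from datetime import datetime

import numpy as np
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

import model as m  # module providing residual_net, loss, accuracy, train_op, ...

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('train_tf_path', 'data/train.tfrecords', '')
tf.app.flags.DEFINE_string('val_tf_path', 'data/val.tfrecords', '')
tf.app.flags.DEFINE_string('log_dir', 'log', '')
tf.app.flags.DEFINE_string('load_dir', '', '')
tf.app.flags.DEFINE_string('restore_path', None, '')
tf.app.flags.DEFINE_integer('train_batch_size', 128, '')
tf.app.flags.DEFINE_integer('val_batch_size', 100, '')
tf.app.flags.DEFINE_integer('residual_net_n', 3, '')
tf.app.flags.DEFINE_integer('max_steps', 64000, '')
tf.app.flags.DEFINE_integer('summary_interval', 100, '')
tf.app.flags.DEFINE_integer('val_interval', 1000, '')
tf.app.flags.DEFINE_integer('save_interval', 5000, '')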