def train(train_data_set, val_data_set, load_model_path, save_model_path,
          phases_names):
    """Build the two-input classification graph and run the fine-tuning loop.

    Args:
        train_data_set: object whose ``get_next_batch(batch_size,
            distribution)`` returns ``(roi_images, expand_images, labels)``.
        val_data_set: object whose ``get_next_batch()`` returns the same
            kind of triple for validation.
        load_model_path: checkpoint restored before training; skipped when
            falsy.
        save_model_path: directory under which checkpoints are written, one
            sub-directory per global step; ``None`` disables saving.
        phases_names: sequence of phase names. Its length multiplies the
            channel count of both inputs and it is forwarded to
            ``inference_small``.
    """
    import os

    num_channels = net_config.IMAGE_CHANNEL * len(phases_names)
    x_ROI = tf.placeholder(
        tf.float32,
        shape=[None, net_config.ROI_SIZE_W, net_config.ROI_SIZE_H,
               num_channels],
        name='input_x')
    x_EXPAND = tf.placeholder(
        tf.float32,
        shape=[None, net_config.EXPAND_SIZE_W, net_config.EXPAND_SIZE_H,
               num_channels])
    y_ = tf.placeholder(tf.float32, shape=[None, ])
    tf.summary.histogram('label', y_)
    global_step = tf.Variable(0, trainable=False)
    is_training = tf.placeholder('bool', [], name='is_training')

    # NOTE(review): the flags are defined at call time, so building a second
    # graph in the same process may hit a duplicate-flag error - confirm.
    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string('data_dir', '/tmp/cifar-data',
                               'where to store the dataset')
    tf.app.flags.DEFINE_boolean(
        'use_bn', True, 'use batch normalization. otherwise use biases')
    # NOTE(review): the help text says biases are used when batch norm is
    # off, yet use_bias receives use_bn itself - confirm this is intended.
    y = inference_small([x_ROI, x_EXPAND],
                        is_training=is_training,
                        num_classes=net_config.OUTPUT_NODE,
                        use_bias=FLAGS.use_bn,
                        phase_names=phases_names,
                        num_blocks=3)
    tf.summary.histogram('logits', tf.argmax(y, 1))
    loss_ = loss(logits=y, labels=tf.cast(y_, np.int32))
    tf.summary.scalar('loss', loss_)
    train_op = tf.train.GradientDescentOptimizer(
        learning_rate=net_config.LEARNING_RATE).minimize(
            loss=loss_, global_step=global_step)
    with tf.variable_scope('accuracy'):
        accuracy_tensor = tf.reduce_mean(
            tf.cast(tf.equal(x=tf.argmax(y, 1), y=tf.cast(y_, tf.int64)),
                    tf.float32))
        tf.summary.scalar('accuracy', accuracy_tensor)
    saver = tf.train.Saver()
    merge_op = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if load_model_path:
            saver.restore(sess, load_model_path)
        writer = tf.summary.FileWriter('./log/fine_tuning/train',
                                       tf.get_default_graph())
        val_writer = tf.summary.FileWriter('./log/fine_tuning/val',
                                           tf.get_default_graph())
        try:
            for i in range(net_config.ITERATOE_NUMBER):
                images, images_expand, labels = train_data_set.get_next_batch(
                    net_config.BATCH_SIZE, net_config.DISTRIBUTION)
                # is_training is a plain placeholder with no default, so it
                # has to be fed on every run that may depend on it.
                _, loss_value, accuracy_value, summary, global_step_value = \
                    sess.run(
                        [train_op, loss_, accuracy_tensor, merge_op,
                         global_step],
                        feed_dict={
                            x_ROI: images,
                            x_EXPAND: images_expand,
                            y_: labels,
                            is_training: True
                        })
                writer.add_summary(summary=summary,
                                   global_step=global_step_value)

                # Save the model every 100 steps for the 2-class and the
                # 5-class set-ups.
                # NOTE(review): the original comment announced a 500-step
                # interval for 5 classes while the code used 100; the code's
                # value is kept - confirm which one is wanted.
                should_save = (net_config.OUTPUT_NODE in (2, 5)
                               and (global_step_value - 1) % 100 == 0
                               and i != 0
                               and save_model_path is not None)
                if should_save:
                    save_path = os.path.join(save_model_path,
                                             str(global_step_value))
                    if not os.path.exists(save_path):
                        # makedirs also creates missing parent directories.
                        os.makedirs(save_path)
                    save_path += '/'
                    print('mode saved path is  %s' % save_path)
                    saver.save(sess, save_path)

                if i % 100 == 0:
                    (validation_images, validation_images_expand,
                     validation_labels) = val_data_set.get_next_batch()
                    validation_accuracy, validation_loss, summary, logits = \
                        sess.run(
                            [accuracy_tensor, loss_, merge_op, y],
                            feed_dict={
                                x_ROI: validation_images,
                                x_EXPAND: validation_images_expand,
                                y_: validation_labels,
                                is_training: False
                            })
                    predictions = np.argmax(logits, 1)
                    calculate_acc_error(logits=predictions,
                                        label=validation_labels,
                                        show=True)
                    binary_acc = acc_binary_acc(
                        logits=predictions,
                        label=validation_labels,
                    )
                    val_writer.add_summary(summary, global_step_value)
                    print('step is %d,training loss value is %g, accuracy is %g '
                          'validation loss value is %g, accuracy is %g, binary_acc is %g' %
                          (global_step_value, loss_value, accuracy_value,
                           validation_loss, validation_accuracy, binary_acc))
        finally:
            # Flush the event files even when the loop is interrupted.
            writer.close()
            val_writer.close()
def train(training_set, training_labels):
    """Train on dataset for a number of steps.

    Builds a momentum-SGD graph around ``resnet.inference``, then runs
    ``FLAGS.max_steps`` steps, logging loss/accuracy each step and writing a
    checkpoint every 2000 steps and at the final step.

    Args:
        training_set: training examples, handed unchanged to
            ``cifar10.fill_feed_dict`` on every step.
        training_labels: matching labels, handed to the same helper.

    Raises:
        AssertionError: if the fetched loss becomes NaN.
    """
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        # Incremented once per apply_gradients call.
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # The learning rate is fed per step so it can be decayed from Python.
        lr_placeholder = tf.placeholder(dtype=tf.float32, shape=[])
        opt = tf.train.MomentumOptimizer(lr_placeholder, MOMENTUM)

        images, labels = cifar10.placeholder_inputs(FLAGS.batch_size)
        logits = resnet.inference(images,
                                  FLAGS.num_residual_blocks,
                                  reuse=False)

        # Total loss = model loss + every registered regularization loss.
        loss = resnet.loss(logits, labels)
        regu_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = tf.add_n([loss] + regu_losses)

        grads = opt.compute_gradients(total_loss)
        apply_gradients_op = opt.apply_gradients(grads,
                                                 global_step=global_step)
        # Fetching train_op forces the gradient update to run first.
        with tf.control_dependencies([apply_gradients_op]):
            train_op = tf.identity(total_loss, name='train_op')

        saver = tf.train.Saver(tf.global_variables())

        # Evaluated on the batch being trained on, despite the name.
        # NOTE(review): if resnet.evaluation returns an integer tensor this
        # division may be integral on Python 2 - confirm its dtype.
        validation_accuracy = tf.reduce_sum(resnet.evaluation(
            logits, labels)) / tf.constant(FLAGS.batch_size)

        init = tf.global_variables_initializer()

        # allow_soft_placement lets ops fall back to another device when the
        # requested one cannot run them.
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        format_str = ('%s: step %d, loss = %.8f (%.1f examples/sec; %.3f '
                      'sec/batch); acc=%.4f')

        # The context manager releases the session even if a step raises.
        with tf.Session(config=session_config) as sess:
            sess.run(init)

            # Position inside the data set, threaded through fill_feed_dict.
            local_data_batch_idx = 0
            epoch_counter = 0

            tf.train.start_queue_runners(sess=sess)

            for step in range(FLAGS.max_steps):
                epoch_counter, local_data_batch_idx, feed_dict = \
                    cifar10.fill_feed_dict(
                        training_set, training_labels, images, labels,
                        FLAGS.batch_size, local_data_batch_idx,
                        epoch_counter, FLAGS.init_lr, lr_placeholder)

                start_time = time.time()
                _, loss_value, acc = sess.run(
                    [train_op, total_loss, validation_accuracy],
                    feed_dict=feed_dict)
                duration = time.time() - start_time

                assert not np.isnan(loss_value), \
                    'Model diverged with loss = NaN'

                examples_per_sec = FLAGS.batch_size / float(duration)
                tf.logging.info(format_str %
                                (datetime.now(), step, loss_value,
                                 examples_per_sec, duration, acc))
                # Arguments follow the order of the labels in the message
                # (they used to be swapped).
                tf.logging.info(
                    "Data batch index: %s, Current epoch idex: %s" %
                    (str(local_data_batch_idx), str(epoch_counter)))

                # Step-wise decay: divide the learning rate by 10 at the two
                # configured steps. This rewrites the flag itself, so the
                # decayed value outlives this call.
                if step == FLAGS.decay_step0 or step == FLAGS.decay_step1:
                    FLAGS.init_lr = 0.1 * FLAGS.init_lr

                if step % 2000 == 0 or (step + 1) == FLAGS.max_steps:
                    saver.save(sess, checkpoint_path, global_step=step)
def train(train_data_set, val_data_set, load_model_path, save_model_path):
    """Build the single-input classification graph and fine-tune it.

    Every iteration feeds the complete ``train_data_set``; every 100
    iterations the complete ``val_data_set`` is evaluated and a summary line
    is printed.

    Args:
        train_data_set: object exposing ``images`` and ``labels``.
        val_data_set: object exposing ``images`` and ``labels``.
        load_model_path: checkpoint restored before training; skipped when
            falsy.
        save_model_path: checkpoint path written every 1000 iterations;
            ``None`` disables saving.
    """
    x = tf.placeholder(
        tf.float32,
        shape=[None, sub_Config.IMAGE_W, sub_Config.IMAGE_H,
               sub_Config.IMAGE_CHANNEL],
        name='input_x')
    y_ = tf.placeholder(tf.float32, shape=[None, ])
    tf.summary.histogram('label', y_)
    is_training = tf.placeholder('bool', [], name='is_training')

    # NOTE(review): the flags are defined at call time, so building a second
    # graph in the same process may hit a duplicate-flag error - confirm.
    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string('data_dir', '/tmp/cifar-data',
                               'where to store the dataset')
    tf.app.flags.DEFINE_boolean(
        'use_bn', True, 'use batch normalization. otherwise use biases')
    # NOTE(review): the help text says biases are used when batch norm is
    # off, yet use_bias receives use_bn itself - confirm this is intended.
    y = inference_small(x,
                        is_training=is_training,
                        num_classes=sub_Config.OUTPUT_NODE,
                        use_bias=FLAGS.use_bn,
                        num_blocks=3)
    tf.summary.histogram('logits', tf.argmax(y, 1))
    loss_ = loss(logits=y, labels=tf.cast(y_, np.int32))
    tf.summary.scalar('loss', loss_)
    train_op = tf.train.GradientDescentOptimizer(
        learning_rate=sub_Config.LEARNING_RATE).minimize(loss=loss_)
    with tf.variable_scope('accuracy'):
        accuracy_tensor = tf.reduce_mean(
            tf.cast(tf.equal(x=tf.argmax(y, 1), y=tf.cast(y_, tf.int64)),
                    tf.float32))
        tf.summary.scalar('accuracy', accuracy_tensor)
    saver = tf.train.Saver()
    merge_op = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if load_model_path:
            saver.restore(sess, load_model_path)
        writer = tf.summary.FileWriter('./log/fine_tuning/train',
                                       tf.get_default_graph())
        val_writer = tf.summary.FileWriter('./log/fine_tuning/val',
                                           tf.get_default_graph())
        try:
            for i in range(sub_Config.ITERATOE_NUMBER):
                images, labels = train_data_set.images, train_data_set.labels
                # Target shape mirrors the placeholder (W, H, C); the height
                # axis used to be filled with IMAGE_W.
                images = changed_shape(images, [
                    len(images), sub_Config.IMAGE_W, sub_Config.IMAGE_H,
                    sub_Config.IMAGE_CHANNEL
                ])
                if i == 0:
                    # One-off visual sanity check: shows channel 0 of the
                    # first training image.
                    from PIL import Image
                    image = Image.fromarray(
                        np.asarray(images[0, :, :, 0], np.uint8))
                    image.show()
                # is_training is a plain placeholder with no default, so it
                # has to be fed on every run that may depend on it.
                _, loss_value, accuracy_value, summary = sess.run(
                    [train_op, loss_, accuracy_tensor, merge_op],
                    feed_dict={
                        x: images,
                        y_: labels,
                        is_training: True
                    })
                writer.add_summary(summary=summary, global_step=i)

                if (i % 1000 == 0 and i != 0
                        and save_model_path is not None):
                    # Save the model.
                    saver.save(sess, save_model_path)

                if i % 100 == 0:
                    validation_images = val_data_set.images
                    validation_labels = val_data_set.labels
                    # Same target shape as the placeholder; the channel
                    # count used to be hard-coded to 1.
                    validation_images = changed_shape(validation_images, [
                        len(validation_images), sub_Config.IMAGE_W,
                        sub_Config.IMAGE_H, sub_Config.IMAGE_CHANNEL
                    ])
                    validation_accuracy, validation_loss, summary, logits = \
                        sess.run(
                            [accuracy_tensor, loss_, merge_op, y],
                            feed_dict={
                                x: validation_images,
                                y_: validation_labels,
                                is_training: False
                            })
                    predictions = np.argmax(logits, 1)
                    calculate_acc_error(logits=predictions,
                                        label=validation_labels,
                                        show=True)
                    binary_acc = acc_binary_acc(
                        logits=predictions,
                        label=validation_labels,
                    )
                    val_writer.add_summary(summary, i)
                    print('step is %d,training loss value is %g, accuracy is %g '
                          'validation loss value is %g, accuracy is %g, binary_acc is %g' %
                          (i, loss_value, accuracy_value, validation_loss,
                           validation_accuracy, binary_acc))
        finally:
            # Flush the event files even when the loop is interrupted.
            writer.close()
            val_writer.close()
def train():
    """Train ``inference_small`` on the queued CIFAR-10 input and time it.

    Runs ``FLAGS.epochs`` epochs of ``EPOCH_SIZE`` examples, prints a
    progress line every ``FLAGS.log_step`` steps, saves one checkpoint at the
    last step, and finally prints the mean batch time plus the mean loss of
    each evaluation window.

    Raises:
        AssertionError: if the fetched loss becomes NaN.
    """
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=FLAGS.log_device_placement)
    if device_str.find('cpu') >= 0:  # cpu version
        # Thread count follows OMP_NUM_THREADS, defaulting to a single thread.
        num_threads = os.getenv('OMP_NUM_THREADS', 1)
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=int(num_threads))

    with tf.Graph().as_default(), \
            tf.device(get_device_str(FLAGS.device_id)), \
            tf.Session(config=config) as sess:
        images, labels = cifar10_input.inputs(False, FLAGS.data_dir,
                                              FLAGS.batch_size)
        # Formatted explicitly so Python 2 does not print a tuple repr.
        print('Images:  %s' % (images,))

        logits = inference_small(images, is_training=True, num_blocks=9)
        # Add a simple objective so we can calculate the backward pass.
        loss_value = loss(logits, labels)
        lr = 0.01
        grad = tf.train.MomentumOptimizer(lr, 0.9).minimize(loss_value)

        saver = tf.train.Saver(tf.global_variables())
        init = tf.global_variables_initializer()
        sess.run(init)

        real_batch_size = FLAGS.batch_size
        # Ceiling division: a trailing partial batch counts as a full step.
        num_batches_per_epoch = int(
            (EPOCH_SIZE + real_batch_size - 1) / real_batch_size)
        iterations = FLAGS.epochs * num_batches_per_epoch
        steps_per_eval = FLAGS.eval_step * num_batches_per_epoch
        format_str = (
            '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')

        average_batch_time = 0.0
        epochs_info = []
        average_loss = 0.0

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            for step in range(iterations):
                start_time = time.time()
                _, loss_v = sess.run([grad, loss_value])
                duration = time.time() - start_time
                average_batch_time += float(duration)
                average_loss += loss_v

                assert not np.isnan(loss_v), 'Model diverged with loss = NaN'

                if step % FLAGS.log_step == 0:
                    examples_per_sec = FLAGS.batch_size / duration
                    sec_per_batch = float(duration)
                    print(format_str % (datetime.now(), step, loss_v,
                                        examples_per_sec, sec_per_batch))

                if step > 0 and step % steps_per_eval == 0:
                    # Close the current window: record its mean loss and
                    # start accumulating again.
                    average_loss /= num_batches_per_epoch * FLAGS.eval_step
                    epochs_info.append(
                        '%d:_:%s' % (step // steps_per_eval, average_loss))
                    average_loss = 0.0

                if step == iterations - 1:
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
        finally:
            # Stop the input threads on failure too, so a diverged run does
            # not leave them running.
            coord.request_stop()
            coord.join(threads)

        # Nothing to average when no step was executed.
        if iterations > 0:
            average_batch_time /= iterations
        print('average_batch_time:  %s' % average_batch_time)
        print('epoch_info: %s' % ','.join(epochs_info))
def val(val_data_set, load_model_path, phases_names):
    """Evaluate one validation batch with the two-input classification graph.

    Prints the validation loss, accuracy and binary accuracy; returns
    nothing.

    Args:
        val_data_set: object whose ``get_next_batch()`` returns
            ``(roi_images, expand_images, labels)``.
        load_model_path: checkpoint restored before evaluating; skipped when
            falsy, in which case freshly initialised weights are evaluated.
        phases_names: sequence of phase names. Its length multiplies the
            channel count of both inputs and it is forwarded to
            ``inference_small``.
    """
    num_channels = net_config.IMAGE_CHANNEL * len(phases_names)
    x_ROI = tf.placeholder(
        tf.float32,
        shape=[None, net_config.ROI_SIZE_W, net_config.ROI_SIZE_H,
               num_channels],
        name='input_x')
    x_EXPAND = tf.placeholder(
        tf.float32,
        shape=[None, net_config.EXPAND_SIZE_W, net_config.EXPAND_SIZE_H,
               num_channels])
    y_ = tf.placeholder(tf.float32, shape=[None, ])
    tf.summary.histogram('label', y_)
    # Never read here. Presumably kept so auto-generated variable names line
    # up with the checkpoint being restored - verify before removing.
    global_step = tf.Variable(0, trainable=False)
    is_training = tf.placeholder('bool', [], name='is_training')

    # NOTE(review): the flags are defined at call time, so building a second
    # graph in the same process may hit a duplicate-flag error - confirm.
    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string('data_dir', '/tmp/cifar-data',
                               'where to store the dataset')
    tf.app.flags.DEFINE_boolean(
        'use_bn', True, 'use batch normalization. otherwise use biases')
    # NOTE(review): the help text says biases are used when batch norm is
    # off, yet use_bias receives use_bn itself - confirm this is intended.
    y = inference_small([x_ROI, x_EXPAND],
                        is_training=is_training,
                        num_classes=net_config.OUTPUT_NODE,
                        use_bias=FLAGS.use_bn,
                        phase_names=phases_names,
                        num_blocks=3)
    tf.summary.histogram('logits', tf.argmax(y, 1))
    loss_ = loss(logits=y, labels=tf.cast(y_, np.int32))
    tf.summary.scalar('loss', loss_)
    with tf.variable_scope('accuracy'):
        accuracy_tensor = tf.reduce_mean(
            tf.cast(tf.equal(x=tf.argmax(y, 1), y=tf.cast(y_, tf.int64)),
                    tf.float32))
        tf.summary.scalar('accuracy', accuracy_tensor)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if load_model_path:
            saver.restore(sess, load_model_path)

        (validation_images, validation_images_expand,
         validation_labels) = val_data_set.get_next_batch()
        # is_training is a plain placeholder with no default, so it has to
        # be fed. The merged summary is no longer fetched because nothing
        # consumed it.
        validation_accuracy, validation_loss, logits = sess.run(
            [accuracy_tensor, loss_, y],
            feed_dict={
                x_ROI: validation_images,
                x_EXPAND: validation_images_expand,
                y_: validation_labels,
                is_training: False
            })
        predictions = np.argmax(logits, 1)
        calculate_acc_error(logits=predictions,
                            label=validation_labels,
                            show=True)
        binary_acc = acc_binary_acc(
            logits=predictions,
            label=validation_labels,
        )
        print('validation loss value is %g, accuracy is %g, binary_acc is %g' %
              (validation_loss, validation_accuracy, binary_acc))
def train():
    """Run CIFAR-10 training until ``FLAGS.max_steps`` and print the elapsed time."""
    with tf.Graph().as_default():
        step_tensor = tf.train.get_or_create_global_step()

        # The input pipeline is pinned to CPU:0; letting its ops land on the
        # GPU can slow training down.
        with tf.device('/cpu:0'):
            images, labels = resnet.distorted_inputs()

        # Forward pass, loss, and the op that applies one training step.
        logits, tensor_list = resnet.inference(images)
        loss = resnet.loss(logits, labels)
        train_op, _ = resnet.train(loss, tensor_list, step_tensor)

        class _LoggerHook(tf.train.SessionRunHook):
            """Prints loss and throughput every ``FLAGS.log_frequency`` runs."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Request the loss value alongside whatever is being run.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return

                now = time.time()
                duration = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                examples_per_sec = (FLAGS.log_frequency * FLAGS.batch_size /
                                    duration)
                sec_per_batch = float(duration / FLAGS.log_frequency)

                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print(template % (datetime.now(), self._step, loss_value,
                                  examples_per_sec, sec_per_batch))

        start = time.time()
        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as session:
            while not session.should_stop():
                session.run(train_op)
        end = time.time()
        print(end - start)