def load(graph, sess):
    """Build the inference graph inside `graph` and restore its weights.

    Raw images are fed through a float32 placeholder of shape
    [batch_size, 256, 256], given a trailing channel axis, resized to
    model_img_size x model_img_size, standardized one image at a time and
    passed to model.inference with is_training=False.

    Args:
        graph: graph that receives all ops created here.
        sess: session into which the checkpoint at model_checkpoint_path
            is restored.

    Returns:
        A (prob_op, raw_images_op) pair: the sigmoid of the logits and the
        placeholder used to feed raw images.
    """
    with graph.as_default():
        raw_images_op = tf.placeholder(tf.float32, [batch_size, 256, 256])
        images = tf.expand_dims(raw_images_op, 3)
        # Not read anywhere below; the placeholder is still added to the graph.
        labels = tf.placeholder(tf.float32, [batch_size, num_tags])

        # Resize to the input size expected by the model.
        target_size = np.array([model_img_size, model_img_size])
        images = tf.image.resize_images(images, target_size)

        # Standardize each image separately, then stitch the batch back
        # together along axis 0.
        images = tf.concat(
            [
                tf.expand_dims(
                    tf.image.per_image_standardization(images[idx, :, :, :]),
                    0)
                for idx in range(batch_size)
            ],
            0)

        # Inference graph and per-tag probabilities.
        logits = model.inference(images,
                                 is_training=False,
                                 num_classes=num_tags)
        prob_op = tf.sigmoid(logits)

        # The saver covers every global variable in the graph.
        saver = tf.train.Saver(tf.global_variables())
        print('load from pretrained model from')
        print(model_checkpoint_path)
        saver.restore(sess, model_checkpoint_path)

    return prob_op, raw_images_op
def evaluate():
    """Build the evaluation graph and run eval_once until told to stop.

    CUDA_VISIBLE_DEVICES is set from FLAGS.gpuid. One evaluation round is
    always executed; further rounds follow, separated by
    FLAGS.eval_interval_secs seconds of sleep, for as long as
    FLAGS.run_once is falsy.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.gpuid)

    with tf.Graph().as_default() as g:
        images, labels = data_input.inputs(
            data_dir=FLAGS.data_dir,
            batch_size=FLAGS.eval_batch_size,
            num_tags=num_tags)

        # Inference graph and per-tag probabilities.
        logits = model.inference(images,
                                 is_training=False,
                                 num_classes=num_tags)
        prob_op = tf.sigmoid(logits)

        # The saver covers every global variable in the graph.
        saver = tf.train.Saver(tf.global_variables())

        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(eval_dir, g)

        # First round is unconditional; FLAGS.run_once is re-read after
        # every round to decide whether another one follows.
        eval_once(saver, summary_writer, prob_op, labels, summary_op)
        while not FLAGS.run_once:
            time.sleep(FLAGS.eval_interval_secs)
            eval_once(saver, summary_writer, prob_op, labels, summary_op)
testrecord_images = tf.stack(testrecord_images)
# Reorder the axes with perm=[0, 3, 1, 2] so the channel axis comes first
# after the batch axis.
testrecord_images = tf.transpose(testrecord_images, perm=[0, 3, 1, 2])

# Piecewise-constant learning rate keyed on global_step: one value per
# interval delimited by `boundaries`.
global_step = tf.Variable(0, trainable=False)
boundaries = [10000, 15000, 20000, 25000]
values = [0.1, 0.05, 0.01, 0.005, 0.001]
learning_rate = tf.train.piecewise_constant(
    x=global_step, boundaries=boundaries, values=values)

weight_decay = 2e-4
filters = 16  # filter count of the first resnet block
n = 5         # number of basic resnet blocks; total network layers are 6n+2
ver = 2       # resnet block version

# Inference logits from the model.
result = resnet_model.inference(distorted_images, True, filters, n, ver)

# Cross-entropy loss.
cross_entropy = tf.losses.sparse_softmax_cross_entropy(
    labels=record_labels, logits=result)
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')

# Weight decay: L2 norm of every trainable variable, cast to fp32 before
# the norm is taken for numerical stability.
l2_terms = [
    tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()
]
l2_loss = weight_decay * tf.add_n(l2_terms)
tf.summary.scalar('l2_loss', l2_loss)

loss = cross_entropy_mean + l2_loss
# Define the optimizer
def train():
    """Train the model, logging twice per epoch and checkpointing per epoch.

    CUDA_VISIBLE_DEVICES is set from FLAGS.gpuid. When FLAGS.startep > 0 the
    latest checkpoint in train_dir is restored and the loop counter resumes
    at the first step of the epoch encoded in the checkpoint file name; if
    no checkpoint is found the function prints a message and returns.
    Otherwise the model starts from freshly initialized variables.

    Checkpoints are written as 'model.ckpt-<epoch>' in train_dir, i.e. the
    numeric suffix is an epoch index, not a step index.

    Raises:
        AssertionError: if the loss becomes NaN.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.gpuid)

    # Whole steps per epoch. Floor division keeps the modulo tests below
    # exact on Python 3 (true division yields a float, and a fractional
    # value makes `step % step_per_epoch == 0` hold only at step 0).
    # Clamped to 1 so the modulo never divides by zero.
    step_per_epoch = max(1, num_training_images // FLAGS.batch_size)
    log_interval = max(1, step_per_epoch // 2)

    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels.
        images, labels = data_input.distorted_inputs(
            data_dir=FLAGS.data_dir,
            batch_size=FLAGS.batch_size,
            num_tags=num_tags)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = model.inference(images,
                                 is_training=True,
                                 num_classes=num_tags)

        # Calculate loss.
        loss = model.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = model.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        config = tf.ConfigProto()
        config.log_device_placement = FLAGS.log_device_placement
        config.gpu_options.allow_growth = True

        # Start running operations on the Graph.
        sess = tf.Session(config=config)
        sess.run(init)

        step_init = 0
        if FLAGS.startep > 0:
            ckpt = tf.train.get_checkpoint_state(train_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Restores from checkpoint.
                print('load from pretrained model')
                saver.restore(sess, ckpt.model_checkpoint_path)
                # The file-name suffix is the epoch index passed to
                # saver.save() below, so convert it back to a step index
                # before resuming the loop.
                saved_epoch = int(
                    ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
                step_init = saved_epoch * step_per_epoch
            else:
                print('No checkpoint file found')
                # No queue runners have been started yet, so the session
                # can be released safely here.
                sess.close()
                return
        else:
            print('random initialize the model')

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(train_dir, sess.graph)

        print('step per epoch: %d' % step_per_epoch)

        for step in np.arange(step_init, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % log_interval == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % step_per_epoch == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % step_per_epoch == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(train_dir, 'model.ckpt')
                epoch_num = int(step // step_per_epoch)
                saver.save(sess, checkpoint_path, global_step=epoch_num)

        # Flush any buffered summary events to disk.
        summary_writer.close()
def run_training(mode, num_classes, train_file, test_file):
    '''Train a ResNet classifier with one of three loss formulations.

    mode: 1-LGM loss, 2-softmax loss, 3-center loss
    num_classes: number of output classes passed to the inference builders.
    train_file: file name appended to FLAGS.data_dir; read through read_h5
        for the 'data' and 'label' datasets.
    test_file: file name appended to FLAGS.data_dir; handed to do_eval.

    Every 100 global steps the cross-entropy is printed; at global step 1
    and every 1000 global steps a checkpoint is written to FLAGS.model_dir
    and do_eval is run on the test file.

    Raises ValueError if mode is not 1, 2 or 3.
    '''
    # Fail fast: an unsupported mode would otherwise fall through every
    # branch below and only surface later as an undefined `logits`.
    if mode not in (1, 2, 3):
        raise ValueError(
            'mode must be 1 (LGM loss), 2 (softmax loss) or 3 (center loss); '
            'got %r' % (mode,))

    print('mode=%d' % mode)
    print('data_dir=%s' % FLAGS.data_dir)
    print('train file=%s' % train_file)

    with tf.Graph().as_default() as g:
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder, lr = placeholder_inputs(
            FLAGS.batch_size, _HEIGHT, _WIDTH, 3)

        # data
        img_train_h5 = read_h5(FLAGS.data_dir + train_file, 'data')
        label_train_h5 = read_h5(FLAGS.data_dir + train_file, 'label')

        ## Build a Graph that computes predictions from the inference model.
        is_training = True
        if mode == 1:
            # lgm loss
            logits, likelihood_reg, means = resnet_model.inference_lgm(
                images_placeholder, FLAGS.resnet_size, is_training,
                labels=labels_placeholder, num_classes=num_classes)
        elif mode == 2:
            # softmax
            logits = resnet_model.inference(
                images_placeholder, FLAGS.resnet_size, is_training,
                num_classes=num_classes)
        elif mode == 3:
            # center loss
            logits, likelihood_reg, centers, centers_op = (
                resnet_model.inference_center(
                    images_placeholder, FLAGS.resnet_size, is_training,
                    labels=labels_placeholder, loss_weight=0.0005,
                    num_classes=num_classes))

        ## Add to the Graph the Ops for loss calculation.
        cross_entropy = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=tf.to_int64(labels_placeholder),
                logits=logits,
                name='xentropy'))
        loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
            [tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        tf.summary.scalar('cross-entropy', cross_entropy)
        # Only the LGM and center-loss builders return a regularization term.
        if mode != 2:
            tf.summary.scalar('likelihood_reg', likelihood_reg)
            loss += likelihood_reg

        optimizer = tf.train.MomentumOptimizer(lr, _MOMENTUM,
                                               use_nesterov=True)
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Batch norm requires update ops to be added as a dependency to the
        # train_op. Work on a copy so the list obtained from the collection
        # is not modified when the center update op is appended.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        depend_ops = list(update_ops)
        if mode == 3:
            depend_ops.append(centers_op)
        with tf.control_dependencies(depend_ops):
            train_op = optimizer.minimize(loss, global_step)

        # evaluation
        # The evaluation towers are also built with is_training=True: the
        # original author notes that tf.layers.batch_normalization works
        # properly only when training=True, for reasons unknown.
        is_training = True
        if mode == 1:
            # lgm
            logits_eval, _, _ = resnet_model.inference_lgm(
                images_placeholder, FLAGS.resnet_size, is_training,
                reuse=True, num_classes=num_classes)
        elif mode == 2:
            # softmax
            logits_eval = resnet_model.inference(
                images_placeholder, FLAGS.resnet_size, is_training,
                reuse=True, num_classes=num_classes)
        elif mode == 3:
            # center loss
            # NOTE(review): three values are unpacked here while the training
            # call above unpacks four -- confirm what inference_center
            # returns when no labels are supplied.
            logits_eval, _, _ = resnet_model.inference_center(
                images_placeholder, FLAGS.resnet_size, is_training,
                reuse=True, num_classes=num_classes)
        correct = tf.nn.in_top_k(logits_eval, labels_placeholder, 1)
        eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

        init = tf.global_variables_initializer()
        saver = tf.train.Saver(max_to_keep=None)

        # Create a session for running Ops on the Graph.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
        sum_op = tf.summary.merge_all()

        sess.run(init)

        # load pre-trained (manual switch, disabled by default)
        if False:
            save_path = 'model.ckpt-10000'
            saver.restore(sess, save_path)
            print('\n[*]model loaded from %s\n' % save_path)

        # Start the training loop.
        print('use %d images to train' % _NUM_IMAGES['train'])
        print('trian epochs: %d' % FLAGS.train_epochs)
        steps_per_epoch = _NUM_IMAGES['train'] // FLAGS.batch_size
        lr_value = 0.1
        # From here on, any attempt to add ops to the graph raises.
        g.finalize()
        for epc in range(FLAGS.train_epochs):
            # shuffle
            idxArr = np.random.permutation(_NUM_IMAGES['train'])
            img_train = img_train_h5[idxArr]
            label_train = label_train_h5[idxArr]

            # Learning rate is divided by 10 on entering epochs 150 and 225.
            if epc in [150, 225]:
                lr_value *= 0.1
                print('lr changed to %f' % lr_value)

            for step in range(steps_per_epoch):
                start_time = time.time()

                # Fill a feed dictionary with the actual set of images and
                # labels for this particular training step.
                feed_dict = fill_feed_dict(img_train, label_train, step,
                                           FLAGS.batch_size, True,
                                           images_placeholder,
                                           labels_placeholder,
                                           MAX=_NUM_IMAGES['train'])
                feed_dict[lr] = lr_value

                _, crosse_entropy_, sum_str, gs = sess.run(
                    [train_op, cross_entropy, sum_op, global_step],
                    feed_dict=feed_dict)
                summary_writer.add_summary(sum_str, gs)
                summary_writer.flush()

                duration = time.time() - start_time

                # Print an overview fairly often.
                if gs % 100 == 0:
                    print(
                        '(Epoch %d) GlobalStep %d: loss = %.3f (%.3f sec/step)'
                        % (epc + 1, gs, crosse_entropy_, duration))

                # Save a checkpoint and evaluate the model periodically.
                if gs % 1000 == 0 or gs == 1:
                    checkpoint_file = os.path.join(FLAGS.model_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_file, global_step=gs)
                    print('model saved to %s' % checkpoint_file)

                    # Evaluate against the validation set.
                    print('Validation Data Eval:')
                    do_eval(sess, eval_correct, images_placeholder,
                            labels_placeholder, FLAGS.data_dir + test_file)