def eval_inception():
  """Evaluates the inception model."""
  g = tf.Graph()
  with g.as_default():
    # pylint: disable=line-too-long
    images, one_hot_labels, num_samples, num_of_classes = cifar_data_provider.provide_cifarnet_data(
        FLAGS.dataset_name,
        FLAGS.split_name,
        FLAGS.batch_size,
        dataset_dir=FLAGS.data_dir,
        num_epochs=None)

    # Define the model:
    logits, end_points = inception_model.cifarnet(
        images, num_of_classes, is_training=False, dropout_keep_prob=1.0)

    images.set_shape([FLAGS.batch_size, 32, 32, 3])

    predictions = tf.argmax(end_points['Predictions'], 1)

    total_loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=one_hot_labels, logits=logits)
    total_loss = tf.reduce_mean(total_loss, name='xent')
    slim.summaries.add_scalar_summary(
        total_loss, 'total_loss', print_summary=True)

    # Define the metrics:
    labels = tf.argmax(one_hot_labels, 1)
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'accuracy': tf.metrics.accuracy(predictions, labels),
    })

    for name, value in names_to_values.items():
      slim.summaries.add_scalar_summary(
          value, name, prefix='eval', print_summary=True)

    # This ensures that we make a single pass over all of the data.
    num_batches = int(math.ceil(num_samples / float(FLAGS.batch_size)))

    # Limit GPU memory so train and eval can run on the same GPU.
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.45

    slim.evaluation.evaluation_loop(
        master=FLAGS.master,
        checkpoint_dir=FLAGS.checkpoint_dir,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        session_config=config,
        eval_op=list(names_to_updates.values()),
        eval_interval_secs=FLAGS.eval_interval_secs)
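
# Minimal sketch (not part of the original file) of how the streaming accuracy
# metric used in eval_inception behaves: tf.metrics.accuracy returns a
# (value, update_op) pair backed by local-variable counters, and the value only
# reflects batches whose update_op has already been run. evaluation_loop runs
# the update ops num_batches times per checkpoint before reading the values.
# The function name and constants below are illustrative.
def _streaming_accuracy_demo():
  preds = tf.constant([0, 1, 1, 0], tf.int64)
  labels = tf.constant([0, 1, 0, 0], tf.int64)
  value, update_op = tf.metrics.accuracy(labels, preds)
  with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())  # metric counters are local vars
    sess.run(update_op)                         # accumulate one batch
    print(sess.run(value))                      # 0.75
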
def train_inception_mentornet(max_step_run):
  """Trains the MentorNet with the student inception model.

  Args:
    max_step_run: The maximum number of gradient steps.
  """
  if not os.path.exists(FLAGS.train_log_dir):
    os.makedirs(FLAGS.train_log_dir)
  g = tf.Graph()

  with g.as_default():
    # If ps_tasks is zero, the local device is used. When using multiple
    # (non-local) replicas, the ReplicaDeviceSetter distributes the variables
    # across the different devices.
    with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
      config = tf.ConfigProto()
      # Limit GPU memory so train and eval can run on the same GPU.
      config.gpu_options.per_process_gpu_memory_fraction = 0.8

      tf_global_step = tf.train.get_or_create_global_step()

      # pylint: disable=line-too-long
      images, one_hot_labels, num_samples_per_epoch, num_of_classes = cifar_data_provider.provide_cifarnet_data(
          FLAGS.dataset_name,
          'train',
          FLAGS.batch_size,
          dataset_dir=FLAGS.data_dir)

      images.set_shape([FLAGS.batch_size, 32, 32, 3])
      tf.logging.info('num_of_example=%s', num_samples_per_epoch)

      # Define the model:
      with slim.arg_scope(
          inception_model.cifarnet_arg_scope(weight_decay=0.004)):
        logits, _ = inception_model.cifarnet(
            images, num_of_classes, is_training=True, dropout_keep_prob=0.8)

      # Specify the loss function:
      loss = tf.nn.softmax_cross_entropy_with_logits(
          labels=one_hot_labels, logits=logits)

      dropout_rates = utils.parse_dropout_rate_list(FLAGS.example_dropout_rates)
      example_dropout_rates = tf.convert_to_tensor(
          dropout_rates, np.float32, name='example_dropout_rates')

      loss_p_percentile = tf.convert_to_tensor(
          np.array([FLAGS.loss_p_percentile] * 100),
          np.float32,
          name='loss_p_percentile')

      epoch_step = tf.to_int32(
          tf.floor(tf.divide(tf_global_step, max_step_run) * 100))

      zero_labels = tf.zeros([tf.shape(loss)[0], 1], tf.float32)

      loss = tf.reshape(loss, [-1, 1])

      # MentorNet emits a per-example weight v for the student loss.
      v = utils.mentornet(
          epoch_step,
          loss,
          zero_labels,
          loss_p_percentile,
          example_dropout_rates,
          burn_in_epoch=FLAGS.burn_in_epoch,
          fixed_epoch_after_burn_in=FLAGS.fixed_epoch_after_burn_in,
          loss_moving_average_decay=FLAGS.loss_moving_average_decay)

      # Do not backpropagate into MentorNet through the weights.
      v = tf.stop_gradient(v)

      # Log data utilization.
      data_util = utils.summarize_data_utilization(v, tf_global_step,
                                                   FLAGS.batch_size)

      weighted_loss_vector = tf.multiply(loss, v)
      weighted_loss = tf.reduce_mean(weighted_loss_vector)

      slim.summaries.add_scalar_summary(
          tf.reduce_mean(loss), 'mentornet/orig_loss')
      slim.summaries.add_scalar_summary(weighted_loss,
                                        'mentornet/weighted_loss')

      # Normalize the decay loss based on v.
      weighed_decay_loss = 0
      weighted_total_loss = weighted_loss + weighed_decay_loss

      slim.summaries.add_scalar_summary(weighted_total_loss,
                                        'mentornet/total_loss')
      slim.summaries.add_scalar_summary(weighted_total_loss, 'total_loss')
      tf.add_to_collection('total_loss', weighted_total_loss)

      decay_steps = int(
          num_samples_per_epoch / FLAGS.batch_size * FLAGS.num_epochs_per_decay)
      lr = tf.train.exponential_decay(
          FLAGS.learning_rate,
          tf_global_step,
          decay_steps,
          FLAGS.learning_rate_decay_factor,
          staircase=True)

      slim.summaries.add_scalar_summary(
          lr, 'learning_rate', print_summary=True)

      with tf.control_dependencies([weighted_total_loss, data_util]):
        # Set up training.
        trainable_variables = tf.trainable_variables()
        trainable_variables = tf.contrib.framework.filter_variables(
            trainable_variables, exclude_patterns=['mentornet'])

        # Specify the optimization scheme:
        optimizer = tf.train.GradientDescentOptimizer(lr)
        train_op = slim.learning.create_train_op(
            weighted_total_loss,
            optimizer,
            variables_to_train=trainable_variables)

      # Restore setup.
      if FLAGS.trained_mentornet_dir is not None:
        ckpt_model = FLAGS.trained_mentornet_dir
        if os.path.isdir(FLAGS.trained_mentornet_dir):
          ckpt_model = tf.train.latest_checkpoint(ckpt_model)

        # Fix the MentorNet parameters.
        variables_to_restore = slim.get_variables_to_restore(
            # TODO(lujiang): mentornet_inputs or mentor_inputs?
            include=['mentornet', 'mentornet_inputs'])
        iassign_op1, ifeed_dict1 = tf.contrib.framework.assign_from_checkpoint(
            ckpt_model, variables_to_restore)

        # Create an initial assignment function.
        def init_assign_fn(sess):
          tf.logging.info('Restore using customer initializer %s', '.' * 10)
          sess.run(iassign_op1, ifeed_dict1)
      else:
        init_assign_fn = None

      tf.logging.info('-' * 20 + 'MentorNet' + '-' * 20)
      tf.logging.info('loaded pretrained mentornet from %s', ckpt_model)
      tf.logging.info('loss_p_percentile=%3f', FLAGS.loss_p_percentile)
      tf.logging.info('burn_in_epoch=%d', FLAGS.burn_in_epoch)
      tf.logging.info('fixed_epoch_after_burn_in=%s',
                      FLAGS.fixed_epoch_after_burn_in)
      tf.logging.info('loss_moving_average_decay=%3f',
                      FLAGS.loss_moving_average_decay)
      tf.logging.info('example_dropout_rates %s',
                      ','.join(str(t) for t in dropout_rates))
      tf.logging.info('-' * 20)

      saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=5)

      # Run training.
      slim.learning.train(
          train_op=train_op,
          logdir=FLAGS.train_log_dir,
          master=FLAGS.master,
          is_chief=FLAGS.task == 0,
          saver=saver,
          session_config=config,
          number_of_steps=max_step_run,
          init_fn=init_assign_fn,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs)
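
# Minimal sketch (not part of the original file) of the reweighting step in
# train_inception_mentornet: MentorNet emits a per-example weight v in [0, 1],
# the student's per-example loss is multiplied by v, and the gradient is
# stopped so the pretrained MentorNet itself is not updated. The function name
# and constants below are illustrative; in the real graph v comes from
# utils.mentornet.
def _mentornet_reweighting_demo():
  # Per-example cross-entropy losses, shape [batch_size, 1].
  example_loss = tf.constant([[0.2], [2.5], [0.9]], tf.float32)
  # Weights a MentorNet might output; a noisy/hard example gets a small weight.
  v = tf.constant([[1.0], [0.1], [0.8]], tf.float32)
  v = tf.stop_gradient(v)  # do not backprop into MentorNet
  weighted_loss = tf.reduce_mean(example_loss * v)
  with tf.Session() as sess:
    print(sess.run(weighted_loss))  # mean of [0.2, 0.25, 0.72] = 0.39
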
def train_inception_baseline(max_step_run):
  """Trains the inception baseline model.

  Args:
    max_step_run: The maximum number of gradient steps.
  """
  if not os.path.exists(FLAGS.train_log_dir):
    os.makedirs(FLAGS.train_log_dir)
  g = tf.Graph()

  with g.as_default():
    # If ps_tasks is zero, the local device is used. When using multiple
    # (non-local) replicas, the ReplicaDeviceSetter distributes the variables
    # across the different devices.
    with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
      config = tf.ConfigProto()
      # Limit GPU memory so train and eval can run on the same GPU.
      config.gpu_options.per_process_gpu_memory_fraction = 0.45

      tf_global_step = tf.train.get_or_create_global_step()

      # pylint: disable=line-too-long
      images, one_hot_labels, num_samples_per_epoch, num_of_classes = cifar_data_provider.provide_cifarnet_data(
          FLAGS.dataset_name,
          'train',
          FLAGS.batch_size,
          dataset_dir=FLAGS.data_dir)

      tf.logging.info('num_of_example={}'.format(num_samples_per_epoch))

      # Define the model:
      with slim.arg_scope(
          inception_model.cifarnet_arg_scope(weight_decay=0.004)):
        logits, _ = inception_model.cifarnet(
            images, num_of_classes, is_training=True, dropout_keep_prob=0.8)

      # Specify the loss function:
      total_loss = tf.nn.softmax_cross_entropy_with_logits(
          labels=one_hot_labels, logits=logits)
      total_loss = tf.reduce_mean(total_loss)
      tf.contrib.deprecated.scalar_summary('Total Loss', total_loss)

      decay_steps = int(
          num_samples_per_epoch / FLAGS.batch_size * FLAGS.num_epochs_per_decay)
      lr = tf.train.exponential_decay(
          FLAGS.learning_rate,
          tf_global_step,
          decay_steps,
          FLAGS.learning_rate_decay_factor,
          staircase=True)

      slim.summaries.add_scalar_summary(
          lr, 'learning_rate', print_summary=True)

      # Specify the optimization scheme:
      optimizer = tf.train.GradientDescentOptimizer(lr)

      # Set up training.
      train_op = slim.learning.create_train_op(total_loss, optimizer)

      # Run training.
      slim.learning.train(
          train_op=train_op,
          logdir=FLAGS.train_log_dir,
          master=FLAGS.master,
          is_chief=FLAGS.task == 0,
          session_config=config,
          number_of_steps=max_step_run,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs)
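
# A minimal, hypothetical entry point showing how the three functions above
# could be wired together. The flag names FLAGS.is_eval and
# FLAGS.max_number_of_steps, and the dispatch rule (use MentorNet training
# whenever a trained MentorNet checkpoint directory is given), are
# illustrative assumptions, not part of the original script.
def main(_):
  if FLAGS.is_eval:
    # Continuous evaluation against checkpoints written by the trainer.
    eval_inception()
  elif FLAGS.trained_mentornet_dir:
    # Train the student with MentorNet-weighted losses.
    train_inception_mentornet(FLAGS.max_number_of_steps)
  else:
    # Plain cross-entropy baseline.
    train_inception_baseline(FLAGS.max_number_of_steps)


if __name__ == '__main__':
  tf.app.run()
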