Example #1
def eval_inception():
    """Evalautes the inception model."""
    g = tf.Graph()
    with g.as_default():
        # pylint: disable=line-too-long
        images, one_hot_labels, num_samples, num_of_classes = cifar_data_provider.provide_cifarnet_data(
            FLAGS.dataset_name,
            FLAGS.split_name,
            FLAGS.batch_size,
            dataset_dir=FLAGS.data_dir,
            num_epochs=None)

        # Define the model:
        logits, end_points = inception_model.cifarnet(images,
                                                      num_of_classes,
                                                      is_training=False,
                                                      dropout_keep_prob=1.0)
        images.set_shape([FLAGS.batch_size, 32, 32, 3])

        predictions = tf.argmax(end_points['Predictions'], 1)

        total_loss = tf.nn.softmax_cross_entropy_with_logits(
            labels=one_hot_labels, logits=logits)
        total_loss = tf.reduce_mean(total_loss, name='xent')
        slim.summaries.add_scalar_summary(total_loss,
                                          'total_loss',
                                          print_summary=True)

        # Define the metrics:
        labels = tf.argmax(one_hot_labels, 1)
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
            'accuracy':
            tf.metrics.accuracy(labels, predictions),
        })
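        # aggregate_metric_map splits each (value, update_op) metric pair into
        # the two dicts used below: values feed summaries, update ops feed the
        # evaluation loop.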

        for name, value in names_to_values.items():
            slim.summaries.add_scalar_summary(value,
                                              name,
                                              prefix='eval',
                                              print_summary=True)

        # This ensures that we make a single pass over all of the data.
        num_batches = int(math.ceil(num_samples / float(FLAGS.batch_size)))

        # Limit gpu memory to run train and eval on the same gpu
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.45

        slim.evaluation.evaluation_loop(
            master=FLAGS.master,
            checkpoint_dir=FLAGS.checkpoint_dir,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            session_config=config,
            eval_op=list(names_to_updates.values()),
            eval_interval_secs=FLAGS.eval_interval_secs)
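
The function above only builds the evaluation graph and hands it to slim.evaluation.evaluation_loop; it still needs flag definitions and a driver to run. A minimal, hypothetical driver is sketched below: the flag names mirror the FLAGS attributes referenced in eval_inception, but the defaults and the tf.app.run() wiring are assumptions, not part of the original source.

# Hypothetical driver for eval_inception(); flag defaults are illustrative only.
import tensorflow as tf

flags = tf.app.flags
FLAGS = flags.FLAGS

flags.DEFINE_string('master', '', 'Address of the TensorFlow master to use.')
flags.DEFINE_string('dataset_name', 'cifar10', 'Dataset to evaluate on.')
flags.DEFINE_string('split_name', 'test', 'Data split to evaluate.')
flags.DEFINE_string('data_dir', '', 'Directory containing the dataset.')
flags.DEFINE_string('checkpoint_dir', '', 'Directory holding training checkpoints.')
flags.DEFINE_string('eval_dir', '', 'Directory to write evaluation summaries.')
flags.DEFINE_integer('batch_size', 100, 'Evaluation batch size.')
flags.DEFINE_integer('eval_interval_secs', 600, 'Seconds between evaluation runs.')


def main(_):
    eval_inception()


if __name__ == '__main__':
    tf.app.run()
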
Example #2
def train_inception_mentornet(max_step_run):
  """Trains the mentornet with the student inception model.

  Args:
    max_step_run: The maximum number of gradient steps.
  """
  if not os.path.exists(FLAGS.train_log_dir):
    os.makedirs(FLAGS.train_log_dir)
  g = tf.Graph()

  with g.as_default():
    # If ps_tasks is zero, the local device is used. When using multiple
    # (non-local) replicas, the ReplicaDeviceSetter distributes the variables
    # across the different devices.
    with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
      config = tf.ConfigProto()
      # limit gpu memory to run train and eval on the same gpu
      config.gpu_options.per_process_gpu_memory_fraction = 0.8

      tf_global_step = tf.train.get_or_create_global_step()

      # pylint: disable=line-too-long
      images, one_hot_labels, num_samples_per_epoch, num_of_classes = cifar_data_provider.provide_cifarnet_data(
          FLAGS.dataset_name,
          'train',
          FLAGS.batch_size,
          dataset_dir=FLAGS.data_dir)

      images.set_shape([FLAGS.batch_size, 32, 32, 3])
      tf.logging.info('num_of_example=%s', num_samples_per_epoch)

      # Define the model:
      with slim.arg_scope(
          inception_model.cifarnet_arg_scope(weight_decay=0.004)):
        logits, _ = inception_model.cifarnet(
            images, num_of_classes, is_training=True, dropout_keep_prob=0.8)

      # Specify the loss function:
      loss = tf.nn.softmax_cross_entropy_with_logits(
          labels=one_hot_labels, logits=logits)

      dropout_rates = utils.parse_dropout_rate_list(FLAGS.example_dropout_rates)
      example_dropout_rates = tf.convert_to_tensor(
          dropout_rates, np.float32, name='example_dropout_rates')

      loss_p_percentile = tf.convert_to_tensor(
          np.array([FLAGS.loss_p_percentile] * 100),
          np.float32,
          name='loss_p_percentile')

      epoch_step = tf.to_int32(
          tf.floor(tf.divide(tf_global_step, max_step_run) * 100))
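      # epoch_step maps training progress onto an integer in [0, 100), which
      # MentorNet uses as its percentage-based "epoch" index.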

      zero_labels = tf.zeros([tf.shape(loss)[0], 1], tf.float32)

      loss = tf.reshape(loss, [-1, 1])
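      # utils.mentornet returns a per-example weight v for the reshaped loss;
      # v is multiplied into the loss below to form weighted_loss_vector.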

      v = utils.mentornet(
          epoch_step,
          loss,
          zero_labels,
          loss_p_percentile,
          example_dropout_rates,
          burn_in_epoch=FLAGS.burn_in_epoch,
          fixed_epoch_after_burn_in=FLAGS.fixed_epoch_after_burn_in,
          loss_moving_average_decay=FLAGS.loss_moving_average_decay)

      # Prevent gradients of the weighted loss from flowing back into
      # MentorNet through the predicted weights.
      v = tf.stop_gradient(v)

      # log data utilization
      data_util = utils.summarize_data_utilization(v, tf_global_step,
                                                   FLAGS.batch_size)

      weighted_loss_vector = tf.multiply(loss, v)

      weighted_loss = tf.reduce_mean(weighted_loss_vector)

      slim.summaries.add_scalar_summary(
          tf.reduce_mean(loss), 'mentornet/orig_loss')
      slim.summaries.add_scalar_summary(weighted_loss,
                                        'mentornet/weighted_loss')

      # The weight-decay (regularization) loss is omitted here (set to 0), so
      # the total loss is just the MentorNet-weighted cross-entropy.
      weighted_decay_loss = 0
      weighted_total_loss = weighted_loss + weighted_decay_loss

      slim.summaries.add_scalar_summary(weighted_total_loss,
                                        'mentornet/total_loss')

      slim.summaries.add_scalar_summary(weighted_total_loss, 'total_loss')
      tf.add_to_collection('total_loss', weighted_total_loss)

      decay_steps = int(
          num_samples_per_epoch / FLAGS.batch_size * FLAGS.num_epochs_per_decay)

      lr = tf.train.exponential_decay(
          FLAGS.learning_rate,
          tf_global_step,
          decay_steps,
          FLAGS.learning_rate_decay_factor,
          staircase=True)
      slim.summaries.add_scalar_summary(lr, 'learning_rate', print_summary=True)

      with tf.control_dependencies([weighted_total_loss, data_util]):
        # Set up training.
        trainable_variables = tf.trainable_variables()
        trainable_variables = tf.contrib.framework.filter_variables(
            trainable_variables, exclude_patterns=['mentornet'])

        # Specify the optimization scheme:
        optimizer = tf.train.GradientDescentOptimizer(lr)
        train_op = slim.learning.create_train_op(
            weighted_total_loss,
            optimizer,
            variables_to_train=trainable_variables)

      # Restore setup
      ckpt_model = None
      if FLAGS.trained_mentornet_dir is not None:
        ckpt_model = FLAGS.trained_mentornet_dir
        if os.path.isdir(FLAGS.trained_mentornet_dir):
          ckpt_model = tf.train.latest_checkpoint(ckpt_model)

        # fix the mentornet parameters
        variables_to_restore = slim.get_variables_to_restore(
            # TODO(lujiang): mentornet_inputs or mentor_inputs?
            include=['mentornet', 'mentornet_inputs'])
        iassign_op1, ifeed_dict1 = tf.contrib.framework.assign_from_checkpoint(
            ckpt_model, variables_to_restore)

        # Create an initial assignment function.
        def init_assign_fn(sess):
          tf.logging.info('Restore using custom initializer %s', '.' * 10)
          sess.run(iassign_op1, ifeed_dict1)
      else:
        init_assign_fn = None

      tf.logging.info('-' * 20 + 'MentorNet' + '-' * 20)
      tf.logging.info('loaded pretrained mentornet from %s', ckpt_model)
      tf.logging.info('loss_p_percentile=%.3f', FLAGS.loss_p_percentile)
      tf.logging.info('burn_in_epoch=%d', FLAGS.burn_in_epoch)
      tf.logging.info('fixed_epoch_after_burn_in=%s',
                      FLAGS.fixed_epoch_after_burn_in)
      tf.logging.info('loss_moving_average_decay=%.3f',
                      FLAGS.loss_moving_average_decay)
      tf.logging.info('example_dropout_rates %s', ','.join(
          str(t) for t in dropout_rates))
      tf.logging.info('-' * 20)

      saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=5)

      # Run training.
      slim.learning.train(
          train_op=train_op,
          logdir=FLAGS.train_log_dir,
          master=FLAGS.master,
          is_chief=FLAGS.task == 0,
          saver=saver,
          session_config=config,
          number_of_steps=max_step_run,
          init_fn=init_assign_fn,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs)
def train_inception_baseline(max_step_run):
    """Trains the inception baseline model.

    Args:
      max_step_run: The maximum number of gradient steps.
    """
    if not os.path.exists(FLAGS.train_log_dir):
        os.makedirs(FLAGS.train_log_dir)
    g = tf.Graph()

    with g.as_default():
        # If ps_tasks is zero, the local device is used. When using multiple
        # (non-local) replicas, the ReplicaDeviceSetter distributes the variables
        # across the different devices.
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            config = tf.ConfigProto()
            # Limit gpu memory to run train and eval on the same gpu
            config.gpu_options.per_process_gpu_memory_fraction = 0.45

            tf_global_step = tf.train.get_or_create_global_step()

            # pylint: disable=line-too-long
            images, one_hot_labels, num_samples_per_epoch, num_of_classes = cifar_data_provider.provide_cifarnet_data(
                FLAGS.dataset_name,
                'train',
                FLAGS.batch_size,
                dataset_dir=FLAGS.data_dir)

            tf.logging.info('num_of_example={}'.format(num_samples_per_epoch))
            # Define the model:
            with slim.arg_scope(
                    inception_model.cifarnet_arg_scope(weight_decay=0.004)):
                logits, _ = inception_model.cifarnet(images,
                                                     num_of_classes,
                                                     is_training=True,
                                                     dropout_keep_prob=0.8)

            # Specify the loss function:
            total_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=one_hot_labels, logits=logits)
            total_loss = tf.reduce_mean(total_loss)

            tf.contrib.deprecated.scalar_summary('Total Loss', total_loss)

            decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                              FLAGS.num_epochs_per_decay)

            lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                            tf_global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            slim.summaries.add_scalar_summary(lr,
                                              'learning_rate',
                                              print_summary=True)

            # Specify the optimization scheme:
            optimizer = tf.train.GradientDescentOptimizer(lr)

            # Set up training.
            train_op = slim.learning.create_train_op(total_loss, optimizer)

            # Run training.
            slim.learning.train(train_op=train_op,
                                logdir=FLAGS.train_log_dir,
                                master=FLAGS.master,
                                is_chief=FLAGS.task == 0,
                                session_config=config,
                                number_of_steps=max_step_run,
                                save_summaries_secs=FLAGS.save_summaries_secs,
                                save_interval_secs=FLAGS.save_interval_secs)
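
The snippets above all assume the same module-level setup. One plausible arrangement is sketched below; the import paths for cifar_data_provider, inception_model, and utils, the slim alias, the max_number_of_steps flag, and the dispatch on trained_mentornet_dir are assumptions inferred from how the functions use these names, and the remaining flag definitions are omitted.

# Plausible module header and entry point for the functions above; import
# paths, the dispatch logic, and max_number_of_steps are assumptions.
import math
import os

import numpy as np
import tensorflow as tf

import cifar_data_provider  # provide_cifarnet_data(...)
import inception_model      # cifarnet(...), cifarnet_arg_scope(...)
import utils                # mentornet(...), parse_dropout_rate_list(...), etc.

slim = tf.contrib.slim
FLAGS = tf.app.flags.FLAGS


def main(_):
    # Hypothetical dispatch: train the MentorNet-weighted student when a
    # pretrained MentorNet checkpoint directory is supplied, else the baseline.
    if FLAGS.trained_mentornet_dir:
        train_inception_mentornet(FLAGS.max_number_of_steps)
    else:
        train_inception_baseline(FLAGS.max_number_of_steps)


if __name__ == '__main__':
    tf.app.run()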