def inference_on_image(img_path):
    """Runs Inception inference on a single JPEG file and returns its logits."""
    with tf.Graph().as_default():
        num_classes = 1001

        coder = build_image_data.ImageCoder()

        image_buffer, _, _ = build_image_data._process_image(img_path, coder)

        image = image_processing.image_preprocessing(
            image_buffer, 0, False)  # image -> (299, 299, 3)
        image = tf.expand_dims(image, 0)  # (299, 299,3) -> (1, 299, 299, 3)

        logits, _ = inception.inference(image,
                                        num_classes,
                                        for_training=False,
                                        restore_logits=True)

        saver = tf.train.Saver()
        with tf.Session() as tf_session:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                if os.path.isabs(ckpt.model_checkpoint_path):
                    # Restores from checkpoint with absolute path.
                    saver.restore(tf_session, ckpt.model_checkpoint_path)
                    # saver.restore(tf_session, "/tmp/train_back/model.ckpt-640")
                else:
                    # Restores from checkpoint with relative path.
                    saver.restore(
                        tf_session,
                        os.path.join(FLAGS.checkpoint_dir,
                                     ckpt.model_checkpoint_path))
            l = tf_session.run([logits])
            return l
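

# A minimal usage sketch for the single-image helper above (assumptions: numpy is
# imported as np, FLAGS.checkpoint_dir points at a trained Inception v3
# checkpoint, and the JPEG path below is purely hypothetical).
def example_single_image_inference():
    logits = inference_on_image('/tmp/images/example.jpg')
    # logits is a list holding one (1, 1001) array; label 0 is the background class.
    predicted_class = np.argmax(logits[0], axis=1)[0]
    print('predicted class id: {}'.format(predicted_class))

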
def inference_on_multi_image():
    """Runs inference over img_file_list in mini-batches of batch_size images.

    Relies on the module-level globals my_image_path, img_file_list, file_size
    and batch_size, and returns one logits vector per image.
    """
    print("total number of images: {}".format(file_size))

    # Number of mini-batches needed to cover every file (ceiling division).
    total_batch_size = (file_size + batch_size - 1) // batch_size
    logit_list = []

    for n in range(total_batch_size):
        print("step :{} / {}".format(n + 1, total_batch_size))
        mini_batch = img_file_list[n * batch_size:(n + 1) * batch_size]
        # Empty float32 seed; preprocessed images are concatenated onto it below.
        mini_adarr = np.zeros(shape=(0, 299, 299, 3), dtype=np.float32)

        with tf.Graph().as_default():
            num_classes = 1001

            coder = build_image_data.ImageCoder()
            for image_name in mini_batch:
                image_buffer, _, _ = build_image_data._process_image(
                    my_image_path + image_name, coder)
                image = image_processing.image_preprocessing(
                    image_buffer, 0, False)  # image -> (299, 299, 3)
                image = tf.expand_dims(image, 0)  # (299, 299, 3) -> (1, 299, 299, 3)
                mini_adarr = tf.concat([mini_adarr, image], 0)

            logits, _ = inception.inference(mini_adarr,
                                            num_classes,
                                            for_training=False,
                                            restore_logits=True)

            saver = tf.train.Saver()
            with tf.Session() as tf_session:

                ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    if os.path.isabs(ckpt.model_checkpoint_path):
                        # Restores from checkpoint with absolute path.
                        saver.restore(tf_session, ckpt.model_checkpoint_path)
                        # saver.restore(tf_session, "/tmp/train_back/model.ckpt-640")
                    else:
                        # Restores from checkpoint with relative path.
                        saver.restore(
                            tf_session,
                            os.path.join(FLAGS.checkpoint_dir,
                                         ckpt.model_checkpoint_path))

                l = tf_session.run([logits])
                for ll in l[0]:
                    logit_list.append(ll)

    return logit_list
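

# A sketch of how the batched helper might be driven (a hypothetical wrapper:
# the image directory, extension filter and default batch size are assumptions,
# and the globals it sets are the ones inference_on_multi_image reads).
def example_batched_inference(image_dir='/tmp/images/', batch=32):
    global my_image_path, img_file_list, file_size, batch_size
    my_image_path = image_dir
    img_file_list = [f for f in os.listdir(image_dir) if f.endswith('.jpg')]
    file_size = len(img_file_list)
    batch_size = batch
    logits = inference_on_multi_image()  # one (1001,) logit vector per image
    return [int(np.argmax(logit)) for logit in logits]  # predicted class id per image
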
Example #3
def train(target, dataset, cluster_spec, ctx):
    """Train Inception on a dataset for a number of steps."""
    # The numbers of workers and parameter servers are inferred from the worker
    # and ps hosts strings.
    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])
    # If no value is given, num_replicas_to_aggregate defaults to the number of
    # workers.
    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    # Both must be greater than 0 in distributed training.
    assert num_workers > 0 and num_parameter_servers > 0, (
        'num_workers and num_parameter_servers must be > 0.')

    # Choose worker 0 as the chief. Note that any worker could be the chief
    # but there should be only one chief.
    is_chief = (FLAGS.task_id == 0)

    # Ops are assigned to worker by default.
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        # Variables and their related init/assign ops are assigned to ps.
        with slim.scopes.arg_scope(
            [slim.variables.variable, slim.variables.global_step],
                device=slim.variables.VariableDeviceChooser(
                    num_parameter_servers)):
            # Create a variable to count the number of train() calls. This equals the
            # number of updates applied to the variables.
            global_step = slim.variables.global_step()

            # Calculate the learning rate schedule.
            num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                     FLAGS.batch_size)
            # Decay steps need to be divided by the number of replicas to aggregate.
            decay_steps = int(num_batches_per_epoch *
                              FLAGS.num_epochs_per_decay /
                              num_replicas_to_aggregate)

            # Decay the learning rate exponentially based on the number of steps.
            lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                            global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            # Add a summary to track the learning rate.
            tf.summary.scalar('learning_rate', lr)

            # Create an optimizer that performs gradient descent.
            opt = tf.train.RMSPropOptimizer(lr,
                                            RMSPROP_DECAY,
                                            momentum=RMSPROP_MOMENTUM,
                                            epsilon=RMSPROP_EPSILON)

            if FLAGS.input_mode == 'spark':

                def feed_dict(mgr, batch_size):
                    tmp = TFNode.next_batch(mgr, batch_size)
                    # extract TFRecords, since tmp array is [(TFRecord, None)]
                    tfrecords = []
                    for elem in tmp:
                        tfrecords.append(str(elem[0]))
                    return tfrecords

                # One serialized TFRecord string per preprocessing slot is fed
                # from Spark each step.
                batch = tf.placeholder(
                    tf.string,
                    [FLAGS.batch_size // FLAGS.num_preprocess_threads])

                # The following is adapted from image_processing.py to remove Readers/QueueRunners.
                # Note: this removes the RandomShuffledQueue, so the incoming data is not shuffled.
                # Presumably, this could be done on the Spark side or done in additional TF code.
                examples = tf.unstack(batch)  # tf.unpack was renamed to tf.unstack in TF 1.0
                images, labels = [], []
                for example_serialized in examples:
                    for thread_id in range(FLAGS.num_preprocess_threads):
                        # Parse a serialized Example proto to extract the image and metadata.
                        image_buffer, label_index, bbox, _ = image_processing.parse_example_proto(
                            example_serialized)
                        image = image_processing.image_preprocessing(
                            image_buffer, bbox, True, thread_id)  # train-mode preprocessing
                        images.append(image)
                        labels.append(label_index)
                height = FLAGS.image_size
                width = FLAGS.image_size
                depth = 3
                images = tf.cast(images, tf.float32)
                images = tf.reshape(
                    images, shape=[FLAGS.batch_size, height, width, depth])
                tf.summary.image('images', images)
                labels = tf.reshape(labels, [FLAGS.batch_size])
            else:
                images, labels = image_processing.distorted_inputs(
                    dataset,
                    batch_size=FLAGS.batch_size,
                    num_preprocess_threads=FLAGS.num_preprocess_threads)

            # Number of classes in the Dataset label set plus 1.
            # Label 0 is reserved for an (unused) background class.
            num_classes = dataset.num_classes() + 1
            logits = inception.inference(images,
                                         num_classes,
                                         for_training=True)
            # Add classification loss.
            inception.loss(logits, labels)

            # Gather all of the losses including regularization losses.
            losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
            losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)

            total_loss = tf.add_n(losses, name='total_loss')

            if is_chief:
                # Compute the moving average of all individual losses and the
                # total loss.
                loss_averages = tf.train.ExponentialMovingAverage(0.9,
                                                                  name='avg')
                loss_averages_op = loss_averages.apply(losses + [total_loss])

                # Attach a scalar summary to all individual losses and the total
                # loss; do the same for the averaged version of the losses.
                for l in losses + [total_loss]:
                    loss_name = l.op.name
                    # Name each loss as '(raw)' and name the moving average version of the
                    # loss as the original loss name.
                    tf.summary.scalar(loss_name + ' (raw)', l)
                    tf.summary.scalar(loss_name, loss_averages.average(l))

                # Add dependency to compute loss_averages.
                with tf.control_dependencies([loss_averages_op]):
                    total_loss = tf.identity(total_loss)

            # Track the moving averages of all trainable variables.
            # Note that we maintain a 'double-average' of the BatchNormalization
            # global statistics.
            # This is not needed when the number of replicas is small, but it is
            # important for synchronous distributed training with tens of
            # workers/replicas.
            exp_moving_averager = tf.train.ExponentialMovingAverage(
                inception.MOVING_AVERAGE_DECAY, global_step)

            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())

            # Add histograms for model variables.
            for var in variables_to_average:
                tf.summary.histogram(var.op.name, var)

            # Create synchronous replica optimizer.
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                total_num_replicas=num_workers,
                variable_averages=exp_moving_averager,
                variables_to_average=variables_to_average)

            batchnorm_updates = tf.get_collection(
                slim.ops.UPDATE_OPS_COLLECTION)
            assert batchnorm_updates, 'Batchnorm updates are missing'
            batchnorm_updates_op = tf.group(*batchnorm_updates)
            # Add dependency to compute batchnorm_updates.
            with tf.control_dependencies([batchnorm_updates_op]):
                total_loss = tf.identity(total_loss)

            # Compute gradients with respect to the loss.
            grads = opt.compute_gradients(total_loss)

            # Add histograms for gradients.
            for grad, var in grads:
                if grad is not None:
                    tf.summary.histogram(var.op.name + '/gradients', grad)

            apply_gradients_op = opt.apply_gradients(grads,
                                                     global_step=global_step)

            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(total_loss, name='train_op')

            # Get the chief queue_runners and init_tokens, which are used to
            # synchronize replicas. More details can be found in
            # sync_replicas_optimizer.
            chief_queue_runners = [opt.get_chief_queue_runner()]
            init_tokens_op = opt.get_init_tokens_op()

            # Create a saver.
            saver = tf.train.Saver()

            # Build the summary operation based on the TF collection of Summaries.
            summary_op = tf.summary.merge_all()

            # Build an initialization operation to run below.
            init_op = tf.global_variables_initializer()

            # We run the summaries in the same thread as the training operations by
            # passing in None for summary_op to avoid a summary_thread being started.
            # Running summaries and training operations in parallel could run out of
            # GPU memory.
            summary_writer = tf.summary.FileWriter(
                "tensorboard_%d" % (ctx.worker_num),
                graph=tf.get_default_graph())
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=init_op,
                                     summary_op=None,
                                     global_step=global_step,
                                     summary_writer=summary_writer,
                                     saver=saver,
                                     save_model_secs=FLAGS.save_interval_secs)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)

            # Get a session.
            sess = sv.prepare_or_wait_for_session(target, config=sess_config)

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            tf.logging.info('Started %d queues for processing input data.',
                            len(queue_runners))

            if is_chief:
                sv.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)

            # Train, checking for NaNs. Concurrently run the summary operation at a
            # specified interval. Note that the summary_op and train_op never run
            # simultaneously in order to prevent running out of GPU memory.
            next_summary_time = time.time() + FLAGS.save_summaries_secs
            while not sv.should_stop():
                try:
                    start_time = time.time()
                    if FLAGS.input_mode == 'spark':
                        tmp = feed_dict(
                            ctx.mgr,
                            FLAGS.batch_size // FLAGS.num_preprocess_threads)
                        feed = {batch: tmp}
                        loss_value, step = sess.run([train_op, global_step],
                                                    feed_dict=feed)
                    else:
                        loss_value, step = sess.run([train_op, global_step])
                    assert not np.isnan(
                        loss_value), 'Model diverged with loss = NaN'
                    if step > FLAGS.max_steps:
                        break
                    duration = time.time() - start_time

                    if step % 30 == 0:
                        examples_per_sec = FLAGS.batch_size / float(duration)
                        format_str = ('Worker %d: %s: step %d, loss = %.2f '
                                      '(%.1f examples/sec; %.3f sec/batch)')
                        tf.logging.info(
                            format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))

                    # Determine if the summary_op should be run on the chief worker.
                    if (FLAGS.input_mode == 'tf' and is_chief and
                            next_summary_time < time.time()):
                        tf.logging.info(
                            'Running Summary operation on the chief.')
                        summary_str = sess.run(summary_op)
                        sv.summary_computed(sess, summary_str)
                        tf.logging.info('Finished running Summary operation.')

                        # Determine the next time for running the summary.
                        next_summary_time += FLAGS.save_summaries_secs
                except:
                    if is_chief:
                        tf.logging.info('About to execute sync_clean_up_op!')
                    raise

            # Stop the TFNode data feed
            if FLAGS.input_mode == 'spark':
                TFNode.terminate(ctx.mgr)

            # Stop the supervisor.  This also waits for service threads to finish.
            sv.stop()

            # Save after the training ends.
            if is_chief:
                saver.save(sess,
                           os.path.join(FLAGS.train_dir, 'model.ckpt'),
                           global_step=global_step)
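

# A rough sketch of how this train() function could be launched from a
# TensorFlowOnSpark map function. The ctx attribute names (cluster_spec,
# job_name, task_index) and the ImagenetData dataset wrapper are assumptions
# inferred from the surrounding code, not a verbatim copy of the original
# launcher script.
def map_fun(args, ctx):
    cluster_spec = tf.train.ClusterSpec(ctx.cluster_spec)
    server = tf.train.Server(cluster_spec,
                             job_name=ctx.job_name,
                             task_index=ctx.task_index)
    if ctx.job_name == 'ps':
        # Parameter servers only host variables; they block here forever.
        server.join()
    else:
        dataset = ImagenetData(subset='train')  # assumed dataset class from the Inception repo
        train(server.target, dataset, cluster_spec, ctx)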