Example 1
  def _benchmark_eager_apply(self,
                             label,
                             device_and_format,
                             defun=False,
                             execution_mode=None,
                             compiled=False):
    config = config_.get_hparams_imagenet_56()
    with tfe.execution_mode(execution_mode):
      device, data_format = device_and_format
      model = revnet.RevNet(config=config)
      if defun:
        model.call = tfe.defun(model.call, compiled=compiled)
      batch_size = 64
      num_burn = 5
      num_iters = 10
      with tf.device(device):
        images, _ = random_batch(batch_size, config)
        # Burn-in: untimed forward passes absorb one-time costs such as
        # function tracing before the measured loop below.
        for _ in range(num_burn):
          model(images, training=False)
        if execution_mode:
          tfe.async_wait()  # Drain pending ops when running asynchronously.
        gc.collect()
        start = time.time()
        for _ in range(num_iters):
          model(images, training=False)
        if execution_mode:
          tfe.async_wait()
        self._report(label, start, num_iters, device, batch_size, data_format)
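
These examples are excerpted from a larger test/benchmark module and rely on imports and helpers defined elsewhere. A minimal sketch of what they assume follows; the import paths presume the TensorFlow contrib RevNet example layout, and random_batch is reconstructed from its call sites rather than copied from the source:

import gc
import time

import tensorflow as tf
import tensorflow.contrib.eager as tfe

from tensorflow.contrib.eager.python.examples.revnet import config as config_
from tensorflow.contrib.eager.python.examples.revnet import revnet


def random_batch(batch_size, config):
  # Random (images, labels) matching the model's input spec; labels are
  # integer class ids in [0, n_classes), mirroring the setUp fixture below.
  shape = (batch_size,) + config.input_shape
  images = tf.random_uniform(shape)
  labels = tf.random_uniform(
      [batch_size], minval=0, maxval=config.n_classes, dtype=tf.int32)
  return images, labels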
Example 2
  def setUp(self):
    # Shared fixture: a RevNet model plus a random batch of images (self.x)
    # and integer class labels (self.t) sized from the hparams.
    super(RevnetTest, self).setUp()
    config = config_.get_hparams_imagenet_56()
    shape = (config.batch_size,) + config.input_shape
    self.model = revnet.RevNet(config=config)
    self.x = tf.random_normal(shape=shape)
    self.t = tf.random_uniform(
        shape=[config.batch_size],
        minval=0,
        maxval=config.n_classes,
        dtype=tf.int32)
    self.config = config
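
A test method can then exercise the fixture directly; the method below is an illustrative sketch, not copied from the source:

  def test_forward_pass(self):
    # A single forward pass over the random batch built in setUp.
    self.model(self.x, training=False)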
Example 3
  def _benchmark_eager_train(self,
                             label,
                             make_iterator,
                             device_and_format,
                             defun=False,
                             execution_mode=None,
                             compiled=False):
    config = config_.get_hparams_imagenet_56()
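    # The base hparams don't include dataset metadata; add what the benchmark needs.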
    config.add_hparam("n_classes", 1000)
    config.add_hparam("dataset", "ImageNet")
    with tfe.execution_mode(execution_mode):
      device, data_format = device_and_format
      for batch_size in self._train_batch_sizes():
        (images, labels) = random_batch(batch_size, config)
        model = revnet.RevNet(config=config)
        optimizer = tf.train.GradientDescentOptimizer(0.1)
        if defun:
          model.call = tfe.defun(model.call)

        num_burn = 3
        num_iters = 10
        with tf.device(device):
          iterator = make_iterator((images, labels))
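          # Untimed burn-in steps so the timed loop below measures steady state.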
          for _ in range(num_burn):
            (images, labels) = iterator.next()
            train_one_iter(model, images, labels, optimizer)
          if execution_mode:
            tfe.async_wait()
          self._force_device_sync()
          gc.collect()

          start = time.time()
          for _ in range(num_iters):
            (images, labels) = iterator.next()
            train_one_iter(model, images, labels, optimizer)
          if execution_mode:
            tfe.async_wait()
          self._force_device_sync()
          self._report(label, start, num_iters, device, batch_size, data_format)
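
The training benchmark also calls a module-level train_one_iter helper (make_iterator, _train_batch_sizes, _force_device_sync, and _report are harness plumbing and are not sketched here). A plausible sketch of train_one_iter, assuming the model exposes a compute_gradients convenience; the exact signature is an assumption, not copied from the source:

def train_one_iter(model, inputs, labels, optimizer, global_step=None):
  # One optimization step. A RevNet recomputes intermediate activations
  # during the backward pass instead of storing them, so the gradient
  # computation lives on the model rather than on a plain tf.GradientTape.
  grads, vars_ = model.compute_gradients(inputs, labels, training=True)
  optimizer.apply_gradients(zip(grads, vars_), global_step=global_step)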
Example 4
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  # RevNet specific configuration
  revnet_config = {
      "revnet-56": config_.get_hparams_imagenet_56(),
      "revnet-104": config_.get_hparams_imagenet_104()
  }[FLAGS.revnet_config]

  if FLAGS.use_tpu:
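    # Convolutions on TPU are generally fastest in NHWC, so force channels_last.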
    revnet_config.data_format = "channels_last"

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  # Estimator specific configuration
  config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.
          PER_HOST_V2),
  )

  # Input pipelines are slightly different (with regard to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train, imagenet_eval = [
      imagenet_input.ImageNetInput(
          is_training=is_training,
          data_dir=FLAGS.data_dir,
          transpose_input=FLAGS.transpose_input,
          use_bfloat16=False) for is_training in [True, False]
  ]

  revnet_classifier = tf.contrib.tpu.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=revnet_config.tpu_batch_size,
      eval_batch_size=revnet_config.tpu_eval_batch_size,
      config=config,
      export_to_tpu=False,
      params={"revnet_config": revnet_config})

  steps_per_epoch = revnet_config.tpu_iters_per_epoch
  eval_steps = revnet_config.tpu_eval_steps

  # pylint: disable=protected-access
  if FLAGS.mode == "eval":
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info("Starting to evaluate.")
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = revnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info("Eval results: %s. Elapsed seconds: %d" %
                        (eval_results, elapsed_time))

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split("-")[1])
        if current_step >= revnet_config.max_train_iter:
          tf.logging.info(
              "Evaluation finished after training step %d" % current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            "Checkpoint %s no longer exists, skipping checkpoint" % ckpt)

  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)

    tf.logging.info(
        "Training for %d steps (%.2f epochs in total). Current"
        " step %d." % (revnet_config.max_train_iter,
                       revnet_config.max_train_iter / steps_per_epoch,
                       current_step))

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == "train":
      revnet_classifier.train(
          input_fn=imagenet_train.input_fn,
          max_steps=revnet_config.max_train_iter)

    else:
      assert FLAGS.mode == "train_and_eval"
      while current_step < revnet_config.max_train_iter:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              revnet_config.max_train_iter)
        revnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        tf.logging.info("Finished training up to step %d. Elapsed seconds %d." %
                        (next_checkpoint, int(time.time() - start_timestamp)))

        # Evaluate the model on the most recent checkpoint in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info("Starting to evaluate.")
        eval_results = revnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn, steps=eval_steps)
        tf.logging.info("Eval results: %s" % eval_results)

        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info("Finished training up to step %d. Elapsed seconds %d." %
                        (revnet_config.max_train_iter, elapsed_time))

    if FLAGS.export_dir is not None:
      # The guide to serve an exported TensorFlow model is at:
      #    https://www.tensorflow.org/serving/serving_basic
      tf.logging.info("Starting to export model.")
      revnet_classifier.export_savedmodel(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
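
main() relies on module-level pieces that are not part of this snippet: a model_fn, the imagenet_input module, the evaluation/estimator utilities, standard os/time imports, and a set of command-line flags. The flag names below are taken directly from their usage above; the defaults and help strings are illustrative guesses:

from absl import flags

flags.DEFINE_string("tpu", None, "Name of the Cloud TPU to use.")
flags.DEFINE_string("tpu_zone", None, "GCE zone of the TPU (optional).")
flags.DEFINE_string("gcp_project", None, "Project owning the TPU (optional).")
flags.DEFINE_string("model_dir", None, "Directory for checkpoints and summaries.")
flags.DEFINE_string("data_dir", None, "Directory holding the ImageNet TFRecords.")
flags.DEFINE_string("export_dir", None, "If set, export a SavedModel here after training.")
flags.DEFINE_string("mode", "train_and_eval", "One of: train, eval, train_and_eval.")
flags.DEFINE_string("revnet_config", "revnet-56", "Either revnet-56 or revnet-104.")
flags.DEFINE_bool("use_tpu", True, "Run on TPU rather than CPU/GPU.")
flags.DEFINE_bool("transpose_input", True, "Use the transposed input pipeline.")
flags.DEFINE_integer("iterations_per_loop", 100, "Steps per TPU training loop.")
flags.DEFINE_integer("num_shards", 8, "Number of TPU shards (cores).")
flags.DEFINE_integer("steps_per_eval", 5000, "Training steps between evaluations.")
flags.DEFINE_integer("eval_timeout", None, "Seconds to wait for a new checkpoint before exiting eval.")

FLAGS = flags.FLAGS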