def _benchmark_eager_apply(self, label, device_and_format, defun=False,
                           execution_mode=None, compiled=False):
    """Time forward-only passes of a RevNet under eager execution.

    Builds a RevNet from the ImageNet-56 hyperparameters, runs a few untimed
    warm-up passes on a single random batch, then times a fixed number of
    passes and hands the start time to `self._report`.

    Args:
      label: Forwarded unchanged to `self._report`.
      device_and_format: Pair `(device, data_format)`. `device` is given to
        `tf.device`; `data_format` is only forwarded to `self._report`.
      defun: When true, `model.call` is wrapped with `tfe.defun`.
      execution_mode: Passed to `tfe.execution_mode`. When truthy,
        `tfe.async_wait()` is called after the warm-up and after the timed
        passes.
      compiled: Forwarded to `tfe.defun` as `compiled` when `defun` is set.
    """
    hparams = config_.get_hparams_imagenet_56()
    with tfe.execution_mode(execution_mode):
        device, data_format = device_and_format
        net = revnet.RevNet(config=hparams)
        if defun:
            net.call = tfe.defun(net.call, compiled=compiled)

        batch_size, warmup_steps, timed_steps = 64, 5, 10
        with tf.device(device):
            # Only the images are needed; the labels are discarded.
            images, _ = random_batch(batch_size, hparams)

            def run_passes(count):
                # Run `count` forward passes, then wait for pending async ops
                # so they are not attributed to whatever comes next.
                for _ in range(count):
                    net(images, training=False)
                if execution_mode:
                    tfe.async_wait()

            run_passes(warmup_steps)
            # Collect garbage before starting the clock.
            gc.collect()
            began = time.time()
            run_passes(timed_steps)
            self._report(label, began, timed_steps, device, batch_size,
                         data_format)
def setUp(self):
    """Create the model, random inputs and random labels used by the tests.

    Sets `self.config` (ImageNet-56 hyperparameters), `self.model` (a RevNet
    built from that config), `self.x` (random-normal inputs) and `self.t`
    (random int32 labels).
    """
    super(RevnetTest, self).setUp()
    hparams = config_.get_hparams_imagenet_56()
    batch = hparams.batch_size
    # Leading batch dimension followed by the configured per-example shape.
    input_shape = (batch,) + hparams.input_shape

    self.model = revnet.RevNet(config=hparams)
    self.x = tf.random_normal(shape=input_shape)
    # One label per example, sampled with minval=0 and maxval=n_classes.
    self.t = tf.random_uniform(
        shape=[batch],
        minval=0,
        maxval=hparams.n_classes,
        dtype=tf.int32,
    )
    self.config = hparams
def setUp(self):
    """Build the shared test fixtures: config, model, inputs and labels.

    After this runs, `self.config` holds the ImageNet-56 hyperparameters,
    `self.model` a RevNet constructed from them, `self.x` a random-normal
    input tensor and `self.t` a random int32 label tensor.
    """
    super(RevnetTest, self).setUp()
    cfg = config_.get_hparams_imagenet_56()
    # Batch dimension first, then the input shape taken from the config.
    x_shape = (cfg.batch_size,) + cfg.input_shape
    t_shape = [cfg.batch_size]

    self.model = revnet.RevNet(config=cfg)
    self.x = tf.random_normal(shape=x_shape)
    self.t = tf.random_uniform(
        shape=t_shape, minval=0, maxval=cfg.n_classes, dtype=tf.int32)
    self.config = cfg
def _benchmark_eager_train(self, label, make_iterator, device_and_format,
                           defun=False, execution_mode=None, compiled=False):
    """Time RevNet training steps under eager execution.

    For every batch size from `self._train_batch_sizes()`, builds a fresh
    RevNet and SGD optimizer, runs a few untimed warm-up steps, then times a
    fixed number of `train_one_iter` calls and reports via `self._report`.

    Args:
      label: Forwarded unchanged to `self._report`.
      make_iterator: Callable taking an `(images, labels)` pair and returning
        an object whose `.next()` yields `(images, labels)` batches.
      device_and_format: Pair `(device, data_format)`. `device` is given to
        `tf.device`; `data_format` is only forwarded to `self._report`.
      defun: When true, `model.call` is wrapped with `tfe.defun`.
      execution_mode: Passed to `tfe.execution_mode`. When truthy,
        `tfe.async_wait()` is called after the warm-up and timed loops.
      compiled: Forwarded to `tfe.defun` as `compiled` when `defun` is set.
    """
    config = config_.get_hparams_imagenet_56()
    # `add_hparam` rejects names that already exist, and `setUp` reads
    # `config.n_classes` straight from this factory, so only add what is
    # missing and overwrite what is already there.
    for name, value in (("n_classes", 1000), ("dataset", "ImageNet")):
        if hasattr(config, name):
            config.set_hparam(name, value)
        else:
            config.add_hparam(name, value)

    num_burn = 3
    num_iters = 10
    with tfe.execution_mode(execution_mode):
        device, data_format = device_and_format
        for batch_size in self._train_batch_sizes():
            (images, labels) = random_batch(batch_size, config)
            model = revnet.RevNet(config=config)
            optimizer = tf.train.GradientDescentOptimizer(0.1)
            if defun:
                # Honor `compiled`, matching `_benchmark_eager_apply`.
                model.call = tfe.defun(model.call, compiled=compiled)

            with tf.device(device):
                iterator = make_iterator((images, labels))
                # Untimed warm-up steps.
                for _ in range(num_burn):
                    (images, labels) = iterator.next()
                    train_one_iter(model, images, labels, optimizer)
                if execution_mode:
                    tfe.async_wait()
                self._force_device_sync()
                gc.collect()

                start = time.time()
                for _ in range(num_iters):
                    (images, labels) = iterator.next()
                    train_one_iter(model, images, labels, optimizer)
                # Drain pending work before the report reads the clock.
                if execution_mode:
                    tfe.async_wait()
                self._force_device_sync()
                self._report(label, start, num_iters, device, batch_size,
                             data_format)
def _benchmark_eager_train(self, label, make_iterator, device_and_format,
                           defun=False, execution_mode=None, compiled=False):
    """Time RevNet training steps under eager execution.

    For every batch size from `self._train_batch_sizes()`, builds a fresh
    RevNet and SGD optimizer, runs a few untimed warm-up steps, then times a
    fixed number of `train_one_iter` calls and reports via `self._report`.

    Args:
      label: Forwarded unchanged to `self._report`.
      make_iterator: Callable taking an `(images, labels)` pair and returning
        an object whose `.next()` yields `(images, labels)` batches.
      device_and_format: Pair `(device, data_format)`. `device` is given to
        `tf.device`; `data_format` is only forwarded to `self._report`.
      defun: When true, `model.call` is wrapped with `tfe.defun`.
      execution_mode: Passed to `tfe.execution_mode`. When truthy,
        `tfe.async_wait()` is called after the warm-up and timed loops.
      compiled: Forwarded to `tfe.defun` as `compiled` when `defun` is set.
    """
    config = config_.get_hparams_imagenet_56()
    # `add_hparam` rejects names that already exist, and `setUp` reads
    # `config.n_classes` straight from this factory, so only add what is
    # missing and overwrite what is already there.
    for name, value in (("n_classes", 1000), ("dataset", "ImageNet")):
        if hasattr(config, name):
            config.set_hparam(name, value)
        else:
            config.add_hparam(name, value)

    num_burn = 3
    num_iters = 10
    with tfe.execution_mode(execution_mode):
        device, data_format = device_and_format
        for batch_size in self._train_batch_sizes():
            (images, labels) = random_batch(batch_size, config)
            model = revnet.RevNet(config=config)
            optimizer = tf.train.GradientDescentOptimizer(0.1)
            if defun:
                # Honor `compiled`, matching `_benchmark_eager_apply`.
                model.call = tfe.defun(model.call, compiled=compiled)

            with tf.device(device):
                iterator = make_iterator((images, labels))
                # Untimed warm-up steps.
                for _ in range(num_burn):
                    (images, labels) = iterator.next()
                    train_one_iter(model, images, labels, optimizer)
                if execution_mode:
                    tfe.async_wait()
                self._force_device_sync()
                gc.collect()

                start = time.time()
                for _ in range(num_iters):
                    (images, labels) = iterator.next()
                    train_one_iter(model, images, labels, optimizer)
                # Drain pending work before the report reads the clock.
                if execution_mode:
                    tfe.async_wait()
                self._force_device_sync()
                self._report(label, start, num_iters, device, batch_size,
                             data_format)
def main(_):
    """Train and/or evaluate RevNet on ImageNet with a TPUEstimator.

    `FLAGS.mode` selects the behavior:
      * "eval": evaluate each checkpoint yielded for `FLAGS.model_dir`,
        stopping once a checkpoint at or past `max_train_iter` is evaluated.
      * "train": train up to `max_train_iter` steps.
      * "train_and_eval": alternate training for up to
        `FLAGS.steps_per_eval` steps with an evaluation pass.
    After either training mode the model is exported when
    `FLAGS.export_dir` is set.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source. Confirm that the final "Finished training"
    log (after the train/eval loop) and the export step (training modes only)
    sit at the intended level.

    Args:
      _: Unused positional argument.

    Raises:
      KeyError: If `FLAGS.revnet_config` is not "revnet-56" or "revnet-104".
      ValueError: If `FLAGS.mode` is not "eval", "train" or "train_and_eval".
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # RevNet specific configuration
    revnet_config = {
        "revnet-56": config_.get_hparams_imagenet_56(),
        "revnet-104": config_.get_hparams_imagenet_104()
    }[FLAGS.revnet_config]

    if FLAGS.use_tpu:
        revnet_config.data_format = "channels_last"

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    # Estimator specific configuration
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        session_config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=True),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_shards,
            per_host_input_for_training=(
                tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)),
    )

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=FLAGS.transpose_input,
            use_bfloat16=False) for is_training in [True, False]
    ]

    revnet_classifier = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=revnet_config.tpu_batch_size,
        eval_batch_size=revnet_config.tpu_eval_batch_size,
        config=config,
        export_to_tpu=False,
        params={"revnet_config": revnet_config})

    steps_per_epoch = revnet_config.tpu_iters_per_epoch
    eval_steps = revnet_config.tpu_eval_steps

    # pylint: disable=protected-access
    if FLAGS.mode == "eval":
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info("Starting to evaluate.")
            start_timestamp = time.time()  # This time will include compilation time
            try:
                eval_results = revnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU
                # worker, sometimes the TPU worker does not finish
                # initializing until long after the CPU job tells it to start
                # evaluating. In this case, the checkpoint file could have
                # been deleted already.
                tf.logging.info(
                    "Checkpoint %s no longer exists, skipping checkpoint", ckpt)
                continue

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info("Eval results: %s. Elapsed seconds: %d",
                            eval_results, elapsed_time)

            # Terminate eval job when final checkpoint is reached. The step is
            # the suffix after the last "-", so a hyphen elsewhere in the
            # checkpoint prefix cannot break the parse.
            current_step = int(os.path.basename(ckpt).rsplit("-", 1)[-1])
            if current_step >= revnet_config.max_train_iter:
                tf.logging.info(
                    "Evaluation finished after training step %d", current_step)
                break

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)

        tf.logging.info(
            "Training for %d steps (%.2f epochs in total). Current"
            " step %d.",
            revnet_config.max_train_iter,
            revnet_config.max_train_iter / steps_per_epoch,
            current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == "train":
            revnet_classifier.train(
                input_fn=imagenet_train.input_fn,
                max_steps=revnet_config.max_train_iter)

        elif FLAGS.mode == "train_and_eval":
            while current_step < revnet_config.max_train_iter:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to
                # --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      revnet_config.max_train_iter)
                revnet_classifier.train(
                    input_fn=imagenet_train.input_fn,
                    max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    "Finished training up to step %d. Elapsed seconds %d.",
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size,
                # some images may be excluded modulo the batch size. As long
                # as the batch size is consistent, the evaluated images are
                # also consistent.
                tf.logging.info("Starting to evaluate.")
                eval_results = revnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn, steps=eval_steps)
                tf.logging.info("Eval results: %s", eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                "Finished training up to step %d. Elapsed seconds %d.",
                revnet_config.max_train_iter, elapsed_time)

        else:
            # An explicit error (rather than `assert`) so the check survives
            # running Python with -O.
            raise ValueError(
                "Unsupported FLAGS.mode %r; expected 'eval', 'train' or "
                "'train_and_eval'." % (FLAGS.mode,))

        if FLAGS.export_dir is not None:
            # The guide to serve an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info("Starting to export model.")
            revnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
def main(_):
    """Train and/or evaluate RevNet on ImageNet with a TPUEstimator.

    `FLAGS.mode` selects the behavior:
      * "eval": evaluate each checkpoint yielded for `FLAGS.model_dir`,
        stopping once a checkpoint at or past `max_train_iter` is evaluated.
      * "train": train up to `max_train_iter` steps.
      * "train_and_eval": alternate training for up to
        `FLAGS.steps_per_eval` steps with an evaluation pass.
    After either training mode the model is exported when
    `FLAGS.export_dir` is set.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source. Confirm that the final "Finished training"
    log (after the train/eval loop) and the export step (training modes only)
    sit at the intended level.

    Args:
      _: Unused positional argument.

    Raises:
      KeyError: If `FLAGS.revnet_config` is not "revnet-56" or "revnet-104".
      ValueError: If `FLAGS.mode` is not "eval", "train" or "train_and_eval".
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # RevNet specific configuration
    revnet_config = {
        "revnet-56": config_.get_hparams_imagenet_56(),
        "revnet-104": config_.get_hparams_imagenet_104()
    }[FLAGS.revnet_config]

    if FLAGS.use_tpu:
        revnet_config.data_format = "channels_last"

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    # Estimator specific configuration
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        session_config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=True),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_shards,
            per_host_input_for_training=(
                tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)),
    )

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=FLAGS.transpose_input,
            use_bfloat16=False) for is_training in [True, False]
    ]

    revnet_classifier = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=revnet_config.tpu_batch_size,
        eval_batch_size=revnet_config.tpu_eval_batch_size,
        config=config,
        export_to_tpu=False,
        params={"revnet_config": revnet_config})

    steps_per_epoch = revnet_config.tpu_iters_per_epoch
    eval_steps = revnet_config.tpu_eval_steps

    # pylint: disable=protected-access
    if FLAGS.mode == "eval":
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info("Starting to evaluate.")
            start_timestamp = time.time()  # This time will include compilation time
            try:
                eval_results = revnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU
                # worker, sometimes the TPU worker does not finish
                # initializing until long after the CPU job tells it to start
                # evaluating. In this case, the checkpoint file could have
                # been deleted already.
                tf.logging.info(
                    "Checkpoint %s no longer exists, skipping checkpoint", ckpt)
                continue

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info("Eval results: %s. Elapsed seconds: %d",
                            eval_results, elapsed_time)

            # Terminate eval job when final checkpoint is reached. The step is
            # the suffix after the last "-", so a hyphen elsewhere in the
            # checkpoint prefix cannot break the parse.
            current_step = int(os.path.basename(ckpt).rsplit("-", 1)[-1])
            if current_step >= revnet_config.max_train_iter:
                tf.logging.info(
                    "Evaluation finished after training step %d", current_step)
                break

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)

        tf.logging.info(
            "Training for %d steps (%.2f epochs in total). Current"
            " step %d.",
            revnet_config.max_train_iter,
            revnet_config.max_train_iter / steps_per_epoch,
            current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == "train":
            revnet_classifier.train(
                input_fn=imagenet_train.input_fn,
                max_steps=revnet_config.max_train_iter)

        elif FLAGS.mode == "train_and_eval":
            while current_step < revnet_config.max_train_iter:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to
                # --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      revnet_config.max_train_iter)
                revnet_classifier.train(
                    input_fn=imagenet_train.input_fn,
                    max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    "Finished training up to step %d. Elapsed seconds %d.",
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size,
                # some images may be excluded modulo the batch size. As long
                # as the batch size is consistent, the evaluated images are
                # also consistent.
                tf.logging.info("Starting to evaluate.")
                eval_results = revnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn, steps=eval_steps)
                tf.logging.info("Eval results: %s", eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                "Finished training up to step %d. Elapsed seconds %d.",
                revnet_config.max_train_iter, elapsed_time)

        else:
            # An explicit error (rather than `assert`) so the check survives
            # running Python with -O.
            raise ValueError(
                "Unsupported FLAGS.mode %r; expected 'eval', 'train' or "
                "'train_and_eval'." % (FLAGS.mode,))

        if FLAGS.export_dir is not None:
            # The guide to serve an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info("Starting to export model.")
            revnet_classifier.export_savedmodel(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn)