Example #1
  def testReturnsEmptyIfNoCheckpointsFound(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(), 'no_checkpoints_found')

    num_found = 0
    for _ in evaluation.checkpoints_iterator(checkpoint_dir, timeout=0):
      num_found += 1
    self.assertEqual(num_found, 0)
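A note on the pattern this test pins down: checkpoints_iterator yields the path of each new checkpoint it finds under a directory and returns once `timeout` seconds pass with nothing new. A minimal standalone sketch of that loop, assuming the public tf.train.checkpoints_iterator (same signature as the contrib helper used in these examples) and a hypothetical run_eval callback:

import tensorflow as tf

def watch_and_evaluate(checkpoint_dir, run_eval, timeout=60):
    # Yields each new checkpoint prefix (e.g. ".../model.ckpt-1234") and
    # stops once `timeout` seconds elapse without a new checkpoint appearing.
    for ckpt_path in tf.train.checkpoints_iterator(checkpoint_dir,
                                                   timeout=timeout):
        run_eval(ckpt_path)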
Example #2
  def testReturnsSingleCheckpointIfOneShardedCheckpoint(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(),
                                  'one_checkpoint_found_sharded')
    if not gfile.Exists(checkpoint_dir):
      gfile.MakeDirs(checkpoint_dir)

    global_step = variables.get_or_create_global_step()

    # This will result in 3 different checkpoint shard files.
    with ops.device('/cpu:0'):
      variables_lib.Variable(10, name='v0')
    with ops.device('/cpu:1'):
      variables_lib.Variable(20, name='v1')

    saver = saver_lib.Saver(sharded=True)

    with session_lib.Session(
        target='',
        config=config_pb2.ConfigProto(device_count={'CPU': 2})) as session:

      session.run(variables_lib.global_variables_initializer())
      save_path = os.path.join(checkpoint_dir, 'model.ckpt')
      saver.save(session, save_path, global_step=global_step)

    num_found = 0
    for _ in evaluation.checkpoints_iterator(checkpoint_dir, timeout=0):
      num_found += 1
    self.assertEqual(num_found, 1)
Example #3
    def testReturnsEmptyIfNoCheckpointsFound(self):
        checkpoint_dir = os.path.join(self.get_temp_dir(),
                                      'no_checkpoints_found')

        num_found = 0
        for _ in evaluation.checkpoints_iterator(checkpoint_dir, timeout=0):
            num_found += 1
        self.assertEqual(num_found, 0)
Example #4
  def testTimeoutFn(self):
    timeout_fn_calls = [0]
    def timeout_fn():
      timeout_fn_calls[0] += 1
      return timeout_fn_calls[0] > 3

    results = list(
        evaluation.checkpoints_iterator(
            '/non-existent-dir', timeout=0.1, timeout_fn=timeout_fn))
    self.assertEqual([], results)
    self.assertEqual(4, timeout_fn_calls[0])
Example #5
    def testTimeoutFn(self):
        timeout_fn_calls = [0]

        def timeout_fn():
            timeout_fn_calls[0] += 1
            return timeout_fn_calls[0] > 3

        results = list(
            evaluation.checkpoints_iterator('/non-existent-dir',
                                            timeout=0.1,
                                            timeout_fn=timeout_fn))
        self.assertEqual([], results)
        self.assertEqual(4, timeout_fn_calls[0])
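Both versions of this test rely on the same contract: every time `timeout` seconds pass without a new checkpoint, checkpoints_iterator calls timeout_fn, and iteration stops as soon as that callback returns True. A deadline-based variant of the callback, sketched against the public tf.train.checkpoints_iterator API (the timeout values are illustrative):

import time

import tensorflow as tf

def iterate_until_deadline(checkpoint_dir, deadline_secs=3600):
    start = time.time()

    def give_up():
        # Invoked after each 60-second wait that sees no new checkpoint;
        # returning True ends the iteration.
        return time.time() - start > deadline_secs

    return tf.train.checkpoints_iterator(checkpoint_dir,
                                         timeout=60,
                                         timeout_fn=give_up)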
Example #6
def main(unused_argv):
    # pylint: disable=g-long-lambda
    if FLAGS.mode == "preprocess":
        prepare_dataset(FLAGS)
    elif FLAGS.mode == "train":
        print("Running training mode.")
        default_hparams = create_hparams(FLAGS)
        run_main(FLAGS, default_hparams, estimator.train_fn)
    elif FLAGS.mode == "train_and_eval":
        print("Running training and evaluation mode.")
        default_hparams = create_hparams(FLAGS)
        run_main(FLAGS, default_hparams,
                 estimator.train_and_eval_with_low_level_api)
    else:
        print("Running inference mode.")
        default_hparams = create_hparams(FLAGS)
        current_epoch = 0
        last_step = 0
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(FLAGS.out_dir):
            # Terminate eval job once target score is reached
            current_step = int(os.path.basename(ckpt).split("-")[1])
            if current_step <= last_step:
                continue
            last_step = current_step
            tf.logging.info("Starting to evaluate...%s", ckpt)
            try:
                score = run_main(FLAGS, default_hparams, estimator.eval_fn)
                current_epoch += 1
                if score > FLAGS.target_bleu:
                    tf.logging.info(
                        "Evaluation finished after training step %d" %
                        current_step)
                    break
                # Terminate eval job when final checkpoint is reached
                max_steps = default_hparams.num_train_steps
                if current_step >= max_steps:
                    tf.logging.info(
                        "Evaluation finished but failed to reach target score."
                    )
                    break

            except tf.errors.NotFoundError:
                tf.logging.info(
                    "Checkpoint %s no longer exists, skipping checkpoint" %
                    ckpt)
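Several of the longer examples below repeat the same idiom for recovering the global step from a checkpoint path. A small helper capturing it, assuming the standard 'model.ckpt-<global_step>' naming produced by tf.train.Saver:

import os

def step_from_checkpoint(ckpt_path):
    # e.g. "/model_dir/model.ckpt-1234" -> 1234
    return int(os.path.basename(ckpt_path).split('-')[1])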
Example #7
  def testReturnsSingleCheckpointIfOneCheckpointFound(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(), 'one_checkpoint_found')
    if not gfile.Exists(checkpoint_dir):
      gfile.MakeDirs(checkpoint_dir)

    global_step = variables.get_or_create_global_step()
    saver = saver_lib.Saver()  # Saves the global step.

    with self.test_session() as session:
      session.run(variables_lib.global_variables_initializer())
      save_path = os.path.join(checkpoint_dir, 'model.ckpt')
      saver.save(session, save_path, global_step=global_step)

    num_found = 0
    for _ in evaluation.checkpoints_iterator(checkpoint_dir, timeout=0):
      num_found += 1
    self.assertEqual(num_found, 1)
Example #8
  def testReturnsSingleCheckpointIfOneCheckpointFound(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(), 'one_checkpoint_found')
    if not gfile.Exists(checkpoint_dir):
      gfile.MakeDirs(checkpoint_dir)

    global_step = variables.get_or_create_global_step()
    saver = saver_lib.Saver()  # Saves the global step.

    with self.test_session() as session:
      session.run(variables_lib.global_variables_initializer())
      save_path = os.path.join(checkpoint_dir, 'model.ckpt')
      saver.save(session, save_path, global_step=global_step)

    num_found = 0
    for _ in evaluation.checkpoints_iterator(checkpoint_dir, timeout=0):
      num_found += 1
    self.assertEqual(num_found, 1)
Example #9
def evaluate(model_est, imagenet_eval, params):
    """Conducts eval and maybe export the model.
    Args:
        model_est: `TPUEstimator` instance for the discovered model
        imagenet_eval: Input pipeline for the validation set
        params: Dictionary containing parameters
    """
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(FLAGS.model_dir,
                                                timeout=FLAGS.eval_timeout):
        tf.logging.info('Starting to evaluate.')
        try:
            # This time will include compilation time.
            start_timestamp = time.time()
            eval_results = model_est.evaluate(input_fn=imagenet_eval.input_fn,
                                              steps=eval_steps,
                                              checkpoint_path=ckpt)
            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                            eval_results, elapsed_time)

            # Terminate eval job when final checkpoint is reached
            current_step = int(os.path.basename(ckpt).split('-')[1])
            if current_step >= FLAGS.train_steps:
                tf.logging.info('Evaluation finished after training step %d',
                                current_step)
                break

        except tf.errors.NotFoundError:
            # Since the coordinator is on a different job than the TPU worker,
            # sometimes the TPU worker does not finish initializing until long after
            # the CPU job tells it to start evaluating. In this case, the checkpoint
            # file could have been deleted already.
            tf.logging.info(
                'Checkpoint %s no longer exists, skipping checkpoint', ckpt)

    if FLAGS.export_dir:
        export(model_est, FLAGS.export_dir)
Example #10
def main(unused_argv):
    params = params_dict.ParamsDict(mnasnet_config.MNASNET_CFG,
                                    mnasnet_config.MNASNET_RESTRICTIONS)
    params = params_dict.override_params_dict(params,
                                              FLAGS.config_file,
                                              is_strict=True)
    params = params_dict.override_params_dict(params,
                                              FLAGS.params_override,
                                              is_strict=True)

    params = flags_to_params.override_params_from_input_flags(params, FLAGS)

    additional_params = {
        'steps_per_epoch': params.num_train_images / params.train_batch_size,
        'quantized_training': FLAGS.quantized_training,
    }

    params = params_dict.override_params_dict(params,
                                              additional_params,
                                              is_strict=False)

    params.validate()
    params.lock()

    if FLAGS.tpu or params.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    if params.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, params.iterations_per_loop)
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=params.iterations_per_loop,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    # Validates Flags.
    if params.precision == 'bfloat16' and params.use_keras:
        raise ValueError(
            'Keras layers do not have full support to bfloat16 activation training.'
            ' You have set precision as %s and use_keras as %s' %
            (params.precision, params.use_keras))

    # Initializes model parameters.
    mnasnet_est = tf.contrib.tpu.TPUEstimator(
        use_tpu=params.use_tpu,
        model_fn=mnasnet_model_fn,
        config=config,
        train_batch_size=params.train_batch_size,
        eval_batch_size=params.eval_batch_size,
        export_to_tpu=FLAGS.export_to_tpu,
        params=params.as_dict())

    if FLAGS.mode == 'export_only':
        export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
        return

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s',
                        FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=False,
                transpose_input=params.transpose_input,
                selection=selection)
            for (is_training,
                 selection) in [(True, select_train), (False, select_eval)]
        ]
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=params.transpose_input,
                cache=params.use_cache and is_training,
                image_size=params.input_image_size,
                num_parallel_calls=params.num_parallel_calls,
                use_bfloat16=(params.precision == 'bfloat16'))
            for is_training in [True, False]
        ]

    if FLAGS.mode == 'eval':
        eval_steps = params.num_eval_images // params.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                # This time will include compilation time.
                start_timestamp = time.time()
                eval_results = mnasnet_est.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

        if FLAGS.export_dir:
            export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(  # pylint: disable=protected-access
            FLAGS.model_dir)

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', params.train_steps,
            params.train_steps / params.steps_per_epoch, current_step)

        # This time will include compilation time.
        start_timestamp = time.time()

        if FLAGS.mode == 'train':
            hooks = []
            if params.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, params.iterations_per_loop)))
            mnasnet_est.train(input_fn=imagenet_train.input_fn,
                              max_steps=params.train_steps,
                              hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < params.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      params.train_steps)
                mnasnet_est.train(input_fn=imagenet_train.input_fn,
                                  max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = mnasnet_est.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=params.num_eval_images // params.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                params.train_steps, elapsed_time)
            if FLAGS.export_dir:
                export(mnasnet_est, FLAGS.export_dir, params,
                       FLAGS.post_quantize)
Example #11
def main(unused_argv):
    del unused_argv  # Unused

    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    params = {
        'input_perm': [0, 1, 2, 3],
        'output_perm': [0, 1, 2, 3],
    }

    batch_axis = 0
    if FLAGS.transpose_enabled:
        params['input_perm'] = [3, 0, 1, 2]
        params['output_perm'] = [1, 2, 3, 0]
        batch_axis = 3

    if FLAGS.eval_total_size > 0:
        eval_size = FLAGS.eval_total_size
    else:
        eval_size = _NUM_EVAL_IMAGES
    eval_steps = eval_size // FLAGS.eval_batch_size

    iterations = (eval_steps if FLAGS.mode == 'eval' else FLAGS.iterations)

    eval_batch_size = (None
                       if FLAGS.mode == 'train' else FLAGS.eval_batch_size)

    tpu_config = contrib_tpu.TPUConfig(iterations_per_loop=iterations,
                                       num_shards=FLAGS.num_shards)

    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        save_summary_steps=FLAGS.save_summary_steps,
        session_config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement),
        tpu_config=tpu_config)

    inception_classifier = contrib_tpu.TPUEstimator(
        model_fn=inception_model_fn,
        use_tpu=FLAGS.use_tpu,
        config=run_config,
        params=params,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=eval_batch_size,
        batch_axis=(batch_axis, 0))

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train = InputPipeline(is_training=True, data_dir=FLAGS.data_dir)
    imagenet_eval = InputPipeline(is_training=False, data_dir=FLAGS.data_dir)

    if FLAGS.moving_average:
        eval_hooks = [LoadEMAHook(FLAGS.model_dir)]
    else:
        eval_hooks = []

    if FLAGS.mode == 'eval':
        # Run evaluation when there is a new checkpoint
        for checkpoint in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # Includes compilation time
                eval_results = inception_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    hooks=eval_hooks,
                    checkpoint_path=checkpoint)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(checkpoint).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break
            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    checkpoint)

    elif FLAGS.mode == 'train_and_eval':
        for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
            tf.logging.info('Starting training cycle %d.' % cycle)
            inception_classifier.train(input_fn=imagenet_train.input_fn,
                                       steps=FLAGS.train_steps_per_eval)

            tf.logging.info('Starting evaluation cycle %d.' % cycle)
            eval_results = inception_classifier.evaluate(
                input_fn=imagenet_eval.input_fn,
                steps=eval_steps,
                hooks=eval_hooks)
            tf.logging.info('Evaluation results: %s' % eval_results)

    else:
        tf.logging.info('Starting training ...')
        inception_classifier.train(input_fn=imagenet_train.input_fn,
                                   steps=FLAGS.train_steps)

    if FLAGS.export_dir is not None:
        tf.logging.info('Starting to export model.')
        inception_classifier.export_saved_model(
            export_dir_base=FLAGS.export_dir,
            serving_input_receiver_fn=image_serving_input_fn)
Example #12
params = dict(weight_decay=args.weight_decay)
tpu_estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn,
    config=run_config,
    train_batch_size=args.batch_size,
    eval_batch_size=args.batch_size,
    params=params)

hooks = []
hooks.append(
    async_checkpoint.AsyncCheckpointSaverHook(
        checkpoint_dir=args.model_dir,
        save_steps=iterations_per_loop))

train_input_fn = make_input_fn(data, labels)
eval_input_fn = make_input_fn(test_data, test_labels)

if pid > 0:
    tpu_estimator.train(input_fn=train_input_fn,
                        steps=args.num_epochs * steps_per_epoch,
                        hooks=hooks)
    # Sleep so that eval can finish before closing.
    time.sleep(360)
else:
    for ckpt in evaluation.checkpoints_iterator(args.model_dir):
        eval_results = tpu_estimator.evaluate(
            input_fn=eval_input_fn,
            steps=len(test_data) // args.batch_size,
            checkpoint_path=ckpt)
        print("Eval results: %s" % eval_results)
Example #13
def main(unused_argv):
    tpu_grpc_url = None
    tpu_cluster_resolver = None
    if FLAGS.use_tpu:
        # Determine the gRPC URL of the TPU device to use
        if not FLAGS.master and not FLAGS.tpu_name:
            raise RuntimeError(
                'You must specify either --master or --tpu_name.')

        if FLAGS.master:
            if FLAGS.tpu_name:
                tf.logging.warn(
                    'Both --master and --tpu_name are set. Ignoring'
                    ' --tpu_name and using --master.')
            tpu_grpc_url = FLAGS.master
        else:
            tpu_cluster_resolver = (
                tf.contrib.cluster_resolver.TPUClusterResolver(
                    FLAGS.tpu_name,
                    zone=FLAGS.tpu_zone,
                    project=FLAGS.gcp_project))
    else:
        # URL is unused if running locally without TPU
        tpu_grpc_url = None

    config = tpu_config.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=FLAGS.model_dir,
        cluster=tpu_cluster_resolver,
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_cores))

    resnet_classifier = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train = imagenet_input.ImageNetInput(is_training=True,
                                                  data_dir=FLAGS.data_dir)
    imagenet_eval = imagenet_input.ImageNetInput(is_training=False,
                                                 data_dir=FLAGS.data_dir)

    if FLAGS.mode == 'eval':
        eval_steps = NUM_EVAL_IMAGES // FLAGS.eval_batch_size

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(FLAGS.model_dir):
            tf.logging.info('Starting to evaluate.')
            try:
                # This time will include compilation time.
                start_timestamp = time.time()
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d' %
                                (eval_results, elapsed_time))

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
        batches_per_epoch = NUM_TRAIN_IMAGES / FLAGS.train_batch_size
        tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                        ' step %d.' % (FLAGS.train_steps, FLAGS.train_steps /
                                       batches_per_epoch, current_step))

        # This time will include compilation time.
        start_timestamp = time.time()
        if FLAGS.mode == 'train':
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=FLAGS.train_steps)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=next_checkpoint)
                current_step = next_checkpoint

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be consistently excluded modulo the batch size.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=NUM_EVAL_IMAGES // FLAGS.eval_batch_size)
                tf.logging.info('Eval results: %s' % eval_results)

        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info(
            'Finished training up to step %d. Elapsed seconds %d.' %
            (FLAGS.train_steps, elapsed_time))

        if FLAGS.export_dir is not None:
            # The guide to serve a exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            resnet_classifier.export_savedmodel(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
Example #14
  def get_next_checkpoint():
    return evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval)
Example #15
def _get_next_checkpoint():
    return evaluation.checkpoints_iterator(FLAGS.model_dir,
                                           timeout=60 * 60 * 24,
                                           timeout_fn=_terminate_eval)
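Both helpers pass a timeout_fn that is defined elsewhere. A sketch of what it might look like (hypothetical; Example #16 below defines a terminate_eval with the same shape, logging a message and returning True to end the loop):

import tensorflow as tf

def _terminate_eval():
    tf.logging.info('No new checkpoint for 24 hours; stopping evaluation.')
    return True  # True tells checkpoints_iterator to stop waiting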
Example #16
def main(argv):
  del argv  # Unused.

  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu,
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  if FLAGS.mode == 'train' and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode == 'eval':
    if FLAGS.valid_data_dir is None:
      raise RuntimeError('You must specify --valid_data_dir for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  # Parse hparams
  hparams = retinanet_model.default_hparams()
  hparams.parse(FLAGS.hparams)

  params = dict(
      hparams.values(),
      num_shards=FLAGS.num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      resnet_checkpoint=FLAGS.resnet_checkpoint,
      val_json_file=FLAGS.val_json_file,
      mode=FLAGS.mode,
  )
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)

  run_config = tpu_config.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations_per_loop,
                                      FLAGS.num_shards))

  # TPU Estimator
  if FLAGS.mode == 'train':
    train_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                        is_training=True),
        max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          use_tpu=False,
          input_rand_hflip=False,
          skip_crowd=False,
          resnet_checkpoint=None,
          is_training_bn=False,
          use_bfloat16=False,
      )
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=retinanet_model.retinanet_model_fn,
          use_tpu=False,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=1,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_steps)
      tf.logging.info('Eval results: %s' % eval_results)

  elif FLAGS.mode == 'eval':
    # eval only runs on CPU or GPU host with batch_size = 1

    # Override the default options: disable randomization in the input pipeline
    # and don't run on the TPU.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        skip_crowd=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )

    eval_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=False,
        eval_batch_size=1,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):

      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                            is_training=False),
            steps=FLAGS.eval_steps)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info('Evaluation finished after training step %d' %
                          current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint' %
                        ckpt)
  else:
    tf.logging.info('Mode not found.')
Example #17
def main(unused_argv):
    if FLAGS.use_tpu:
        # Determine the gRPC URL of the TPU device to use
        if FLAGS.master is None and FLAGS.tpu_name is None:
            raise RuntimeError(
                'You must specify either --master or --tpu_name.')

        if FLAGS.master is not None:
            if FLAGS.tpu_name is not None:
                tf.logging.warn(
                    'Both --master and --tpu_name are set. Ignoring'
                    ' --tpu_name and using --master.')
            tpu_grpc_url = FLAGS.master
        else:
            tpu_cluster_resolver = (
                tf.contrib.cluster_resolver.TPUClusterResolver(
                    FLAGS.tpu_name,
                    zone=FLAGS.tpu_zone,
                    project=FLAGS.gcp_project))
            tpu_grpc_url = tpu_cluster_resolver.get_master()
    else:
        # URL is unused if running locally without TPU
        tpu_grpc_url = None

    config = tpu_config.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=FLAGS.iterations_per_loop,
        keep_checkpoint_max=5,
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_cores,
            per_host_input_for_training=tpu_config.InputPipelineConfig.
            PER_HOST_V2))

    resnet_classifier = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train = imagenet_input.ImageNetInput(
        is_training=True,
        data_dir=FLAGS.data_dir,
        num_parallel_calls=FLAGS.num_parallel_calls,
        use_transpose=FLAGS.use_transpose)
    imagenet_eval = imagenet_input.ImageNetInput(
        is_training=False,
        data_dir=FLAGS.data_dir,
        num_parallel_calls=FLAGS.num_parallel_calls,
        use_transpose=FLAGS.use_transpose)

    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    steps_per_epoch = NUM_TRAIN_IMAGES // FLAGS.train_batch_size
    start_timestamp = time.time()
    current_epoch = current_step // steps_per_epoch

    if FLAGS.mode == 'train':
        resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                max_steps=FLAGS.train_steps)
        training_time = time.time() - start_timestamp
        tf.logging.info('Finished training in %d seconds' % training_time)

        with tf.gfile.GFile(FLAGS.model_dir + '/total_time_%s.txt' % training_time, 'w') as f:  # pylint: disable=line-too-long
            f.write('Total training time was %s seconds' % training_time)

    elif FLAGS.mode == 'eval':
        results = []

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(FLAGS.model_dir):
            tf.logging.info('Starting to evaluate.')
            try:
                # This time will include compilation time.
                start_timestamp = time.time()
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=NUM_EVAL_IMAGES // FLAGS.eval_batch_size,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d' %
                                (eval_results, elapsed_time))

                current_step = int(os.path.basename(ckpt).split('-')[1])
                current_epoch = current_step // steps_per_epoch
                results.append([
                    current_epoch,
                    '{0:.2f}'.format(eval_results['top_1_accuracy'] * 100),
                    '{0:.2f}'.format(eval_results['top_5_accuracy'] * 100),
                ])

                # Terminate eval job when final checkpoint is reached
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

        with tf.gfile.GFile(FLAGS.model_dir + '/epoch_results_eval.tsv', 'wb') as tsv_file:  # pylint: disable=line-too-long
            writer = csv.writer(tsv_file, delimiter='\t')
            writer.writerow(['epoch', 'top1Accuracy', 'top5Accuracy'])
            writer.writerows(results)

    elif FLAGS.mode == 'train_and_eval':
        results = []
        while current_epoch < 95:
            next_checkpoint = (current_epoch + 1) * steps_per_epoch
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=next_checkpoint)
            current_epoch += 1

            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.' %
                (next_checkpoint, int(time.time() - start_timestamp)))

            # Evaluate the model on the most recent model in --model_dir.
            # Since evaluation happens in batches of --eval_batch_size, some images
            # may be excluded modulo the batch size. As long as the batch size is
            # consistent, the evaluated images are also consistent.
            tf.logging.info('Starting to evaluate.')
            eval_results = resnet_classifier.evaluate(
                input_fn=imagenet_eval.input_fn,
                steps=NUM_EVAL_IMAGES // FLAGS.eval_batch_size)
            tf.logging.info('Eval results: %s' % eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info('Finished epoch %s at %s time' %
                            (current_epoch, elapsed_time))
            results.append([
                current_epoch,
                elapsed_time / 3600.0,
                '{0:.2f}'.format(eval_results['top_1_accuracy'] * 100),
                '{0:.2f}'.format(eval_results['top_5_accuracy'] * 100),
            ])

        with tf.gfile.GFile(FLAGS.model_dir + '/epoch_results_train_eval.tsv', 'wb') as tsv_file:  # pylint: disable=line-too-long
            writer = csv.writer(tsv_file, delimiter='\t')
            writer.writerow(['epoch', 'hours', 'top1Accuracy', 'top5Accuracy'])
            writer.writerows(results)
    else:
        tf.logging.info('Mode not found.')

    if FLAGS.export_dir is not None:
        # The guide to serve a exported TensorFlow model is at:
        #    https://www.tensorflow.org/serving/serving_basic
        tf.logging.info('Starting to export model.')
        resnet_classifier.export_savedmodel(
            export_dir_base=FLAGS.export_dir,
            serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
Example #18
def main(_):

    if FLAGS.pruning_method in ['threshold']:
        folder_stub = os.path.join(FLAGS.pruning_method,
                                   str(FLAGS.end_sparsity),
                                   str(FLAGS.sparsity_begin_step),
                                   str(FLAGS.sparsity_end_step),
                                   str(FLAGS.pruning_frequency),
                                   str(FLAGS.label_smoothing))
    elif FLAGS.pruning_method == 'variational_dropout':
        folder_stub = os.path.join(FLAGS.pruning_method,
                                   str(FLAGS.sparsity_begin_step),
                                   str(FLAGS.sparsity_end_step),
                                   str(FLAGS.reg_scalar),
                                   str(FLAGS.label_smoothing))
    elif FLAGS.pruning_method == 'l0_regularization':
        folder_stub = os.path.join(FLAGS.pruning_method,
                                   str(FLAGS.sparsity_begin_step),
                                   str(FLAGS.sparsity_end_step),
                                   str(FLAGS.reg_scalar),
                                   str(FLAGS.label_smoothing))
    elif FLAGS.pruning_method == 'baseline':
        folder_stub = os.path.join(FLAGS.pruning_method, str(0.0), str(0.0),
                                   str(0.0), str(0.0))
    elif FLAGS.pruning_method == 'scratch':
        run_info = FLAGS.load_mask_dir.split('/')
        run_type = run_info[10]
        run_sparsity = run_info[11]
        run_begin = run_info[12]
        run_end = run_info[13]
        run_freq = run_info[14]
        run_label_smoothing = run_info[15]
        folder_stub = os.path.join(FLAGS.pruning_method, run_type,
                                   run_sparsity, run_begin, run_end, run_freq,
                                   run_label_smoothing, FLAGS.init_method)
    else:
        raise ValueError('Pruning method is not known %s' %
                         (FLAGS.pruning_method))

    output_dir = os.path.join(FLAGS.output_dir, folder_stub)

    export_dir = os.path.join(output_dir, 'export_dir')

    # we pass the updated eval and train string to the params dictionary.
    params = {}
    params['output_dir'] = output_dir
    params['pruning_method'] = FLAGS.pruning_method
    params['use_tpu'] = FLAGS.use_tpu
    params['log_alpha_threshold'] = FLAGS.log_alpha_threshold

    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(  # pylint: disable=g-complex-comprehension
            is_training=is_training,
            data_dir=FLAGS.data_directory,
            transpose_input=False,
            num_parallel_calls=FLAGS.num_parallel_calls,
            use_bfloat16=False) for is_training in [True, False]
    ]

    run_config = tpu_config.RunConfig(
        master=FLAGS.master,
        model_dir=output_dir,
        save_checkpoints_steps=FLAGS.steps_per_checkpoint,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_cores,
            tpu_job_name=FLAGS.tpu_job_name))

    classifier = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn_w_pruning,
        params=params,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    cpu_classifier = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn_w_pruning,
        params=params,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        export_to_tpu=False,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.num_eval_images % FLAGS.eval_batch_size != 0:
        raise ValueError(
            'eval_batch_size (%d) must evenly divide num_eval_images(%d)!' %
            (FLAGS.eval_batch_size, FLAGS.num_eval_images))

    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size

    if FLAGS.mode == 'eval_once':
        ckpt = FLAGS.output_dir + 'model.ckpt-{}'.format(FLAGS.checkpoint_step)
        classifier.evaluate(input_fn=imagenet_eval.input_fn,
                            steps=eval_steps,
                            checkpoint_path=ckpt,
                            name='{0}'.format(
                                int(FLAGS.log_alpha_threshold * 10)))
    elif FLAGS.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(output_dir):
            print('Starting to evaluate.')
            try:
                classifier.evaluate(input_fn=imagenet_eval.input_fn,
                                    steps=eval_steps,
                                    checkpoint_path=ckpt,
                                    name='{0}'.format(
                                        int(FLAGS.log_alpha_threshold * 10)))
                # Terminate eval job when final checkpoint is reached
                global_step = int(os.path.basename(ckpt).split('-')[1])
                if global_step >= FLAGS.train_steps:
                    print('Evaluation finished after training step %d' %
                          global_step)
                    break

            except tf.errors.NotFoundError:
                tf.logging.info(
                    'Checkpoint no longer exists, skipping checkpoint.')

    else:
        global_step = tf.estimator._load_global_step_from_checkpoint_dir(
            output_dir)  # pylint: disable=protected-access,line-too-long
        # Session run hooks to export model for prediction
        export_hook = ExportModelHook(cpu_classifier, export_dir)
        hooks = [export_hook]

        if FLAGS.mode == 'train':
            print('start training...')
            classifier.train(input_fn=imagenet_train.input_fn,
                             hooks=hooks,
                             max_steps=FLAGS.train_steps)
        else:
            assert FLAGS.mode == 'train_and_eval'
            print('start training and eval...')
            while global_step < FLAGS.train_steps:
                next_checkpoint = min(global_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                classifier.train(input_fn=imagenet_train.input_fn,
                                 max_steps=next_checkpoint)
                global_step = next_checkpoint
                tf.logging.info('Completed training up to step %d',
                                global_step)
                classifier.evaluate(input_fn=imagenet_eval.input_fn,
                                    steps=eval_steps)
Example #19
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  # RevNet specific configuration
  config = main_.get_config(config_name=FLAGS.config, dataset=FLAGS.dataset)

  if FLAGS.use_tpu:
    tf.logging.info("Using TPU.")
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster_resolver = None

  # TPU specific configuration
  tpu_config = tf.contrib.tpu.TPUConfig(
      # Recommended to be set as number of global steps for next checkpoint
      iterations_per_loop=FLAGS.iterations_per_loop,
      num_shards=FLAGS.num_shards)

  # Estimator specific configuration
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config,
  )

  # Construct TPU Estimator
  estimator = tf.contrib.tpu.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=config.tpu_batch_size,
      eval_batch_size=config.tpu_eval_batch_size,
      config=run_config,
      params={"config": config})

  # Construct input functions
  train_input_fn = get_input_fn(
      config=config, data_dir=FLAGS.data_dir, split="train_all")
  eval_input_fn = get_input_fn(
      config=config, data_dir=FLAGS.data_dir, split="test")

  # Disabling a range within an else block currently doesn't work
  # due to https://github.com/PyCQA/pylint/issues/872
  # pylint: disable=protected-access
  if FLAGS.mode == "eval":
    # TPUEstimator.evaluate *requires* a steps argument.
    # Note that the number of examples used during evaluation is
    # --eval_steps * --batch_size.
    # So if you change --batch_size then change --eval_steps too.
    eval_steps = 10000 // config.tpu_eval_batch_size

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info("Starting to evaluate.")
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = estimator.evaluate(
            input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info("Eval results: %s. Elapsed seconds: %d" %
                        (eval_results, elapsed_time))

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split("-")[1])
        if current_step >= config.max_train_iter:
          tf.logging.info(
              "Evaluation finished after training step %d" % current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            "Checkpoint %s no longer exists, skipping checkpoint" % ckpt)

  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator_._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)
    tf.logging.info("Training for %d steps . Current"
                    " step %d." % (config.max_train_iter, current_step))

    start_timestamp = time.time()  # This time will include compilation time
    if FLAGS.mode == "train":
      estimator.train(input_fn=train_input_fn, max_steps=config.max_train_iter)
    else:
      eval_steps = 10000 // config.tpu_eval_batch_size
      assert FLAGS.mode == "train_and_eval"
      while current_step < config.max_train_iter:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              config.max_train_iter)
        estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be consistently excluded modulo the batch size.
        tf.logging.info("Starting to evaluate.")
        eval_results = estimator.evaluate(
            input_fn=eval_input_fn, steps=eval_steps)
        tf.logging.info("Eval results: %s" % eval_results)

    elapsed_time = int(time.time() - start_timestamp)
    tf.logging.info("Finished training up to step %d. Elapsed seconds %d." %
                    (config.max_train_iter, elapsed_time))
Example #20
  def testMonitorCheckpointsLoopTimeout(self):
    ret = list(
        evaluation_lib.checkpoints_iterator(
            '/non-existent-dir', timeout=0))
    self.assertEqual(ret, [])
Example #21
def wait_for_checkpoint(path):
    from tensorflow.contrib.training.python.training import evaluation
    return evaluation.checkpoints_iterator(path)
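A brief usage sketch for the helper above (the directory path is hypothetical):

for ckpt in wait_for_checkpoint('/tmp/model_dir'):
    print('Found new checkpoint: %s' % ckpt)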
def main(unused_argv):
  tf.flags.mark_flag_as_required('model_dir')
  tf.flags.mark_flag_as_required('pipeline_config_path')

  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError('You must specify either --master or --tpu_name.')

  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn('Both --master and --tpu_name are set. Ignoring '
                      '--tpu_name and using --master.')
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.python.training.TPUClusterResolver(
            tpu_names=[FLAGS.tpu_name],
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()

  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards))
  params = {}
  estimator, train_input_fn, eval_input_fn, train_steps, eval_steps = (
      create_estimator(
          config,
          model_hparams.create_hparams(),
          FLAGS.pipeline_config_path,
          train_steps=FLAGS.num_train_steps,
          eval_steps=FLAGS.num_eval_steps,
          train_batch_size=FLAGS.train_batch_size,
          use_tpu=FLAGS.use_tpu,
          num_shards=FLAGS.num_shards,
          params=params))

  if FLAGS.mode in ['train', 'train_and_eval']:
    estimator.train(input_fn=train_input_fn, max_steps=train_steps)

  if FLAGS.mode == 'train_and_eval':
    # Eval one time.
    eval_results = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
    tf.logging.info('Eval results: %s' % eval_results)

  # Continuously evaluating.
  if FLAGS.mode == 'eval':
    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout_secs)
      return True

    # Run evaluation when there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval_secs,
        timeout=FLAGS.eval_timeout_secs,
        timeout_fn=terminate_eval):

      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = estimator.evaluate(
            input_fn=eval_input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break

      except tf.errors.NotFoundError:
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
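Many of these examples recover the global step by parsing the checkpoint basename (e.g. model.ckpt-12345 -> 12345). A small helper, sketched here with a hypothetical name, makes that pattern reusable:

import os

def global_step_from_checkpoint(ckpt_path):
  # Checkpoints are conventionally named '<prefix>-<global_step>'; split on
  # the last '-' so prefixes that contain dashes still parse correctly.
  return int(os.path.basename(ckpt_path).rsplit('-', 1)[-1])

# e.g. global_step_from_checkpoint('/model_dir/model.ckpt-12345') == 12345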
Ejemplo n.º 23
0
def main(unused_argv):
  # tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
  #     FLAGS.tpu if (FLAGS.tpu or FLAGS.use_tpu) else '',
  #     zone=FLAGS.tpu_zone,
  #     project=FLAGS.gcp_project)

  if FLAGS.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)

  NUM_GPUS = len(get_available_gpus())
  distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=NUM_GPUS)
  gpu_options = tf.GPUOptions(allow_growth=True)

  # config = tf.contrib.tpu.RunConfig(
  #     cluster=tpu_cluster_resolver,
  #     model_dir=FLAGS.model_dir,
  #     save_checkpoints_steps=save_checkpoints_steps,
  #     log_step_count_steps=FLAGS.log_step_count_steps,
  #     session_config=tf.ConfigProto(
  #         graph_options=tf.GraphOptions(
  #             rewrite_options=rewriter_config_pb2.RewriterConfig(
  #                 disable_meta_optimizer=True))),
  #     tpu_config=tf.contrib.tpu.TPUConfig(
  #         iterations_per_loop=FLAGS.iterations_per_loop,
  #         per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
  #         .PER_HOST_V2))  # pylint: disable=line-too-long

  config = tf.estimator.RunConfig(
      # cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True,
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True)),
          gpu_options=gpu_options),
      train_distribute=distribution,
      # tpu_config=tf.contrib.tpu.TPUConfig(
      #     iterations_per_loop=FLAGS.iterations_per_loop,
      #     per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
      #     .PER_HOST_V2)
  )
  # Initializes model parameters.
  # params = dict(steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size)
  # model_est = tf.estimator.Estimator(
  #     use_tpu=FLAGS.use_tpu,
  #     model_fn=final_model_fn,
  #     config=config,
  #     train_batch_size=FLAGS.train_batch_size,
  #     eval_batch_size=FLAGS.eval_batch_size,
  #     export_to_tpu=FLAGS.export_to_tpu,
  #     params=params)
  params = dict(
      steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size,
      batch_size=FLAGS.train_batch_size)
  model_est = tf.estimator.Estimator(
      model_fn=final_model_fn,
      config=config,
      params=params)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=False,
            transpose_input=FLAGS.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=FLAGS.transpose_input,
            cache=FLAGS.use_cache and is_training,
            image_size=FLAGS.input_image_size,
            num_parallel_calls=FLAGS.num_parallel_calls,
            use_bfloat16=False) for is_training in [True, False]
    ]

  if FLAGS.mode == 'eval':
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = model_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                        eval_results, elapsed_time)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', ckpt)

    if FLAGS.export_dir:
      export(model_est, FLAGS.export_dir, FLAGS.post_quantize)
  else:   # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / params['steps_per_epoch'], current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if FLAGS.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      model_est.train(
          input_fn=imagenet_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=hooks)

    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        model_est.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = model_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s',
                        next_checkpoint, eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)
      if FLAGS.export_dir:
        export(model_est, FLAGS.export_dir, FLAGS.post_quantize)
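The train_and_eval branch above follows the same pattern as most examples in this list: train in chunks of --steps_per_eval, then evaluate the checkpoint written at the end of each chunk. A compact, estimator-agnostic sketch of that loop; every name here is a placeholder rather than one of the flags or objects used above.

import tensorflow as tf

def interleave_train_and_eval(est, train_input_fn, eval_input_fn,
                              total_steps, steps_per_eval, eval_steps,
                              current_step=0):
  # Alternate training and evaluation until total_steps is reached.
  while current_step < total_steps:
    next_checkpoint = min(current_step + steps_per_eval, total_steps)
    est.train(input_fn=train_input_fn, max_steps=next_checkpoint)
    current_step = next_checkpoint
    # Evaluates the most recent checkpoint in est.model_dir.
    eval_results = est.evaluate(input_fn=eval_input_fn, steps=eval_steps)
    tf.logging.info('Eval results at step %d: %s', current_step, eval_results)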
Ejemplo n.º 24
0
def main(unused_argv):

    model_config.show_info()
    train_config.show_info()
    preproc_config.show_info()

    # Create run-specific checkpoint directories (remote model dir and local config-logging dir).
    now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    curr_model_dir       = "{}/run-{}/".format(FLAGS.model_dir, now)
    curr_model_dir_local = "{}/run-{}/".format(EXPORT_MODEL_DIR, now)

    tf.logging.info('[main] data dir = %s' % FLAGS.data_dir)
    tf.logging.info('[main] model dir = %s' % curr_model_dir)
    tf.logging.info('[main] config logging dir = %s' % curr_model_dir_local)
    tf.logging.info('------------------------')

    if not tf.gfile.Exists(curr_model_dir):
        tf.gfile.MakeDirs(curr_model_dir)

    if not tf.gfile.Exists(curr_model_dir_local):
        tf.gfile.MakeDirs(curr_model_dir_local)

    FLAGS.model_dir = curr_model_dir

    # Log and save the config dictionaries.
    tf.logging.info(str(train_config_dict))
    tf.logging.info(str(model_config_dict))
    tf.logging.info(str(preproc_config_dict))

    train_config_filename   = curr_model_dir_local + 'train_config' + '.json'
    model_config_filename   = curr_model_dir_local + 'model_config' + '.json'
    preproc_config_filename = curr_model_dir_local + 'preproc_config' + '.json'

    with open(train_config_filename, 'w') as fp:
        json.dump(str(train_config_dict), fp)

    with open(model_config_filename, 'w') as fp:
        json.dump(str(model_config_dict), fp)

    with open(preproc_config_filename, 'w') as fp:
        json.dump(str(preproc_config_dict), fp)


    try:
        cmd = "sudo gsutil cp -r {} {}".format(curr_model_dir_local + '* ', curr_model_dir)
        print('[main] cmd=%s' % cmd)
        check_output(cmd, shell=True)
        tf.logging.info('[main] successfully copied config files to bucket')
    except Exception:
        tf.logging.info('[main] failed to copy config files to bucket')


    # Session configuration for CPU or GPU use.
    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 log_device_placement=False,
                                 gpu_options=tf.GPUOptions(allow_growth=True))

    config = tf.estimator.RunConfig(
                model_dir                       =FLAGS.model_dir,
                tf_random_seed                  =None,
                save_summary_steps              =FLAGS.summary_step,
                save_checkpoints_steps          =max(600, FLAGS.iterations_per_loop),
                session_config                  =sess_config,
                keep_checkpoint_max             =5,
                keep_checkpoint_every_n_hours   =10000,
                log_step_count_steps            =FLAGS.log_step_count_steps,
                train_distribute                =None)

    dontbeturtle_estimator  = tf.estimator.Estimator(
                model_dir          = FLAGS.model_dir,
                model_fn           = model_fn,
                config             = config,
                params             = None,
                warm_start_from    = None)

    # Data loader.
    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    dataset_train, dataset_eval = \
        [data_loader_coco.DataSetInput(
        is_training     =is_training,
        data_dir        =FLAGS.data_dir,
        transpose_input =FLAGS.transpose_input,
        use_bfloat16    =False) for is_training in [True, False]]



    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')

            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = dontbeturtle_estimator.evaluate(
                    input_fn        =dataset_eval.input_fn,
                    steps           =eval_steps,
                    checkpoint_path =ckpt)

                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d' %
                                (eval_results, elapsed_time))

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                      'Evaluation finished after training step %d' % current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the GPU worker,
                # sometimes the GPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

    else:   # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
        batchnum_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size

        tf.logging.info('[main] num_train_images=%s' % FLAGS.num_train_images)
        tf.logging.info('[main] train_batch_size=%s' % FLAGS.train_batch_size)
        tf.logging.info('[main] batchnum_per_epoch=%s' % batchnum_per_epoch)
        tf.logging.info('[main] Training for %d steps (%.2f epochs in total). Current'
                        ' step %d.' % (FLAGS.train_steps,
                                       FLAGS.train_steps / batchnum_per_epoch,
                                       current_step))

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            dontbeturtle_estimator.train(
                input_fn    =dataset_train.input_fn,
                max_steps   =FLAGS.train_steps)
            tf.logging.info('[main] Training only')

        else:
            assert FLAGS.mode == 'train_and_eval'
            tf.logging.info('[main] Training and Evaluation')

            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                dontbeturtle_estimator.train(
                    input_fn    =dataset_train.input_fn,
                    max_steps   =next_checkpoint)

                current_step = next_checkpoint

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be consistently excluded modulo the batch size.
                tf.logging.info('Starting to evaluate.')
                eval_results    = dontbeturtle_estimator.evaluate(
                    input_fn    =dataset_eval.input_fn,
                    steps       =FLAGS.num_eval_images // FLAGS.eval_batch_size)

                tf.logging.info('Eval results: %s' % eval_results)

                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Finished training up to step %d. Elapsed seconds %d.' %
                                (current_step, elapsed_time))
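The snippet above shells out to "sudo gsutil cp" to copy the config JSON files into the bucket. Assuming the standard tf.gfile API, the same copy can be done in-process, which works for both local paths and gs:// URLs and does not depend on gsutil being installed; a sketch using the filenames defined above:

import os
import tensorflow as tf

for filename in [train_config_filename, model_config_filename,
                 preproc_config_filename]:
    dst = os.path.join(curr_model_dir, os.path.basename(filename))
    # tf.gfile.Copy accepts local and GCS paths on both sides.
    tf.gfile.Copy(filename, dst, overwrite=True)
    tf.logging.info('[main] copied %s to %s', filename, dst)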
Ejemplo n.º 25
0
def main(unused_argv):
    params = params_dict.ParamsDict(resnet_config.RESNET_CFG,
                                    resnet_config.RESNET_RESTRICTIONS)
    params = params_dict.override_params_dict(params,
                                              FLAGS.config_file,
                                              is_strict=True)
    params = params_dict.override_params_dict(params,
                                              FLAGS.params_override,
                                              is_strict=True)

    params = flags_to_params.override_params_from_input_flags(params, FLAGS)

    params.validate()
    params.lock()

    # tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
    #     FLAGS.tpu if (FLAGS.tpu or params.use_tpu) else '',
    #     zone=FLAGS.tpu_zone,
    #     project=FLAGS.gcp_project)

    tpu_address = ''

    if 'COLAB_TPU_ADDR' not in os.environ:
        print(
            'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
        )
    else:
        tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']

        with tf.Session(tpu_address) as sess:
            with open('/content/adc.json', 'r') as f:
                auth_info = json.load(f)

            tf.contrib.cloud.configure_gcs(sess, credentials=auth_info)

    if params.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(5000, params.iterations_per_loop)
    config = tf.contrib.tpu.RunConfig(
        # cluster=tpu_cluster_resolver,
        master=tpu_address,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=params.iterations_per_loop,
            num_shards=params.num_cores,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    warm_start_settings = None

    if FLAGS.warm_start_from:
        warm_start_settings = tf.estimator.WarmStartSettings(
            FLAGS.warm_start_from, vars_to_warm_start='^(?!.*dense)')

    resnet_classifier = tf.contrib.tpu.TPUEstimator(
        use_tpu=params.use_tpu,
        warm_start_from=warm_start_settings,
        model_fn=resnet_model_fn,
        config=config,
        params=params.as_dict(),
        train_batch_size=params.train_batch_size,
        eval_batch_size=params.eval_batch_size,
        export_to_tpu=FLAGS.export_to_tpu)

    assert (params.precision == 'bfloat16' or params.precision
            == 'float32'), ('Invalid value for precision parameter; '
                            'must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', params.precision)
    use_bfloat16 = params.precision == 'bfloat16'

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s',
                        FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=use_bfloat16,
                transpose_input=params.transpose_input,
                selection=selection)
            for (is_training,
                 selection) in [(True, select_train), (False, select_eval)]
        ]
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=params.transpose_input,
                cache=params.use_cache and is_training,
                image_size=params.image_size,
                num_parallel_calls=params.num_parallel_calls,
                include_background_label=(params.num_label_classes == 1001),
                use_bfloat16=use_bfloat16) for is_training in [True, False]
        ]

    steps_per_epoch = params.num_train_images // params.train_batch_size
    eval_steps = params.num_eval_images // params.eval_batch_size

    if FLAGS.mode == 'eval':

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time(
                )  # This time will include compilation time
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
        steps_per_epoch = params.num_train_images // params.train_batch_size
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', params.train_steps,
            params.train_steps / steps_per_epoch, current_step)

        start_timestamp = time.time(
        )  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if params.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(5000, params.iterations_per_loop)))
            if FLAGS.profile_every_n_steps > 0:
                hooks.append(
                    tpu_profiler_hook.TPUProfilerHook(
                        save_steps=FLAGS.profile_every_n_steps,
                        output_dir=FLAGS.model_dir,
                        tpu=FLAGS.tpu))
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=params.train_steps,
                                    hooks=hooks)

        elif FLAGS.mode == 'train_and_eval':
            while current_step < params.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      params.train_steps)
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=int(next_checkpoint))
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=params.num_eval_images // params.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                params.train_steps, elapsed_time)

        if FLAGS.export_dir is not None:
            # The guide to serve an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            export_path = resnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
            if FLAGS.add_warmup_requests:
                inference_warmup.write_warmup_requests(
                    export_path,
                    FLAGS.model_name,
                    params.image_size,
                    batch_sizes=FLAGS.inference_batch_sizes,
                    image_format='JPEG')
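eval_steps above is num_eval_images // eval_batch_size, so any images left over after the last full batch are never evaluated; that is what the recurring comment "some images may be excluded modulo the batch size" refers to. A worked example with made-up numbers:

num_eval_images = 50000  # hypothetical dataset size
eval_batch_size = 1024   # hypothetical batch size

eval_steps = num_eval_images // eval_batch_size        # 48 full batches
images_evaluated = eval_steps * eval_batch_size        # 48 * 1024 = 49152
images_dropped = num_eval_images - images_evaluated    # 848 images skipped
print(eval_steps, images_evaluated, images_dropped)    # 48 49152 848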
Ejemplo n.º 26
0
def main(unused_argv):
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    config = tpu_config.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=max(600, FLAGS.iterations_per_loop),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_cores,
            per_host_input_for_training=tpu_config.InputPipelineConfig.PER_HOST_V2))  # pylint: disable=line-too-long

    resnet_classifier = tpu_estimator.TPUEstimator(
        export_to_tpu=False,
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', (
        'Invalid value for --precision flag; must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', FLAGS.precision)
    use_bfloat16 = FLAGS.precision == 'bfloat16'

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(is_training=is_training,
                                     data_dir=FLAGS.data_dir,
                                     transpose_input=FLAGS.transpose_input,
                                     use_bfloat16=use_bfloat16)
        for is_training in [True, False]
    ]

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time(
                )  # This time will include compilation time
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d' %
                                (eval_results, elapsed_time))

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
        batches_per_epoch = FLAGS.num_train_images / FLAGS.train_batch_size
        tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                        ' step %d.' % (FLAGS.train_steps, FLAGS.train_steps /
                                       batches_per_epoch, current_step))

        start_timestamp = time.time(
        )  # This time will include compilation time
        if FLAGS.mode == 'train':
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=FLAGS.train_steps)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=next_checkpoint)
                current_step = next_checkpoint

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be consistently excluded modulo the batch size.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
                tf.logging.info('Eval results: %s' % eval_results)

        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info(
            'Finished training up to step %d. Elapsed seconds %d.' %
            (FLAGS.train_steps, elapsed_time))

        if FLAGS.export_dir is not None:
            # The guide to serve an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            resnet_classifier.export_savedmodel(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
Ejemplo n.º 27
0
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  # RevNet specific configuration
  revnet_config = {
      "revnet-56": config_.get_hparams_imagenet_56(),
      "revnet-104": config_.get_hparams_imagenet_104()
  }[FLAGS.revnet_config]

  if FLAGS.use_tpu:
    revnet_config.data_format = "channels_last"

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  # Estimator specific configuration
  config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.
          PER_HOST_V2),
  )

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train, imagenet_eval = [
      imagenet_input.ImageNetInput(
          is_training=is_training,
          data_dir=FLAGS.data_dir,
          transpose_input=FLAGS.transpose_input,
          use_bfloat16=False) for is_training in [True, False]
  ]

  revnet_classifier = tf.contrib.tpu.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=revnet_config.tpu_batch_size,
      eval_batch_size=revnet_config.tpu_eval_batch_size,
      config=config,
      export_to_tpu=False,
      params={"revnet_config": revnet_config})

  steps_per_epoch = revnet_config.tpu_iters_per_epoch
  eval_steps = revnet_config.tpu_eval_steps

  # pylint: disable=protected-access
  if FLAGS.mode == "eval":
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info("Starting to evaluate.")
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = revnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info("Eval results: %s. Elapsed seconds: %d" %
                        (eval_results, elapsed_time))

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split("-")[1])
        if current_step >= revnet_config.max_train_iter:
          tf.logging.info(
              "Evaluation finished after training step %d" % current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            "Checkpoint %s no longer exists, skipping checkpoint" % ckpt)

  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)

    tf.logging.info(
        "Training for %d steps (%.2f epochs in total). Current"
        " step %d." % (revnet_config.max_train_iter,
                       revnet_config.max_train_iter / steps_per_epoch,
                       current_step))

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == "train":
      revnet_classifier.train(
          input_fn=imagenet_train.input_fn,
          max_steps=revnet_config.max_train_iter)

    else:
      assert FLAGS.mode == "train_and_eval"
      while current_step < revnet_config.max_train_iter:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              revnet_config.max_train_iter)
        revnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        tf.logging.info("Finished training up to step %d. Elapsed seconds %d." %
                        (next_checkpoint, int(time.time() - start_timestamp)))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info("Starting to evaluate.")
        eval_results = revnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn, steps=eval_steps)
        tf.logging.info("Eval results: %s" % eval_results)

        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info("Finished training up to step %d. Elapsed seconds %d." %
                        (revnet_config.max_train_iter, elapsed_time))

    if FLAGS.export_dir is not None:
      # The guide to serve an exported TensorFlow model is at:
      #    https://www.tensorflow.org/serving/serving_basic
      tf.logging.info("Starting to export model.")
      revnet_classifier.export_savedmodel(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
def main(_):

    config = process_config(FLAGS.config_path)
    print(config)

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        deploy_config = deploy.DeploymentConfig(num_clones=1)

        global_step = tf.Variable(0, trainable=False, name='global_step')

        # select model and build net
        net = tdr2n2.Unet(config)

        # create batch dataset
        with tf.device(deploy_config.inputs_device()):
            data = DataGenerator(config.input)

            x_test, y_test = data.get_eval_data()
            x_test = tf.expand_dims(x_test, -1)
            x_test.set_shape([
                None, config.input.img_out_shape[0],
                config.input.img_out_shape[1], config.input.img_out_shape[2]
            ])
            y_test.set_shape([
                None, config.input.mask_out_shape[0],
                config.input.mask_out_shape[1]
            ])
            y_test = tf.cast(y_test, tf.int32)
            y_test_hot = tf.one_hot(y_test,
                                    depth=config.network.num_classes,
                                    axis=-1)

        f_score, end_points = net.net(x_test)
        f_score_img = tf.expand_dims(
            tf.cast(tf.argmax(f_score, axis=-1), tf.float32) * 50., -1)
        y_test_img = tf.expand_dims(
            tf.cast(tf.argmax(y_test_hot, axis=-1), tf.float32) * 50., -1)

        ## add precision and recall
        f_score = tf.cast(tf.argmax(f_score, -1), tf.int32)
        #f_score = tf.image.resize_bilinear(f_score, (config.input.img_out_shape[0]))
        f_score = tf.one_hot(f_score,
                             depth=config.network.num_classes,
                             axis=-1)
        pred = tf.reduce_sum(f_score * y_test_hot, axis=(0, 1, 2))
        all_pred = tf.reduce_sum(f_score, axis=(0, 1, 2)) + 1e-5
        all_true = tf.reduce_sum(y_test_hot, axis=(0, 1, 2)) + 1e-5

        # Variables to restore: moving avg. or normal weights.
        if config.train.moving_average_decay:
            variable_averages = tf.train.ExponentialMovingAverage(
                config.train.moving_average_decay, global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[global_step.op.name] = global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        saver = None
        if variables_to_restore is not None:
            saver = tf_saver.Saver(variables_to_restore)

        # =================================================================== #
        # Evaluation loop.
        # =================================================================== #
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=config.deploy.gpu_memory_fraction)
        configproto = tf.ConfigProto(
            gpu_options=gpu_options,
            log_device_placement=False,
            allow_soft_placement=True,
        )

        merged = tf.summary.merge_all()
        sum_writer = tf.summary.FileWriter(logdir=config.summary.test_dir)

        for checkpoint_path in evaluation.checkpoints_iterator(
                config.finetune.eval_checkpoint_dir):
            with tf.Session(config=configproto) as session:
                session.run(tf.global_variables_initializer())
                session.run(data.get_iterator(is_train=False).initializer)
                saver.restore(session, checkpoint_path)

                logging.info('Starting evaluation at ' +
                             time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
                k = 1
                tp = []
                tp_fp = []
                tp_fn = []
                imgs = []
                while True:
                    try:
                        pred_, all_pred_, all_true_, pred_img, true_img, g_step = session.run(
                            [
                                pred, all_pred, all_true, f_score_img,
                                y_test_img, global_step
                            ])
                        tp.append(np.expand_dims(pred_, 0))
                        # all_pred_ counts predicted positives (tp + fp);
                        # all_true_ counts ground-truth positives (tp + fn).
                        tp_fp.append(np.expand_dims(all_pred_, 0))
                        tp_fn.append(np.expand_dims(all_true_, 0))
                        #img = util.merge_pics(pred_img, true_img)

                        print("Processed {} images".format(
                            k * config.input.batch_size))
                        k += 1
                    except tf.errors.OutOfRangeError:
                        tp_ = np.sum(np.concatenate(tp, 0), 0)
                        tp_fn_ = np.sum(np.concatenate(tp_fn, 0), 0)
                        tp_fp_ = np.sum(np.concatenate(tp_fp, 0), 0)
                        precision = tp_ / tp_fp_
                        recall = tp_ / tp_fn_
                        dice = 2 * tp_ / (tp_fp_ + tp_fn_)

                        print(precision)
                        print(recall)
                        print(dice)
                        summary = tf.Summary()
                        for i in range(recall.shape[0]):
                            summary.value.add(
                                tag='evaluation/{}th_class_precision'.format(
                                    i),
                                simple_value=precision[i])
                            summary.value.add(
                                tag='evaluation/{}th_class_recall'.format(i),
                                simple_value=recall[i])
                            summary.value.add(
                                tag='evaluation/{}th_class_dice'.format(i),
                                simple_value=dice[i])
                        sum_writer.add_summary(summary, g_step)

                        break
                logging.info('Finished evaluation at ' +
                             time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
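The per-class metrics at the end of the loop above reduce to three accumulated count vectors. A standalone sketch with made-up numbers for a 2-class problem, showing the same precision / recall / Dice arithmetic:

import numpy as np

# Hypothetical accumulated counts per class.
tp_ = np.array([80.0, 45.0])      # true positives
tp_fp_ = np.array([100.0, 60.0])  # predicted positives (tp + fp)
tp_fn_ = np.array([90.0, 50.0])   # ground-truth positives (tp + fn)

precision = tp_ / tp_fp_            # [0.8, 0.75]
recall = tp_ / tp_fn_               # [0.889, 0.9] (approx.)
dice = 2 * tp_ / (tp_fp_ + tp_fn_)  # [0.842, 0.818] (approx.)
print(precision, recall, dice)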
Ejemplo n.º 29
0
def main(argv):
  FLAGS = argv[0]  # pylint:disable=invalid-name,redefined-outer-name
  tf.logging.set_verbosity(tf.logging.INFO)

  # RevNet specific configuration
  config = main_.get_config(config_name=FLAGS.config, dataset=FLAGS.dataset)

  if FLAGS.use_tpu:
    tf.logging.info("Using TPU.")
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster_resolver = None

  # TPU specific configuration
  tpu_config = tf.contrib.tpu.TPUConfig(
      # Recommended to be set as number of global steps for next checkpoint
      iterations_per_loop=FLAGS.iterations_per_loop,
      num_shards=FLAGS.num_shards)

  # Estimator specific configuration
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config,
  )

  # Construct TPU Estimator
  estimator = tf.contrib.tpu.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=config.tpu_batch_size,
      eval_batch_size=config.tpu_eval_batch_size,
      config=run_config,
      params={
          "FLAGS": FLAGS,
          "config": config,
      })

  # Construct input functions
  train_input_fn = get_input_fn(
      config=config, data_dir=FLAGS.data_dir, split="train_all")
  eval_input_fn = get_input_fn(
      config=config, data_dir=FLAGS.data_dir, split="test")

  # Disabling a range within an else block currently doesn't work
  # due to https://github.com/PyCQA/pylint/issues/872
  # pylint: disable=protected-access
  if FLAGS.mode == "eval":
    # TPUEstimator.evaluate *requires* a steps argument.
    # Note that the number of examples used during evaluation is
    # --eval_steps * --batch_size.
    # So if you change --batch_size then change --eval_steps too.
    eval_steps = 10000 // config.tpu_eval_batch_size

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info("Starting to evaluate.")
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = estimator.evaluate(
            input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info("Eval results: %s. Elapsed seconds: %d" %
                        (eval_results, elapsed_time))

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split("-")[1])
        if current_step >= config.max_train_iter:
          tf.logging.info(
              "Evaluation finished after training step %d" % current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            "Checkpoint %s no longer exists, skipping checkpoint" % ckpt)

  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator_._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)
    tf.logging.info("Training for %d steps. Current"
                    " step %d." % (config.max_train_iter, current_step))

    start_timestamp = time.time()  # This time will include compilation time
    if FLAGS.mode == "train":
      estimator.train(input_fn=train_input_fn, max_steps=config.max_train_iter)
    else:
      eval_steps = 10000 // config.tpu_eval_batch_size
      assert FLAGS.mode == "train_and_eval"
      while current_step < config.max_train_iter:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              config.max_train_iter)
        estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be consistently excluded modulo the batch size.
        tf.logging.info("Starting to evaluate.")
        eval_results = estimator.evaluate(
            input_fn=eval_input_fn, steps=eval_steps)
        tf.logging.info("Eval results: %s" % eval_results)

    elapsed_time = int(time.time() - start_timestamp)
    tf.logging.info("Finished training up to step %d. Elapsed seconds %d." %
                    (config.max_train_iter, elapsed_time))
Ejemplo n.º 30
0
def main(unused_argv):
    params = resnet_params.from_file(FLAGS.param_file)
    params = resnet_params.override(params, FLAGS.param_overrides)
    resnet_params.log_hparams_to_model_dir(params, FLAGS.model_dir)
    tf.logging.info('Model params: {}'.format(params))

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu if (FLAGS.tpu or params['use_tpu']) else '',
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)

    if params['use_async_checkpointing']:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, params['iterations_per_loop'])
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=params['iterations_per_loop'],
            num_shards=params['num_cores'],
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    if FLAGS.inference_with_all_cores:
        resnet_classifier = tf.contrib.tpu.TPUEstimator(
            use_tpu=params['use_tpu'],
            model_fn=resnet_model_fn,
            config=config,
            params=params,
            train_batch_size=params['train_batch_size'],
            eval_batch_size=params['eval_batch_size'],
            export_to_tpu=FLAGS.export_to_tpu,
            experimental_exported_model_uses_all_cores=FLAGS.
            inference_with_all_cores)
    else:
        resnet_classifier = tf.contrib.tpu.TPUEstimator(
            use_tpu=params['use_tpu'],
            model_fn=resnet_model_fn,
            config=config,
            params=params,
            train_batch_size=params['train_batch_size'],
            eval_batch_size=params['eval_batch_size'],
            export_to_tpu=FLAGS.export_to_tpu)
    assert (params['precision'] == 'bfloat16' or params['precision']
            == 'float32'), ('Invalid value for precision parameter; '
                            'must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', params['precision'])
    use_bfloat16 = params['precision'] == 'bfloat16'

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s',
                        FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=use_bfloat16,
                transpose_input=params['transpose_input'],
                selection=selection)
            for (is_training,
                 selection) in [(True, select_train), (False, select_eval)]
        ]
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=params['transpose_input'],
                cache=params['use_cache'] and is_training,
                image_size=params['image_size'],
                num_parallel_calls=params['num_parallel_calls'],
                use_bfloat16=use_bfloat16) for is_training in [True, False]
        ]

    steps_per_epoch = params['num_train_images'] // params['train_batch_size']
    eval_steps = params['num_eval_images'] // params['eval_batch_size']

    if FLAGS.mode == 'eval':

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time(
                )  # This time will include compilation time
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params['train_steps']:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
        steps_per_epoch = params['num_train_images'] // params[
            'train_batch_size']
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', params['train_steps'],
            params['train_steps'] / steps_per_epoch, current_step)

        start_timestamp = time.time(
        )  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if params['use_async_checkpointing']:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, params['iterations_per_loop'])))
            if FLAGS.profile_every_n_steps > 0:
                hooks.append(
                    tpu_profiler_hook.TPUProfilerHook(
                        save_steps=FLAGS.profile_every_n_steps,
                        output_dir=FLAGS.model_dir,
                        tpu=FLAGS.tpu))
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=params['train_steps'],
                                    hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < params['train_steps']:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      params['train_steps'])
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=params['num_eval_images'] //
                    params['eval_batch_size'])
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                params['train_steps'], elapsed_time)

        if FLAGS.export_dir is not None:
            # The guide to serve an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            export_path = resnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
            if FLAGS.add_warmup_requests:
                inference_warmup.write_warmup_requests(
                    export_path,
                    FLAGS.model_name,
                    params['image_size'],
                    batch_sizes=FLAGS.inference_batch_sizes,
                    image_format='JPEG')
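
A minimal, framework-free sketch of the train_and_eval schedule used above: train in chunks of FLAGS.steps_per_eval, then evaluate, until train_steps is reached. The train_to and evaluate callables here are hypothetical stand-ins for the TPUEstimator calls, not the real API.

def run_train_and_eval(current_step, train_steps, steps_per_eval, train_to, evaluate):
    """Alternate training and evaluation until train_steps is reached (sketch)."""
    while current_step < train_steps:
        # Train up to the next evaluation point, never past train_steps.
        next_checkpoint = min(current_step + steps_per_eval, train_steps)
        train_to(next_checkpoint)
        current_step = next_checkpoint
        # Evaluate the checkpoint that was just written.
        evaluate()
    return current_step

# Example with no-op callables: runs four train/eval cycles and returns 1000.
assert run_train_and_eval(0, 1000, 250, lambda s: None, lambda: None) == 1000
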
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    # RevNet specific configuration
    revnet_config = {
        "revnet-56": config_.get_hparams_imagenet_56(),
        "revnet-104": config_.get_hparams_imagenet_104()
    }[FLAGS.revnet_config]

    if FLAGS.use_tpu:
        revnet_config.data_format = "channels_last"

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    # Estimator specific configuration
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_shards,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.
            PER_HOST_V2),
    )

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(is_training=is_training,
                                     data_dir=FLAGS.data_dir,
                                     transpose_input=FLAGS.transpose_input,
                                     use_bfloat16=False)
        for is_training in [True, False]
    ]

    revnet_classifier = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=revnet_config.tpu_batch_size,
        eval_batch_size=revnet_config.tpu_eval_batch_size,
        config=config,
        export_to_tpu=False,
        params={"revnet_config": revnet_config})

    steps_per_epoch = revnet_config.tpu_iters_per_epoch
    eval_steps = revnet_config.tpu_eval_steps

    # pylint: disable=protected-access
    if FLAGS.mode == "eval":
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info("Starting to evaluate.")
            try:
                start_timestamp = time.time(
                )  # This time will include compilation time
                eval_results = revnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info("Eval results: %s. Elapsed seconds: %d" %
                                (eval_results, elapsed_time))

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split("-")[1])
                if current_step >= revnet_config.max_train_iter:
                    tf.logging.info(
                        "Evaluation finished after training step %d" %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    "Checkpoint %s no longer exists, skipping checkpoint" %
                    ckpt)

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)

        tf.logging.info(
            "Training for %d steps (%.2f epochs in total). Current"
            " step %d." %
            (revnet_config.max_train_iter,
             revnet_config.max_train_iter / steps_per_epoch, current_step))

        start_timestamp = time.time(
        )  # This time will include compilation time

        if FLAGS.mode == "train":
            revnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=revnet_config.max_train_iter)

        else:
            assert FLAGS.mode == "train_and_eval"
            while current_step < revnet_config.max_train_iter:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      revnet_config.max_train_iter)
                revnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    "Finished training up to step %d. Elapsed seconds %d." %
                    (next_checkpoint, int(time.time() - start_timestamp)))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info("Starting to evaluate.")
                eval_results = revnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn, steps=eval_steps)
                tf.logging.info("Eval results: %s" % eval_results)

                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info(
                    "Finished training up to step %d. Elapsed seconds %d." %
                    (revnet_config.max_train_iter, elapsed_time))

        if FLAGS.export_dir is not None:
            # The guide to serve an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info("Starting to export model.")
            revnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
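
The eval loops in these samples extract the global step by parsing the checkpoint basename (for example 'model.ckpt-25000' gives 25000) and stop once it reaches the final training step. A small sketch of that parsing and stopping rule, assuming the standard 'model.ckpt-<step>' naming:

import os


def checkpoint_step(ckpt_path):
    """Return the global step encoded in a 'model.ckpt-<step>' checkpoint path."""
    return int(os.path.basename(ckpt_path).split('-')[1])


def should_stop_eval(ckpt_path, max_train_steps):
    """True once the evaluated checkpoint is at or past the final training step."""
    return checkpoint_step(ckpt_path) >= max_train_steps


assert checkpoint_step('/tmp/model_dir/model.ckpt-25000') == 25000
assert should_stop_eval('/tmp/model_dir/model.ckpt-25000', 20000)
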
Ejemplo n.º 32
0
    def testMonitorCheckpointsLoopTimeout(self):
        ret = list(
            evaluation_lib.checkpoints_iterator(
                '/non-existent-dir', timeout=0))
        self.assertEqual(ret, [])
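
The test above relies on checkpoints_iterator returning immediately when timeout=0 and no checkpoint exists. A simplified pure-Python stand-in (not the TensorFlow implementation) that mimics that behaviour by polling a directory for checkpoint index files:

import glob
import os
import time


def simple_checkpoints_iterator(checkpoint_dir, timeout=0, poll_secs=1):
    """Yield checkpoint prefixes as they appear; give up after `timeout` seconds."""
    seen = set()
    deadline = time.time() + timeout
    while True:
        pattern = os.path.join(checkpoint_dir, '*.index')
        for index_file in sorted(glob.glob(pattern)):
            prefix = index_file[:-len('.index')]
            if prefix not in seen:
                seen.add(prefix)
                yield prefix
        if time.time() >= deadline:
            return
        time.sleep(poll_secs)


# With timeout=0 and no checkpoints this yields nothing, matching the test above.
assert list(simple_checkpoints_iterator('/non-existent-dir', timeout=0)) == []
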
Ejemplo n.º 33
0
def main(unused_argv):
    del unused_argv  # Unused

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', (
        'Invalid value for --precision flag; must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', FLAGS.precision)

    batch_size_per_shard = FLAGS.train_batch_size // FLAGS.num_shards
    params = {
        'model_transpose_dims': [0, 1, 2, 3],
        'pipeline_transpose_dims': [0, 1, 2, 3],
    }

    batch_axis = 0
    if FLAGS.transpose_enabled:
        # On the TPU, convolutions are executed with a different leading
        # dimension when the batch size per shard is less than 64. By
        # default, images are loaded in NHWC order. For optimal performance,
        # we want to use CHWN order while training when the batch size per
        # shard is smaller than 64.

        if batch_size_per_shard >= 64:
            params['model_transpose_dims'] = [3, 0, 1, 2]
            params['pipeline_transpose_dims'] = [1, 2, 3, 0]
            batch_axis = 3
        else:
            params['model_transpose_dims'] = [2, 0, 1, 3]
            params['pipeline_transpose_dims'] = [1, 2, 0, 3]
            batch_axis = 2

    if FLAGS.eval_total_size > 0:
        eval_size = FLAGS.eval_total_size
    else:
        eval_size = _NUM_EVAL_IMAGES
    eval_steps = eval_size // FLAGS.eval_batch_size

    iterations = (eval_steps if FLAGS.mode == 'eval' else FLAGS.iterations)

    eval_batch_size = (None
                       if FLAGS.mode == 'train' else FLAGS.eval_batch_size)

    per_host_input_for_training = (FLAGS.num_shards <= 8
                                   if FLAGS.mode == 'train' else True)

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        save_summary_steps=FLAGS.save_summary_steps,
        session_config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=iterations,
            num_shards=FLAGS.num_shards,
            per_host_input_for_training=per_host_input_for_training))

    inception_classifier = tf.contrib.tpu.TPUEstimator(
        model_fn=inception_model_fn,
        use_tpu=FLAGS.use_tpu,
        config=run_config,
        params=params,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=eval_batch_size,
        batch_axis=(batch_axis, 0))

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    use_bfloat16 = FLAGS.precision == 'bfloat16'
    imagenet_train = InputPipeline(is_training=True,
                                   data_dir=FLAGS.data_dir,
                                   use_bfloat16=use_bfloat16)
    imagenet_eval = InputPipeline(is_training=False,
                                  data_dir=FLAGS.data_dir,
                                  use_bfloat16=use_bfloat16)

    if FLAGS.moving_average:
        eval_hooks = [LoadEMAHook(FLAGS.model_dir)]
    else:
        eval_hooks = []

    if FLAGS.mode == 'eval':
        # Run evaluation when there is a new checkpoint
        for checkpoint in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # Includes compilation time
                eval_results = inception_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    hooks=eval_hooks,
                    checkpoint_path=checkpoint)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(checkpoint).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break
            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    checkpoint)

    elif FLAGS.mode == 'train_and_eval':
        for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
            tf.logging.info('Starting training cycle %d.' % cycle)
            inception_classifier.train(input_fn=imagenet_train.input_fn,
                                       steps=FLAGS.train_steps_per_eval)

            tf.logging.info('Starting evaluation cycle %d.' % cycle)
            eval_results = inception_classifier.evaluate(
                input_fn=imagenet_eval.input_fn,
                steps=eval_steps,
                hooks=eval_hooks)
            tf.logging.info('Evaluation results: %s' % eval_results)

    else:
        tf.logging.info('Starting training ...')
        inception_classifier.train(input_fn=imagenet_train.input_fn,
                                   max_steps=FLAGS.train_steps)

    if FLAGS.export_dir is not None:
        tf.logging.info('Starting to export model.')
        inception_classifier.export_saved_model(
            export_dir_base=FLAGS.export_dir,
            serving_input_receiver_fn=image_serving_input_fn)
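
The transpose logic near the top of this example picks permutations based on the per-shard batch size. A small sketch of that selection, returning the same model/pipeline permutations and batch axis as the code above (illustrative only):

def select_transpose_dims(batch_size_per_shard, transpose_enabled=True):
    """Mirror the permutation choice used above for TPU-friendly layouts."""
    if not transpose_enabled:
        return [0, 1, 2, 3], [0, 1, 2, 3], 0
    if batch_size_per_shard >= 64:
        return [3, 0, 1, 2], [1, 2, 3, 0], 3
    return [2, 0, 1, 3], [1, 2, 0, 3], 2


# A per-shard batch of 128 keeps the batch dimension on axis 3 for the model.
assert select_transpose_dims(128) == ([3, 0, 1, 2], [1, 2, 3, 0], 3)
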
Ejemplo n.º 34
0
def main(argv):
    del argv  # Unused.

    tf.enable_resource_variables()
    tf.set_random_seed(FLAGS.seed)
    set_lr_schedule()
    set_custom_sparsity_map()
    folder_stub = os.path.join(FLAGS.training_method, str(FLAGS.end_sparsity),
                               str(FLAGS.maskupdate_begin_step),
                               str(FLAGS.maskupdate_end_step),
                               str(FLAGS.maskupdate_frequency),
                               str(FLAGS.drop_fraction),
                               str(FLAGS.label_smoothing),
                               str(FLAGS.weight_decay))

    output_dir = FLAGS.output_dir
    if FLAGS.use_folder_stub:
        output_dir = os.path.join(output_dir, folder_stub)

    export_dir = os.path.join(output_dir, 'export_dir')

    # We pass the updated eval and train string to the params dictionary.
    params = {}
    params['output_dir'] = output_dir
    params['training_method'] = FLAGS.training_method
    params['use_tpu'] = FLAGS.use_tpu

    dataset_func = functools.partial(
        imagenet_input.ImageNetInput,
        data_dir=FLAGS.data_directory,
        transpose_input=False,
        num_parallel_calls=FLAGS.num_parallel_calls,
        use_bfloat16=False)
    imagenet_train, imagenet_eval = [
        dataset_func(is_training=is_training) for is_training in [True, False]
    ]

    run_config = tpu_config.RunConfig(
        master=FLAGS.master,
        model_dir=output_dir,
        save_checkpoints_steps=FLAGS.steps_per_checkpoint,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_cores,
            tpu_job_name=FLAGS.tpu_job_name))

    classifier = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn_w_pruning,
        params=params,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    cpu_classifier = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn_w_pruning,
        params=params,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        export_to_tpu=False,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.num_eval_images % FLAGS.eval_batch_size != 0:
        raise ValueError(
            'eval_batch_size (%d) must evenly divide num_eval_images (%d)!' %
            (FLAGS.eval_batch_size, FLAGS.num_eval_images))

    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    if FLAGS.mode == 'eval_once':
        ckpt_path = os.path.join(output_dir, FLAGS.eval_once_ckpt_prefix)
        dataset = imagenet_train if FLAGS.eval_on_train else imagenet_eval
        classifier.evaluate(input_fn=dataset.input_fn,
                            steps=eval_steps,
                            checkpoint_path=ckpt_path,
                            name='{0}'.format(FLAGS.eval_once_ckpt_prefix))
    elif FLAGS.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(output_dir):
            tf.logging.info('Starting to evaluate.')
            try:
                dataset = imagenet_train if FLAGS.eval_on_train else imagenet_eval
                classifier.evaluate(input_fn=dataset.input_fn,
                                    steps=eval_steps,
                                    checkpoint_path=ckpt,
                                    name='eval')
                # Terminate eval job when final checkpoint is reached
                global_step = int(os.path.basename(ckpt).split('-')[1])
                if global_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        global_step)
                    break

            except tf.errors.NotFoundError:
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint.', ckpt)

    else:
        global_step = estimator._load_global_step_from_checkpoint_dir(
            output_dir)
        # Session run hooks to export model for prediction
        export_hook = ExportModelHook(cpu_classifier, export_dir)
        hooks = [export_hook]

        if FLAGS.mode == 'train':
            tf.logging.info('start training...')
            classifier.train(input_fn=imagenet_train.input_fn,
                             hooks=hooks,
                             max_steps=FLAGS.train_steps)
        else:
            assert FLAGS.mode == 'train_and_eval'
            tf.logging.info('start training and eval...')
            while global_step < FLAGS.train_steps:
                next_checkpoint = min(global_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                classifier.train(input_fn=imagenet_train.input_fn,
                                 max_steps=next_checkpoint)
                global_step = next_checkpoint
                tf.logging.info('Completed training up to step: %d',
                                global_step)
                classifier.evaluate(input_fn=imagenet_eval.input_fn,
                                    steps=eval_steps)
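
Like the snippet above, most of these samples require eval_batch_size to divide num_eval_images exactly so that eval_steps covers the full evaluation set. A tiny helper capturing that check (the names are illustrative, not part of the sample):

def compute_eval_steps(num_eval_images, eval_batch_size):
    """Return the number of eval steps, insisting on an exact division."""
    if num_eval_images % eval_batch_size != 0:
        raise ValueError(
            'eval_batch_size (%d) must evenly divide num_eval_images (%d)!' %
            (eval_batch_size, num_eval_images))
    return num_eval_images // eval_batch_size


assert compute_eval_steps(50000, 1000) == 50
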
Ejemplo n.º 35
0
def main(unused_argv):
  tf.flags.mark_flag_as_required('model_dir')
  tf.flags.mark_flag_as_required('pipeline_config_path')

  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError('You must specify either --master or --tpu_name.')

  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn('Both --master and --tpu_name are set. Ignoring '
                      '--tpu_name and using --master.')
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.python.training.TPUClusterResolver(
            tpu_names=[FLAGS.tpu_name],
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()

  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards))
  params = {}
  (estimator, train_input_fn, eval_validation_input_fn, eval_training_input_fn,
   train_steps, eval_steps) = (
       create_estimator(
           config,
           model_hparams.create_hparams(
               hparams_overrides=FLAGS.hparams_overrides),
           FLAGS.pipeline_config_path,
           train_steps=FLAGS.num_train_steps,
           eval_steps=FLAGS.num_eval_steps,
           train_batch_size=FLAGS.train_batch_size,
           use_tpu=FLAGS.use_tpu,
           num_shards=FLAGS.num_shards,
           params=params))

  if FLAGS.mode in ['train', 'train_and_eval']:
    estimator.train(input_fn=train_input_fn, max_steps=train_steps)

  if FLAGS.mode == 'train_and_eval':
    # Eval one time.
    eval_results = estimator.evaluate(
        input_fn=eval_validation_input_fn, steps=eval_steps)
    tf.logging.info('Eval results: %s' % eval_results)

  # Continuously evaluating.
  if FLAGS.mode == 'eval':
    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout_secs)
      return True

    # Run evaluation when there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval_secs,
        timeout=FLAGS.eval_timeout_secs,
        timeout_fn=terminate_eval):

      tf.logging.info('Starting to evaluate.')
      if FLAGS.eval_training_data:
        name = 'training_data'
        input_fn = eval_training_input_fn
      else:
        name = 'validation_data'
        input_fn = eval_validation_input_fn
      try:
        eval_results = estimator.evaluate(
            input_fn=input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt,
            name=name)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break

      except tf.errors.NotFoundError:
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
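
The continuous-eval loop above combines min_interval_secs, a timeout, and a timeout_fn that decides whether to stop waiting for new checkpoints. A simplified stand-in showing how those three knobs interact (a sketch, not the TensorFlow implementation):

import time


def wait_for_new_checkpoint(latest_fn, last_seen, min_interval_secs, timeout,
                            timeout_fn=None):
    """Poll latest_fn() until a new checkpoint appears or timeout_fn stops us."""
    deadline = time.time() + timeout
    while True:
        candidate = latest_fn()
        if candidate is not None and candidate != last_seen:
            return candidate
        if time.time() >= deadline:
            # Give up if there is no timeout_fn, or if it asks us to stop.
            if timeout_fn is None or timeout_fn():
                return None
            deadline = time.time() + timeout
        time.sleep(min_interval_secs)


# Stopping after the first timeout, as terminate_eval() does above:
assert wait_for_new_checkpoint(lambda: None, None, 0, 0, lambda: True) is None
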
Ejemplo n.º 36
0
def main(unused_argv):
    if FLAGS.use_tpu:
        if FLAGS.master is None and FLAGS.tpu_name is None:
            raise RuntimeError(
                "You must specify either --master or --tpu_name.")

        if FLAGS.master is not None:
            if FLAGS.tpu_name is not None:
                tf.logging.warn(
                    "Both --master and --tpu_name are set. Ignoring "
                    "--tpu_name and using --master.")
            tpu_grpc_url = FLAGS.master
        else:
            tpu_cluster_resolver = (
                tf.contrib.cluster_resolver.TPUClusterResolver(
                    FLAGS.tpu_name,
                    zone=FLAGS.tpu_zone,
                    project=FLAGS.gcp_project))
            tpu_grpc_url = tpu_cluster_resolver.get_master()
    else:
        # URL is unused if running locally without TPU
        tpu_grpc_url = None

    batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
    steps_per_checkpoint = FLAGS.steps_per_checkpoint
    iterations_per_loop = FLAGS.iterations_per_loop
    eval_steps = _NUM_EVAL_IMAGES // FLAGS.eval_batch_size
    if iterations_per_loop is None or steps_per_checkpoint < iterations_per_loop:
        iterations_per_loop = steps_per_checkpoint
    if FLAGS.mode == "eval":
        iterations_per_loop = eval_steps
    params = {
        "batches_per_epoch": batches_per_epoch,
    }

    config = tpu_config.RunConfig(master=tpu_grpc_url,
                                  evaluation_master=tpu_grpc_url,
                                  model_dir=FLAGS.model_dir,
                                  save_checkpoints_steps=steps_per_checkpoint,
                                  log_step_count_steps=iterations_per_loop,
                                  tpu_config=tpu_config.TPUConfig(
                                      iterations_per_loop=iterations_per_loop,
                                      num_shards=FLAGS.num_shards))

    densenet_estimator = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        params=params)

    if FLAGS.mode == "train":
        tf.logging.info(
            "Training for %d steps (%.2f epochs in total)." %
            (FLAGS.train_steps, FLAGS.train_steps / batches_per_epoch))
        densenet_estimator.train(input_fn=ImageNetInput(True),
                                 max_steps=FLAGS.train_steps)

    elif FLAGS.mode == "train_and_eval":
        current_step = 0
        tf.logging.info(
            "Training for %d steps (%.2f epochs in total). Current "
            "step %d" % (FLAGS.train_steps,
                         FLAGS.train_steps / batches_per_epoch, current_step))
        while current_step < FLAGS.train_steps:
            next_checkpoint = min(current_step + steps_per_checkpoint,
                                  FLAGS.train_steps)
            num_steps = next_checkpoint - current_step
            current_step = next_checkpoint
            densenet_estimator.train(input_fn=ImageNetInput(True),
                                     steps=num_steps)

            tf.logging.info("Starting to evaluate.")
            eval_results = densenet_estimator.evaluate(
                input_fn=ImageNetInput(False),
                steps=_NUM_EVAL_IMAGES // FLAGS.eval_batch_size)
            tf.logging.info("Eval results: %s" % eval_results)

    else:

        def terminate_eval():
            tf.logging.info(
                "Terminating eval after %d seconds of no checkpoints" %
                FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        # If the evaluation worker is delayed in processing a new checkpoint,
        # the checkpoint file may be deleted by the trainer before it can be
        # evaluated.
        # Ignore the error in this case.
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            tf.logging.info("Starting to evaluate.")
            try:
                eval_results = densenet_estimator.evaluate(
                    input_fn=ImageNetInput(False),
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                tf.logging.info("Eval results: %s" % eval_results)
            except tf.errors.NotFoundError:
                tf.logging.info(
                    "Checkpoint %s no longer exists, skipping checkpoint",
                    ckpt)
Ejemplo n.º 37
0
def main(unused_argv):
    params = hyperparameters.get_hyperparameters(FLAGS.default_hparams_file,
                                                 FLAGS.hparams_file, FLAGS,
                                                 FLAGS.hparams)
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu if (FLAGS.tpu or params['use_tpu']) else '',
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)

    if params['use_async_checkpointing']:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(2500, params['iterations_per_loop'])
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=get_model_dir(params),
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=None,  # Keep all checkpoints.
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=params['iterations_per_loop'],
            num_shards=params['num_cores'],
            # copybara:strip_begin
            tpu_job_name=FLAGS.tpu_job_name,
            # copybara:strip_end
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    resnet_classifier = tf.contrib.tpu.TPUEstimator(
        use_tpu=params['use_tpu'],
        model_fn=resnet_model_fn,
        config=config,
        params=params,
        train_batch_size=params['train_batch_size'],
        eval_batch_size=params['eval_batch_size'],
        export_to_tpu=FLAGS.export_to_tpu)

    # copybara:strip_begin
    if FLAGS.xla_compile:
        resnet_classifier = tf.contrib.tpu.TPUEstimator(
            use_tpu=params['use_tpu'],
            model_fn=xla.estimator_model_fn(resnet_model_fn),
            config=config,
            params=params,
            train_batch_size=params['train_batch_size'],
            eval_batch_size=params['eval_batch_size'],
            export_to_tpu=FLAGS.export_to_tpu)
    # copybara:strip_end
    assert (params['precision'] == 'bfloat16' or params['precision']
            == 'float32'), ('Invalid value for precision parameter; '
                            'must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', params['precision'])
    use_bfloat16 = params['precision'] == 'bfloat16'

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s',
                        FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train = imagenet_input.ImageNetBigtableInput(
            is_training=True,
            use_bfloat16=use_bfloat16,
            transpose_input=params['transpose_input'],
            selection=select_train)
        imagenet_eval = imagenet_input.ImageNetBigtableInput(
            is_training=False,
            use_bfloat16=use_bfloat16,
            transpose_input=params['transpose_input'],
            selection=select_eval)
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=params['transpose_input'],
                cache=params['use_cache'] and is_training,
                image_size=params['image_size'],
                num_parallel_calls=params['num_parallel_calls'],
                use_bfloat16=use_bfloat16) for is_training in [True, False]
        ]

    steps_per_epoch = params['num_train_images'] // params['train_batch_size']
    eval_steps = params['num_eval_images'] // params['eval_batch_size']

    if FLAGS.mode == 'eval':

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                get_model_dir(params), timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time(
                )  # This time will include compilation time
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params['train_steps']:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'eval_igt':
        # IGT evaluation mode. Evaluate metrics for the desired parameters
        # (true or shifted) on the desired dataset (train or eval). Note that
        # train is still with data augmentation.

        # Get checkpoint file names.
        index_files = tf.gfile.Glob(
            os.path.join(get_model_dir(params), 'model.ckpt-*.index'))
        checkpoints = [fn[:-len('.index')] for fn in index_files]
        # Need to sort them to get proper tensorboard plotting (increasing event
        # timestamps correspond to increasing steps).
        checkpoint_steps = []
        for ckpt in checkpoints:
            tf.logging.info(ckpt)
            step_match = re.match(r'.*model.ckpt-([0-9]*)', ckpt)
            checkpoint_steps.append(int(step_match.group(1)))
        checkpoints = [
            ckpt for _, ckpt in sorted(zip(checkpoint_steps, checkpoints))
        ]
        tf.logging.info('There are {} checkpoints'.format(len(checkpoints)))
        tf.logging.info(', '.join(checkpoints))

        # Keep track of the last processed checkpoint (fault tolerance).
        analysis_state_path = os.path.join(
            get_model_dir(params),
            'analysis_state_' + FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode)
        next_analysis_index = 0
        if tf.gfile.Exists(analysis_state_path):
            with tf.gfile.Open(analysis_state_path) as fd:
                next_analysis_index = int(fd.read())

        # Process each checkpoint.
        while next_analysis_index < len(checkpoints):
            tf.logging.info(
                'Next analysis index: {}'.format(next_analysis_index))
            ckpt_path = checkpoints[next_analysis_index]
            tf.logging.info('Starting to evaluate: {}.'.format(ckpt_path))
            start_timestamp = time.time(
            )  # This time will include compilation time

            if FLAGS.igt_eval_set == 'train':
                the_input_fn = imagenet_train.input_fn
                the_steps = steps_per_epoch
            elif FLAGS.igt_eval_set == 'eval':
                the_input_fn = imagenet_eval.input_fn
                the_steps = eval_steps
            else:
                raise ValueError('Unsupported igt_eval_set')

            eval_results = resnet_classifier.evaluate(
                input_fn=the_input_fn,
                steps=the_steps,
                checkpoint_path=ckpt_path,
                name=FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode)
            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                            eval_results, elapsed_time)

            next_analysis_index += 1
            file_io.atomic_write_string_to_file(analysis_state_path,
                                                str(next_analysis_index))

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            get_model_dir(params))  # pylint:disable=protected-access,g-line-too-long
        steps_per_epoch = params['num_train_images'] // params[
            'train_batch_size']
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', params['train_steps'],
            params['train_steps'] / steps_per_epoch, current_step)

        start_timestamp = time.time(
        )  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if params['use_async_checkpointing']:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=get_model_dir(params),
                        save_steps=max(2500, params['iterations_per_loop'])))
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=params['train_steps'],
                                    hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < params['train_steps']:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      params['train_steps'])
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=params['num_eval_images'] //
                    params['eval_batch_size'])
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                params['train_steps'], elapsed_time)

        if FLAGS.export_dir is not None:
            # The guide to serve an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            unused_export_path = resnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
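
The eval_igt branch above sorts checkpoints by the step encoded in their name and resumes from a persisted analysis index. A self-contained sketch of that ordering and resume logic, using plain files instead of tf.gfile (illustrative only):

import os
import re


def sort_checkpoints_by_step(checkpoint_prefixes):
    """Order checkpoint prefixes by the step encoded in 'model.ckpt-<step>'."""
    def step_of(prefix):
        return int(re.match(r'.*model\.ckpt-([0-9]+)', prefix).group(1))
    return sorted(checkpoint_prefixes, key=step_of)


def read_resume_index(state_path):
    """Return the last persisted analysis index, or 0 if no state file exists."""
    if os.path.exists(state_path):
        with open(state_path) as fd:
            return int(fd.read())
    return 0


assert sort_checkpoints_by_step(
    ['d/model.ckpt-100', 'd/model.ckpt-20']) == ['d/model.ckpt-20', 'd/model.ckpt-100']
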
Ejemplo n.º 38
0
def main(unused_argv):

    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        if FLAGS.model_name.startswith('efficientnet-edgetpu'):
            _, _, input_image_size, _ = efficientnet_edgetpu_builder.efficientnet_edgetpu_params(
                FLAGS.model_name)
        elif FLAGS.model_name.startswith('efficientnet-tpu'):
            _, _, input_image_size, _ = efficientnet_tpu_builder.efficientnet_tpu_params(
                FLAGS.model_name)
        elif FLAGS.model_name.startswith('efficientnet'):
            _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
                FLAGS.model_name)
        else:
            raise ValueError(
                'input_image_size must be set except for EfficientNet')

    # For the ImageNet dataset, include a background label if the number of
    # output classes is 1001.
    include_background_label = (FLAGS.num_label_classes == 1001)

    if FLAGS.tpu or FLAGS.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    if FLAGS.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long
    # Initializes model parameters.
    params = dict(steps_per_epoch=FLAGS.num_train_images /
                  FLAGS.train_batch_size,
                  use_bfloat16=FLAGS.use_bfloat16)
    est = tf.contrib.tpu.TPUEstimator(use_tpu=FLAGS.use_tpu,
                                      model_fn=model_fn,
                                      config=config,
                                      train_batch_size=FLAGS.train_batch_size,
                                      eval_batch_size=FLAGS.eval_batch_size,
                                      export_to_tpu=FLAGS.export_to_tpu,
                                      params=params)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    def build_imagenet_input(is_training):
        """Generate ImageNetInput for training and eval."""
        if FLAGS.bigtable_instance:
            tf.logging.info('Using Bigtable dataset, table %s',
                            FLAGS.bigtable_table)
            select_train, select_eval = _select_tables_from_flags()
            return imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=FLAGS.use_bfloat16,
                transpose_input=FLAGS.transpose_input,
                selection=select_train if is_training else select_eval,
                include_background_label=include_background_label,
                autoaugment_name=FLAGS.autoaugment_name)
        else:
            if FLAGS.data_dir == FAKE_DATA_DIR:
                tf.logging.info('Using fake dataset.')
            else:
                tf.logging.info('Using dataset: %s', FLAGS.data_dir)

            return imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=FLAGS.transpose_input,
                cache=FLAGS.use_cache and is_training,
                image_size=input_image_size,
                num_parallel_calls=FLAGS.num_parallel_calls,
                use_bfloat16=FLAGS.use_bfloat16,
                include_background_label=include_background_label,
                autoaugment_name=FLAGS.autoaugment_name)

    imagenet_train = build_imagenet_input(is_training=True)
    imagenet_eval = build_imagenet_input(is_training=False)

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time(
                )  # This time will include compilation time
                eval_results = est.evaluate(input_fn=imagenet_eval.input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        start_timestamp = time.time(
        )  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            est.train(input_fn=imagenet_train.input_fn,
                      max_steps=FLAGS.train_steps,
                      hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                est.train(input_fn=imagenet_train.input_fn,
                          max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = est.evaluate(input_fn=imagenet_eval.input_fn,
                                            steps=FLAGS.num_eval_images //
                                            FLAGS.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)
    if FLAGS.export_dir:
        export(est, FLAGS.export_dir, input_image_size)
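
The eval loop above calls utils.archive_ckpt to keep the checkpoint with the best top-1 accuracy. As a rough illustration of the idea (a hypothetical helper, not the project's utils API), one can track the best metric seen so far and only record checkpoints that improve on it:

class BestCheckpointTracker(object):
    """Remember the checkpoint with the highest metric seen so far (sketch)."""

    def __init__(self):
        self.best_metric = float('-inf')
        self.best_ckpt = None

    def maybe_update(self, metric, ckpt_path):
        """Record ckpt_path if metric improves on the best so far."""
        if metric > self.best_metric:
            self.best_metric = metric
            self.best_ckpt = ckpt_path
            return True
        return False


tracker = BestCheckpointTracker()
assert tracker.maybe_update(0.75, 'model.ckpt-1000')
assert not tracker.maybe_update(0.74, 'model.ckpt-2000')
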
Ejemplo n.º 39
0
def main(unused_argv):
  # [START tpu-cluster-resolver]
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu,
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)

  config = tpu_config.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=max(600, FLAGS.iterations_per_loop),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_cores,
          per_host_input_for_training=tpu_config.InputPipelineConfig.PER_HOST_V2))  # pylint: disable=line-too-long
  # [END tpu-cluster-resolver]

  resnet_classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=resnet_model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', (
      'Invalid value for --precision flag; must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', FLAGS.precision)
  use_bfloat16 = FLAGS.precision == 'bfloat16'

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train, imagenet_eval = [imagenet_input.ImageNetInput(
      is_training=is_training,
      data_dir=FLAGS.data_dir,
      transpose_input=FLAGS.transpose_input,
      use_bfloat16=use_bfloat16) for is_training in [True, False]]

  if FLAGS.mode == 'eval':
    eval_steps = NUM_EVAL_IMAGES // FLAGS.eval_batch_size

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d' %
                        (eval_results, elapsed_time))

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

  else:   # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    batches_per_epoch = NUM_TRAIN_IMAGES / FLAGS.train_batch_size
    tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                    ' step %d.' % (FLAGS.train_steps,
                                   FLAGS.train_steps / batches_per_epoch,
                                   current_step))

    start_timestamp = time.time()  # This time will include compilation time
    if FLAGS.mode == 'train':
      resnet_classifier.train(
          input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps)

    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be consistently excluded modulo the batch size.
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=NUM_EVAL_IMAGES // FLAGS.eval_batch_size)
        tf.logging.info('Eval results: %s' % eval_results)

    elapsed_time = int(time.time() - start_timestamp)
    tf.logging.info('Finished training up to step %d. Elapsed seconds %d.' %
                    (FLAGS.train_steps, elapsed_time))

    if FLAGS.export_dir is not None:
      # The guide to serve an exported TensorFlow model is at:
      #    https://www.tensorflow.org/serving/serving_basic
      tf.logging.info('Starting to export model.')
      resnet_classifier.export_savedmodel(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
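
Several of these samples report progress as '%d steps (%.2f epochs in total)'; the conversion only needs the dataset size and the global batch size. A short sketch, with ImageNet-style numbers used purely as an example:

def steps_to_epochs(train_steps, num_train_images, train_batch_size):
    """Convert a global-step budget into epochs over the training set."""
    batches_per_epoch = num_train_images / train_batch_size
    return train_steps / batches_per_epoch


# With 1,281,167 training images and a global batch of 1024, 112,590 steps
# is roughly 90 epochs.
print('%.2f epochs' % steps_to_epochs(112590, 1281167, 1024))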