def train_fn(hparams, num_workers):
    """Copy of train function from estimator.py."""
    # TODO: Merge improvements into the original.
    # pylint: disable=protected-access
    hparams.tgt_sos_id, hparams.tgt_eos_id = nmt_estimator._get_tgt_sos_eos_id(
        hparams)
    model_fn = nmt_estimator.make_model_fn(hparams)

    def print_log():
        mlperf_log.gnmt_print(key=mlperf_log.TRAIN_LOOP)
        mlperf_log.gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=0)
        mlperf_log.gnmt_print(key=mlperf_log.INPUT_SIZE,
                              value=hparams.num_examples_per_epoch)

    if hparams.use_tpu_low_level_api:
        runner = create_train_runner(hparams, num_workers)
        mlperf_log.gnmt_print(key=mlperf_log.RUN_START)
        input_fn = DistributedPipeline(hparams, num_workers)
        runner.initialize(input_fn, {})
        runner.build_model(model_fn, {})
        print_log()
        runner.train(0, hparams.num_train_steps)
        return 0.0

    # cluster = tf.contrib.cluster_resolver.TPUClusterResolver(hparams.tpu_name)
    # cluster_spec = cluster.cluster_spec()
    # print('cluster_spec: %s' % cluster_spec)
    # num_workers = cluster_spec.num_tasks('tpu_worker')
    # print('num_workers: %s' % num_workers)

    pipeline = DistributedPipeline(hparams, num_workers)
    print_log()

    if hparams.use_tpu:
        run_config = nmt_estimator._get_tpu_run_config(hparams, True)
        estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=model_fn,
            config=run_config,
            use_tpu=hparams.use_tpu,
            train_batch_size=hparams.batch_size,
            eval_batch_size=hparams.batch_size,
            predict_batch_size=hparams.infer_batch_size)
    else:
        raise ValueError("Distributed input pipeline only supported on TPUs.")

    hooks = [pipeline]
    if hparams.use_async_checkpoint:
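        # save_steps works out to num_examples_per_epoch / batch_size, i.e.
        # roughly one checkpoint per training epoch.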
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=hparams.out_dir,
                save_steps=int(hparams.num_examples_per_epoch /
                               hparams.batch_size)))

    estimator.train(input_fn=pipeline,
                    max_steps=hparams.num_train_steps,
                    hooks=hooks)
    # Return value is not used
    return 0.0

# Example 2

def train_fn(hparams):
  """Train function."""
  hparams.tgt_sos_id, hparams.tgt_eos_id = _get_tgt_sos_eos_id(hparams)
  model_fn = make_model_fn(hparams)

  def print_log():
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_LOOP)
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=0)
    mlperf_log.gnmt_print(key=mlperf_log.INPUT_SIZE,
                          value=hparams.num_examples_per_epoch)

  if hparams.use_tpu_low_level_api:
    runner = create_train_runner_and_build_graph(hparams, model_fn)
    print_log()
    runner.train(0, hparams.num_train_steps)
    return 0.0

  input_fn = make_input_fn(hparams, tf.contrib.learn.ModeKeys.TRAIN)
  print_log()
  if hparams.use_tpu:
    run_config = _get_tpu_run_config(hparams, True)
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        config=run_config,
        use_tpu=hparams.use_tpu,
        train_batch_size=hparams.batch_size,
        eval_batch_size=hparams.batch_size,
        predict_batch_size=hparams.infer_batch_size)
  else:
    distribution_strategy = get_distribution_strategy(hparams.num_gpus)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=hparams.out_dir,
        config=tf.estimator.RunConfig(train_distribute=distribution_strategy))

  hooks = []
  if hparams.use_async_checkpoint:
    hooks.append(
        async_checkpoint.AsyncCheckpointSaverHook(
            checkpoint_dir=hparams.out_dir,
            save_steps=int(
                hparams.num_examples_per_epoch / hparams.batch_size)))

  estimator.train(
      input_fn=input_fn, max_steps=hparams.num_train_steps, hooks=hooks)
  # Return value is not used
  return 0.0
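
# A minimal sketch (not the original helper) of what `get_distribution_strategy`,
# used in the GPU branch above, might look like with the TF 1.x contrib
# distribution-strategy API; the real signature and behavior may differ.
def get_distribution_strategy_sketch(num_gpus):
  """Returns a strategy that mirrors the model across local GPUs, if any."""
  if num_gpus <= 1:
    # Single device (or CPU only): let the Estimator run without replication.
    return None
  # Replicate the model across all local GPUs.
  devices = ['/device:GPU:%d' % i for i in range(num_gpus)]
  return tf.contrib.distribute.MirroredStrategy(devices=devices)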

# Example 3

def main(argv):
    del argv  # Unused.
    global SUCCESS
    if FLAGS.model_dir:
        print(FLAGS.model_dir)
    else:
        print(FLAGS.training_file_pattern)
        raise Exception('No model dir')
    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

    run_config, params = construct_run_config(FLAGS.iterations_per_loop)

    if FLAGS.mode not in ('eval', 'eval_once'):
        if params['train_with_low_level_api']:
            params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
            trunner = train_low_level_runner.TrainLowLevelRunner(
                iterations=FLAGS.iterations_per_loop)
            input_fn = dataloader.SSDInputReader(
                FLAGS.training_file_pattern,
                params['transpose_input'],
                is_training=True,
                use_fake_data=FLAGS.use_fake_data)
            mlperf_log.ssd_print(key=mlperf_log.RUN_START)
            trunner.initialize(input_fn, ssd_model.ssd_model_fn, params)
        else:
            mlperf_log.ssd_print(key=mlperf_log.RUN_START)

    if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
        if params['eval_with_low_level_api']:
            params['batch_size'] = FLAGS.eval_batch_size // FLAGS.num_shards
            erunner = eval_low_level_runner.EvalLowLevelRunner(
                eval_steps=int(FLAGS.eval_samples / FLAGS.eval_batch_size))
            input_fn = dataloader.SSDInputReader(
                FLAGS.validation_file_pattern,
                is_training=False,
                use_fake_data=FLAGS.use_fake_data)
            erunner.initialize(input_fn, params)
            erunner.build_model(ssd_model.ssd_model_fn, params)

    # TPU Estimator
    if FLAGS.mode == 'train':
        if params['train_with_low_level_api']:
            train_steps = int(
                (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                FLAGS.train_batch_size)
            mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
            mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=0)
            trunner.train(train_steps)
            trunner.shutdown()
        else:
            if FLAGS.device == 'gpu':
                params['dataset_num_shards'] = 1
                params['dataset_index'] = 0
                train_params = dict(params)
                train_params['batch_size'] = FLAGS.train_batch_size
                train_estimator = tf.estimator.Estimator(
                    model_fn=ssd_model.ssd_model_fn,
                    model_dir=FLAGS.model_dir,
                    config=run_config,
                    params=train_params)
            else:
                train_estimator = tpu_estimator.TPUEstimator(
                    model_fn=ssd_model.ssd_model_fn,
                    use_tpu=FLAGS.use_tpu,
                    train_batch_size=FLAGS.train_batch_size,
                    config=run_config,
                    params=params)

            tf.logging.info(params)
            mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
            mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=0)
            hooks = []

            if FLAGS.use_async_checkpoint:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            train_estimator.train(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.training_file_pattern,
                    params['transpose_input'],
                    is_training=True,
                    use_fake_data=FLAGS.use_fake_data),
                steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                          FLAGS.train_batch_size),
                hooks=hooks)

        if FLAGS.eval_after_training:
            eval_estimator = tpu_estimator.TPUEstimator(
                model_fn=ssd_model.ssd_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=params)

            predictions = list(
                eval_estimator.predict(input_fn=dataloader.SSDInputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    use_fake_data=FLAGS.use_fake_data)))

            eval_results = coco_metric.compute_map(predictions,
                                                   FLAGS.val_json_file)

            tf.logging.info('Eval results: %s' % eval_results)

    elif FLAGS.mode == 'train_and_eval':
        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        current_step = 0
        mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
        threads = []
        for eval_step in ssd_constants.EVAL_STEPS:
            # Compute the actual eval steps based on the actual train_batch_size.
            steps = int(eval_step * ssd_constants.DEFAULT_BATCH_SIZE /
                        FLAGS.train_batch_size)
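            # For example, assuming DEFAULT_BATCH_SIZE were 32 and
            # train_batch_size were 256, an EVAL_STEPS entry of 40000 would
            # shrink to 40000 * 32 / 256 = 5000 actual training steps.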
            current_epoch = current_step // params['steps_per_epoch']
            # TODO(wangtao): figure out how to log for each epoch.
            mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH,
                                 value=current_epoch)

            tf.logging.info('Starting training cycle for %d steps.' % steps)
            if params['train_with_low_level_api']:
                trunner.train(steps)
            else:
                run_config, params = construct_run_config(steps)
                if FLAGS.device == 'gpu':
                    train_params = dict(params)
                    train_params['batch_size'] = FLAGS.train_batch_size
                    train_estimator = tf.estimator.Estimator(
                        model_fn=ssd_model.ssd_model_fn,
                        model_dir=FLAGS.model_dir,
                        config=run_config,
                        params=train_params)
                else:
                    train_estimator = tpu_estimator.TPUEstimator(
                        model_fn=ssd_model.ssd_model_fn,
                        use_tpu=FLAGS.use_tpu,
                        train_batch_size=FLAGS.train_batch_size,
                        config=run_config,
                        params=params)

                tf.logging.info(params)
                train_estimator.train(
                    input_fn=dataloader.SSDInputReader(
                        FLAGS.training_file_pattern,
                        params['transpose_input'],
                        is_training=True,
                        use_fake_data=FLAGS.use_fake_data),
                    steps=steps)

            if SUCCESS:
                break

            current_step = current_step + steps
            current_epoch = current_step // params['steps_per_epoch']
            tf.logging.info('Starting evaluation cycle at step %d.' %
                            current_step)
            mlperf_log.ssd_print(key=mlperf_log.EVAL_START,
                                 value=current_epoch)
            # Run evaluation at the given step.
            if params['eval_with_low_level_api']:
                predictions = list(erunner.predict())
            else:
                if FLAGS.device == 'gpu':
                    eval_params = dict(params)
                    eval_params['batch_size'] = FLAGS.eval_batch_size
                    eval_estimator = tf.estimator.Estimator(
                        model_fn=ssd_model.ssd_model_fn,
                        model_dir=FLAGS.model_dir,
                        config=run_config,
                        params=eval_params)
                else:
                    eval_estimator = tpu_estimator.TPUEstimator(
                        model_fn=ssd_model.ssd_model_fn,
                        use_tpu=FLAGS.use_tpu,
                        train_batch_size=FLAGS.train_batch_size,
                        predict_batch_size=FLAGS.eval_batch_size,
                        config=run_config,
                        params=params)

                predictions = list(
                    eval_estimator.predict(input_fn=dataloader.SSDInputReader(
                        FLAGS.validation_file_pattern,
                        is_training=False,
                        use_fake_data=FLAGS.use_fake_data)))

            t = threading.Thread(target=coco_eval,
                                 args=(predictions, current_epoch,
                                       current_step, summary_writer))
            threads.append(t)
            t.start()

        if params['train_with_low_level_api']:
            trunner.shutdown()

        for t in threads:
            t.join()

        # 'success' is logged as a string because a boolean is not JSON serializable here.
        if not SUCCESS:
            mlperf_log.ssd_print(key=mlperf_log.RUN_STOP,
                                 value={'success': 'false'})
            mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
        summary_writer.close()

    elif FLAGS.mode == 'eval':
        if not params['eval_with_low_level_api']:
            if FLAGS.device == 'gpu':
                eval_params = dict(params)
                eval_params['batch_size'] = FLAGS.eval_batch_size
                eval_estimator = tf.estimator.Estimator(
                    model_fn=ssd_model.ssd_model_fn,
                    model_dir=FLAGS.model_dir,
                    config=run_config,
                    params=eval_params)
            else:
                eval_estimator = tpu_estimator.TPUEstimator(
                    model_fn=ssd_model.ssd_model_fn,
                    use_tpu=FLAGS.use_tpu,
                    train_batch_size=FLAGS.train_batch_size,
                    predict_batch_size=FLAGS.eval_batch_size,
                    config=run_config,
                    params=params)

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        eval_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
        eval_epochs = [
            steps * ssd_constants.DEFAULT_BATCH_SIZE /
            FLAGS.train_batch_size // params['steps_per_epoch']
            for steps in eval_steps
        ]
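        # Each cumulative EVAL_STEPS milestone (defined at DEFAULT_BATCH_SIZE)
        # is rescaled to the actual train_batch_size and then converted to an
        # epoch index so it can be matched against checkpoint epochs below.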

        # For 8x8 slices and above.
        if FLAGS.train_batch_size >= 4096:
            eval_epochs = [i * 2 for i in eval_epochs]

        tf.logging.info('Eval epochs: %s' % eval_epochs)
        # Run evaluation when there's a new checkpoint
        threads = []
        count = 1
        for ckpt in next_checkpoint(FLAGS.model_dir):
            print("current count is {}\n".format(count))
            count += 1
            if SUCCESS:
                break
            current_step = int(os.path.basename(ckpt).split('-')[1])
            current_epoch = current_step // params['steps_per_epoch']
            tf.logging.info('current step: %s' % current_step)
            tf.logging.info('current epoch: %s' % current_epoch)
            if (not params['eval_every_checkpoint']
                    and current_epoch not in eval_epochs):
                continue

            tf.logging.info('Starting to evaluate.')
            try:
                mlperf_log.ssd_print(key=mlperf_log.EVAL_START,
                                     value=current_epoch)

                if params['eval_with_low_level_api']:
                    predictions = list(erunner.predict(checkpoint_path=ckpt))
                else:
                    predictions = list(
                        eval_estimator.predict(
                            checkpoint_path=ckpt,
                            input_fn=dataloader.SSDInputReader(
                                FLAGS.validation_file_pattern,
                                is_training=False,
                                use_fake_data=FLAGS.use_fake_data)))

                t = threading.Thread(target=coco_eval,
                                     args=(predictions, current_epoch,
                                           current_step, summary_writer))
                threads.append(t)
                t.start()

                # Terminate eval job when final checkpoint is reached
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case,
                # the checkpoint file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

        for t in threads:
            t.join()

        if not SUCCESS:
            mlperf_log.ssd_print(key=mlperf_log.RUN_STOP,
                                 value={'success': 'false'})
            mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
        summary_writer.close()
    elif FLAGS.mode == 'eval_once':
        if not params['eval_with_low_level_api']:
            eval_estimator = tpu_estimator.TPUEstimator(
                model_fn=ssd_model.ssd_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=params)

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        # Run evaluation when there's a new checkpoint
        for ckpt in next_checkpoint(FLAGS.model_dir):
            current_step = int(os.path.basename(ckpt).split('-')[1])
            current_epoch = current_step // params['steps_per_epoch']
            print('current epoch: %s' % current_epoch)
            if FLAGS.eval_epoch < current_epoch:
                break
            if FLAGS.eval_epoch > current_epoch:
                continue

            tf.logging.info('Starting to evaluate.')
            try:
                mlperf_log.ssd_print(key=mlperf_log.EVAL_START,
                                     value=current_epoch)

                if params['eval_with_low_level_api']:
                    predictions = list(erunner.predict(checkpoint_path=ckpt))
                else:
                    predictions = list(
                        eval_estimator.predict(
                            checkpoint_path=ckpt,
                            input_fn=dataloader.SSDInputReader(
                                FLAGS.validation_file_pattern,
                                is_training=False,
                                use_fake_data=FLAGS.use_fake_data)))

                coco_eval(predictions, current_epoch, current_step,
                          summary_writer)

                # Terminate eval job when final checkpoint is reached
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    if not SUCCESS:
                        mlperf_log.ssd_print(key=mlperf_log.RUN_STOP,
                                             value={'success': 'false'})
                        mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
                    print('Evaluation finished after training step %d' %
                          current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case,
                # the checkpoint file could have been deleted already.
                print('Checkpoint %s no longer exists, skipping checkpoint' %
                      ckpt)

        print('Eval for epoch %d ending' % FLAGS.eval_epoch)
        summary_writer.close()
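
# Example 4
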
def main(unused_argv):
  steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
  tpu = FLAGS.tpu or FLAGS.master
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      tpu if (tpu or FLAGS.use_tpu) else '',
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)

  if FLAGS.use_train_runner:
    if FLAGS.mode == 'in_memory_eval':
      low_level_runner = train_and_eval_runner.TrainAndEvalRunner(
          iterations=FLAGS.iterations_per_loop,
          train_steps=FLAGS.train_steps,
          eval_steps=int(
              math.ceil(FLAGS.num_eval_images / FLAGS.eval_batch_size)))
    else:
      trunner = train_runner.TrainRunner(
          iterations=FLAGS.iterations_per_loop, train_steps=FLAGS.train_steps)

  if FLAGS.mode != 'eval':
    mlp_log.mlperf_print('init_start', None)

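  # Built-in periodic checkpointing is turned off when the async checkpoint
  # hook saves checkpoints instead, or when in-memory eval does not rely on
  # on-disk checkpoints.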
  if FLAGS.use_async_checkpointing or FLAGS.mode == 'in_memory_eval':
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
  mlp_log.mlperf_print('global_batch_size', FLAGS.train_batch_size)
  if not FLAGS.use_train_runner:
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        save_summary_steps=0,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            tpu_job_name=FLAGS.tpu_job_name,
            # num_shards=FLAGS.num_cores,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    resnet_classifier = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        export_to_tpu=False)

  assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', (
      'Invalid value for --precision flag; must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', FLAGS.precision)
  use_bfloat16 = FLAGS.precision == 'bfloat16'

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=use_bfloat16,
            transpose_input=FLAGS.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=FLAGS.transpose_input,
            cache=FLAGS.use_cache and is_training,
            image_size=FLAGS.image_size,
            num_parallel_calls=FLAGS.num_parallel_calls,
            num_cores=FLAGS.num_prefetch_threads,
            prefetch_depth_auto_tune=FLAGS.prefetch_depth_auto_tune,
            use_bfloat16=use_bfloat16) for is_training in [True, False]
    ]

  if FLAGS.use_train_runner and FLAGS.mode == 'train':
    params = {'batch_size': FLAGS.train_batch_size}
    trunner.initialize(imagenet_train.input_fn, resnet_model_fn, params)

  if FLAGS.use_train_runner and FLAGS.mode == 'in_memory_eval':
    params = {'batch_size': FLAGS.train_batch_size}
    low_level_runner.initialize(imagenet_train.input_fn, imagenet_eval.input_fn,
                                resnet_model_fn, params)

  if FLAGS.mode != 'eval':
    mlp_log.mlperf_print('init_stop', None)
    mlp_log.mlperf_print('run_start', None)

  mlp_log.mlperf_print('num_train_examples', FLAGS.num_train_images)
  mlp_log.mlperf_print('num_eval_examples', FLAGS.num_eval_images)

  steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
  eval_steps = int(math.ceil(FLAGS.num_eval_images / FLAGS.eval_batch_size))

  if FLAGS.mode == 'eval':
    params = {
        'batch_size': FLAGS.eval_batch_size
    }
    if FLAGS.use_eval_runner:
      erunner = eval_runner.EvalRunner(
          input_fn=imagenet_eval.input_fn,
          model_fn=resnet_model_fn,
          params=params,
          num_steps=eval_steps)
    success = False
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      current_step = int(os.path.basename(ckpt).split('-')[1])
      try:
        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.use_eval_runner:
          eval_results = erunner.eval(
              num_steps=eval_steps, checkpoint_path=ckpt)
        else:
          eval_results = resnet_classifier.evaluate(
              input_fn=imagenet_eval.input_fn,
              steps=eval_steps,
              checkpoint_path=ckpt)

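        # epoch_num is the zero-based index of the last fully trained epoch for
        # this checkpoint; max(..., 0) keeps it from going negative for the
        # first checkpoint.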
        mlp_log.mlperf_print(
            'eval_accuracy',
            float(eval_results['top_1_accuracy']),
            metadata={'epoch_num': max(current_step // steps_per_epoch - 1, 0)})
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                        eval_results, elapsed_time)
        if eval_results['top_1_accuracy'] >= FLAGS.stop_threshold:
          success = True
          mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
          mlp_log.mlperf_print('run_final', None)
          break

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', ckpt)

    if not success:
      mlp_log.mlperf_print('run_stop', None, metadata={'status': 'abort'})
      mlp_log.mlperf_print('run_final', None)

  else:   # FLAGS.mode in ('train', 'in_memory_eval', 'train_and_eval')
    if FLAGS.mode == 'train':
      if FLAGS.use_train_runner:
        trunner.train()
      else:
        hooks = []
        if FLAGS.use_async_checkpointing:
          hooks.append(
              async_checkpoint.AsyncCheckpointSaverHook(
                  checkpoint_dir=FLAGS.model_dir,
                  save_steps=max(100, FLAGS.iterations_per_loop)))
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn,
            max_steps=FLAGS.train_steps,
            hooks=hooks)
    elif FLAGS.mode == 'in_memory_eval':
      if FLAGS.use_train_runner:
        low_level_runner.train_and_eval(
            enable_tracing=FLAGS.enable_auto_tracing)
        low_level_runner.shutdown()
      else:
        steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
        hooks = []
        hooks.append(
            in_memory_eval.TPUInMemoryEvalHook(
                resnet_classifier,
                imagenet_eval.input_fn,
                steps_per_epoch,
                stop_threshold=FLAGS.stop_threshold,
                steps=FLAGS.num_eval_images // FLAGS.eval_batch_size,
                every_n_iter=steps_per_epoch * 4))
        if FLAGS.use_async_checkpointing:
          hooks.append(
              async_checkpoint.AsyncCheckpointSaverHook(
                  checkpoint_dir=FLAGS.model_dir,
                  save_steps=max(100, steps_per_epoch * 4)))
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn,
            max_steps=FLAGS.train_steps,
            hooks=hooks)
    else:
      current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
      steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size

      tf.logging.info(
          'Training for %d steps (%.2f epochs in total). Current'
          ' step %d.', FLAGS.train_steps, FLAGS.train_steps / steps_per_epoch,
          current_step)

      start_timestamp = time.time()  # This time will include compilation time

      assert FLAGS.mode == 'train_and_eval'
      success = False
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
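        # For example, with hypothetical values num_eval_images=50000 and
        # eval_batch_size=256, steps = 50000 // 256 = 195, so 195 * 256 = 49920
        # images are scored and the remaining 80 are skipped.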
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        mlp_log.mlperf_print(
            'eval_accuracy',
            float(eval_results['top_1_accuracy']),
            metadata={'epoch_num': max(current_step // steps_per_epoch - 1, 0)})

        tf.logging.info('Eval results at step %d: %s',
                        next_checkpoint, eval_results)
        if eval_results['top_1_accuracy'] >= FLAGS.stop_threshold:
          success = True
          mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
          mlp_log.mlperf_print('run_final', None)
          break

      elapsed_time = int(time.time() - start_timestamp)

      if not success:
        mlp_log.mlperf_print('run_stop', None, metadata={'status': 'abort'})
        mlp_log.mlperf_print('run_final', None)

      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)

    if FLAGS.export_dir is not None:
      # The guide to serving an exported TensorFlow model is at:
      #    https://www.tensorflow.org/serving/serving_basic
      tf.logging.info('Starting to export model.')
      resnet_classifier.export_savedmodel(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)

  if FLAGS.use_train_runner and FLAGS.mode == 'train':
    trunner.shutdown()

# Example 5

def main(argv):
    del argv  # Unused.
    global SUCCESS

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

    run_config, params = construct_run_config(FLAGS.iterations_per_loop)
    mlp_log.mlperf_print('global_batch_size', FLAGS.train_batch_size)
    mlp_log.mlperf_print('opt_base_learning_rate',
                         params['base_learning_rate'])
    mlp_log.mlperf_print('opt_weight_decay', params['weight_decay'])
    mlp_log.mlperf_print(
        'model_bn_span', FLAGS.train_batch_size // FLAGS.num_shards *
        params['distributed_group_size'])
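    # model_bn_span here is the per-core batch size multiplied by the size of
    # the distributed batch-norm group, i.e. how many samples each set of
    # batch-norm statistics is computed over.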

    if FLAGS.mode in ('eval', 'eval_once'):
        coco_gt = coco_metric.create_coco(
            FLAGS.val_json_file, use_cpp_extension=params['use_cocoeval_cc'])

    if (FLAGS.mode == 'train_and_eval' and params['in_memory_eval']
            and FLAGS.train_batch_size != FLAGS.eval_batch_size):
        raise RuntimeError(
            'train batch size should be equal to eval batch size for in memory eval.'
        )

    if (FLAGS.mode not in ('eval', 'eval_once')
            and not params['in_memory_eval']):
        if params['train_with_low_level_api'] and not params['in_memory_eval']:
            params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
            input_partition_dims = FLAGS.input_partition_dims
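            # Assuming the flag is given in [N, H, W, C] order, this permutes
            # the partition dims to match the transposed input layout:
            # [H, W, C, N] for per-core batch > 8, [H, W, N, C] otherwise.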
            if input_partition_dims is not None and params['transpose_input']:
                if params['batch_size'] > 8:
                    input_partition_dims = [
                        input_partition_dims[i] for i in [1, 2, 3, 0]
                    ]
                else:
                    input_partition_dims = [
                        input_partition_dims[i] for i in [1, 2, 0, 3]
                    ]
            trunner = train_low_level_runner.TrainLowLevelRunner(
                input_partition_dims=[input_partition_dims, None]
                if FLAGS.input_partition_dims else None,
                num_cores_per_shard=int(np.prod(FLAGS.input_partition_dims))
                if FLAGS.input_partition_dims else 1,
                iterations=FLAGS.iterations_per_loop,
            )
            input_fn = dataloader.SSDInputReader(
                FLAGS.training_file_pattern,
                params['transpose_input'],
                is_training=True,
                use_fake_data=FLAGS.use_fake_data)
            trunner.initialize(input_fn, ssd_model.ssd_model_fn, params)

    if (params['eval_with_low_level_api'] and FLAGS.mode != 'train'
            and not params['in_memory_eval']):
        params['batch_size'] = FLAGS.eval_batch_size // FLAGS.num_shards
        eval_steps = int(math.ceil(FLAGS.eval_samples / FLAGS.eval_batch_size))
        if params['distributed_eval']:
            erunner = dist_eval_low_level_runner.DistEvalLowLevelRunner(
                eval_steps=eval_steps)
        else:
            erunner = eval_low_level_runner.EvalLowLevelRunner(
                eval_steps=eval_steps)
        input_fn = dataloader.SSDInputReader(
            FLAGS.validation_file_pattern,
            is_training=False,
            use_fake_data=FLAGS.use_fake_data,
            distributed_eval=params['distributed_eval'],
            count=eval_steps * FLAGS.eval_batch_size)
        erunner.initialize(input_fn, params)
        erunner.build_model(ssd_model.ssd_model_fn, params)

    # TPU Estimator
    if FLAGS.mode == 'train':
        if params['train_with_low_level_api']:
            train_steps = int(
                (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                FLAGS.train_batch_size)
            trunner.train(train_steps)
            trunner.shutdown()
        else:
            if FLAGS.device == 'gpu':
                train_params = dict(params)
                train_params['batch_size'] = FLAGS.train_batch_size
                train_estimator = tf.estimator.Estimator(
                    model_fn=ssd_model.ssd_model_fn,
                    model_dir=FLAGS.model_dir,
                    config=run_config,
                    params=train_params)
            else:
                train_estimator = tpu_estimator.TPUEstimator(
                    model_fn=ssd_model.ssd_model_fn,
                    use_tpu=FLAGS.use_tpu,
                    train_batch_size=FLAGS.train_batch_size,
                    config=run_config,
                    params=params)

            tf.logging.info(params)
            hooks = []
            if FLAGS.use_async_checkpoint:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            train_estimator.train(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.training_file_pattern,
                    params['transpose_input'],
                    is_training=True,
                    use_fake_data=FLAGS.use_fake_data),
                steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                          FLAGS.train_batch_size),
                hooks=hooks)

        if FLAGS.eval_after_training:
            if params['eval_with_low_level_api']:
                predictions = list(erunner.predict())
            else:
                eval_estimator = tpu_estimator.TPUEstimator(
                    model_fn=ssd_model.ssd_model_fn,
                    use_tpu=FLAGS.use_tpu,
                    train_batch_size=FLAGS.train_batch_size,
                    predict_batch_size=FLAGS.eval_batch_size,
                    config=run_config,
                    params=params)

                predictions = list(
                    eval_estimator.predict(input_fn=dataloader.SSDInputReader(
                        FLAGS.validation_file_pattern,
                        is_training=False,
                        use_fake_data=FLAGS.use_fake_data)))

            eval_results = coco_metric.compute_map(
                predictions,
                coco_gt,
                use_cpp_extension=params['use_cocoeval_cc'],
                nms_on_tpu=params['nms_on_tpu'])

            tf.logging.info('Eval results: %s' % eval_results)

    elif FLAGS.mode == 'train_and_eval':
        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        if params['in_memory_eval']:
            params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
            eval_steps = int(
                math.ceil(FLAGS.eval_samples / FLAGS.eval_batch_size))
            input_partition_dims = FLAGS.input_partition_dims
            if input_partition_dims is not None and params['transpose_input']:
                if params['batch_size'] > 8:
                    input_partition_dims = [
                        input_partition_dims[i] for i in [1, 2, 3, 0]
                    ]
                else:
                    input_partition_dims = [
                        input_partition_dims[i] for i in [1, 2, 0, 3]
                    ]
            runner = train_and_eval_low_level_runner.TrainAndEvalLowLevelRunner(
                iterations=FLAGS.iterations_per_loop,
                eval_steps=eval_steps,
                input_partition_dims=input_partition_dims
                if FLAGS.input_partition_dims else None,
                num_cores_per_shard=int(np.prod(FLAGS.input_partition_dims))
                if FLAGS.input_partition_dims else 1,
            )
            input_fn = dataloader.SSDInputReader(
                FLAGS.training_file_pattern,
                params['transpose_input'],
                is_training=True,
                use_fake_data=FLAGS.use_fake_data)
            # Init for eval.
            eval_input_fn = dataloader.SSDInputReader(
                FLAGS.validation_file_pattern,
                is_training=False,
                use_fake_data=FLAGS.use_fake_data,
                distributed_eval=True,
                count=eval_steps * FLAGS.eval_batch_size)
            runner.initialize(input_fn, eval_input_fn, ssd_model.ssd_model_fn,
                              params)
            train_steps = int(
                (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                FLAGS.train_batch_size)
            runner.train_and_eval(train_steps)
            runner.shutdown()
            return

        current_step = 0
        threads = []
        for eval_step in ssd_constants.EVAL_STEPS:
            # Compute the actual eval steps based on the actual train_batch_size.
            steps = int(eval_step * ssd_constants.DEFAULT_BATCH_SIZE /
                        FLAGS.train_batch_size)
            current_epoch = current_step // params['steps_per_epoch']

            tf.logging.info('Starting training cycle for %d steps.' % steps)
            if params['train_with_low_level_api']:
                trunner.train(steps, current_step)
            else:
                run_config, params = construct_run_config(steps)
                if FLAGS.device == 'gpu':
                    train_params = dict(params)
                    train_params['batch_size'] = FLAGS.train_batch_size
                    train_estimator = tf.estimator.Estimator(
                        model_fn=ssd_model.ssd_model_fn,
                        model_dir=FLAGS.model_dir,
                        config=run_config,
                        params=train_params)
                else:
                    train_estimator = tpu_estimator.TPUEstimator(
                        model_fn=ssd_model.ssd_model_fn,
                        use_tpu=FLAGS.use_tpu,
                        train_batch_size=FLAGS.train_batch_size,
                        config=run_config,
                        params=params)

                tf.logging.info(params)
                train_estimator.train(
                    input_fn=dataloader.SSDInputReader(
                        FLAGS.training_file_pattern,
                        params['transpose_input'],
                        is_training=True,
                        use_fake_data=FLAGS.use_fake_data),
                    steps=steps)

            if SUCCESS:
                break

            current_step = current_step + steps
            current_epoch = current_step // params['steps_per_epoch']
            tf.logging.info('Starting evaluation cycle at step %d.' %
                            current_step)
            # Run evaluation at the given step.
            if params['eval_with_low_level_api']:
                # TODO(b/123313070): Fix convergence discrepancy
                # for train and distributed eval on POD with low level API.
                predictions = list(erunner.predict())
            else:
                if FLAGS.device == 'gpu':
                    eval_params = dict(params)
                    eval_params['batch_size'] = FLAGS.eval_batch_size
                    eval_estimator = tf.estimator.Estimator(
                        model_fn=ssd_model.ssd_model_fn,
                        model_dir=FLAGS.model_dir,
                        config=run_config,
                        params=eval_params)
                else:
                    eval_estimator = tpu_estimator.TPUEstimator(
                        model_fn=ssd_model.ssd_model_fn,
                        use_tpu=FLAGS.use_tpu,
                        train_batch_size=FLAGS.train_batch_size,
                        predict_batch_size=FLAGS.eval_batch_size,
                        config=run_config,
                        params=params)

                predictions = list(
                    eval_estimator.predict(input_fn=dataloader.SSDInputReader(
                        FLAGS.validation_file_pattern,
                        is_training=False,
                        use_fake_data=FLAGS.use_fake_data)))

            t = threading.Thread(target=coco_eval,
                                 args=(predictions, current_epoch,
                                       current_step, summary_writer, coco_gt,
                                       params['use_cocoeval_cc'],
                                       params['nms_on_tpu']))
            threads.append(t)
            t.start()

        if params['train_with_low_level_api']:
            trunner.shutdown()

        for t in threads:
            t.join()

        summary_writer.close()

    elif FLAGS.mode == 'eval':
        if not params['eval_with_low_level_api']:
            if FLAGS.device == 'gpu':
                eval_params = dict(params)
                eval_params['batch_size'] = FLAGS.eval_batch_size
                eval_estimator = tf.estimator.Estimator(
                    model_fn=ssd_model.ssd_model_fn,
                    model_dir=FLAGS.model_dir,
                    config=run_config,
                    params=eval_params)
            else:
                eval_estimator = tpu_estimator.TPUEstimator(
                    model_fn=ssd_model.ssd_model_fn,
                    use_tpu=FLAGS.use_tpu,
                    train_batch_size=FLAGS.train_batch_size,
                    predict_batch_size=FLAGS.eval_batch_size,
                    config=run_config,
                    params=params)

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        eval_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
        eval_epochs = [
            steps * ssd_constants.DEFAULT_BATCH_SIZE /
            FLAGS.train_batch_size // params['steps_per_epoch']
            for steps in eval_steps
        ]

        # For 8x8 slices and above.
        if FLAGS.train_batch_size >= 4096:
            eval_epochs = [i * 2 for i in eval_epochs]

        tf.logging.info('Eval epochs: %s' % eval_epochs)
        # Run evaluation when there's a new checkpoint
        threads = []
        for ckpt in next_checkpoint(FLAGS.model_dir):
            if SUCCESS:
                break
            current_step = int(os.path.basename(ckpt).split('-')[1])
            current_epoch = current_step // params['steps_per_epoch']
            tf.logging.info('current epoch: %s' % current_epoch)
            if (not params['eval_every_checkpoint']
                    and current_epoch not in eval_epochs):
                continue

            tf.logging.info('Starting to evaluate.')
            try:
                if params['eval_with_low_level_api']:
                    predictions = list(erunner.predict(checkpoint_path=ckpt))
                else:
                    predictions = list(
                        eval_estimator.predict(
                            checkpoint_path=ckpt,
                            input_fn=dataloader.SSDInputReader(
                                FLAGS.validation_file_pattern,
                                is_training=False,
                                use_fake_data=FLAGS.use_fake_data)))

                t = threading.Thread(target=coco_eval,
                                     args=(predictions, current_epoch,
                                           current_step, summary_writer,
                                           coco_gt))
                threads.append(t)
                t.start()

                # Terminate eval job when final checkpoint is reached
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case,
                # the checkpoint file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

        for t in threads:
            t.join()

        summary_writer.close()
    elif FLAGS.mode == 'eval_once':
        if not params['eval_with_low_level_api']:
            eval_estimator = tpu_estimator.TPUEstimator(
                model_fn=ssd_model.ssd_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=params)

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        # Run evaluation when there's a new checkpoint
        for ckpt in next_checkpoint(FLAGS.model_dir):
            current_step = int(os.path.basename(ckpt).split('-')[1])
            current_epoch = current_step // params['steps_per_epoch']
            print('current epoch: %s' % current_epoch)
            if FLAGS.eval_epoch < current_epoch:
                break
            if FLAGS.eval_epoch > current_epoch:
                continue

            tf.logging.info('Starting to evaluate.')
            try:
                if params['eval_with_low_level_api']:
                    predictions = list(erunner.predict(checkpoint_path=ckpt))
                else:
                    predictions = list(
                        eval_estimator.predict(
                            checkpoint_path=ckpt,
                            input_fn=dataloader.SSDInputReader(
                                FLAGS.validation_file_pattern,
                                is_training=False,
                                use_fake_data=FLAGS.use_fake_data)))

                coco_eval(predictions, current_step, summary_writer, coco_gt,
                          params['use_cocoeval_cc'], params['nms_on_tpu'])

                # Terminate eval job when final checkpoint is reached
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case,
                # the checkpoint file could have been deleted already.
                print('Checkpoint %s no longer exists, skipping checkpoint' %
                      ckpt)

        print('Eval for epoch %d ending' % FLAGS.eval_epoch)
        summary_writer.close()