Example #1
def coco_eval(predictions, current_epoch, current_step, summary_writer):
    """Call the coco library to get the eval metrics."""
    global SUCCESS
    eval_results = coco_metric.compute_map(predictions, FLAGS.val_json_file)
    mlperf_log.ssd_print(key=mlperf_log.EVAL_STOP, value=current_epoch)
    mlperf_log.ssd_print(key=mlperf_log.EVAL_SIZE, value=FLAGS.eval_samples)
    mlperf_log.ssd_print(key=mlperf_log.EVAL_ACCURACY,
                         value={
                             'epoch': current_epoch,
                             'value': eval_results['COCO/AP']
                         })
    mlperf_log.ssd_print(key=mlperf_log.EVAL_TARGET,
                         value=ssd_constants.EVAL_TARGET)
    mlperf_log.ssd_print(key=mlperf_log.EVAL_ITERATION_ACCURACY,
                         value={
                             'iteration': current_step,
                             'value': eval_results['COCO/AP']
                         })
    print("The coco AP is: {}\n".format(eval_results['COCO/AP']))
    if eval_results['COCO/AP'] >= ssd_constants.EVAL_TARGET and not SUCCESS:
        mlperf_log.ssd_print(key=mlperf_log.RUN_STOP,
                             value={'success': 'true'})
        mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
        SUCCESS = True
    tf.logging.info('Eval results: %s' % eval_results)
    # Write out eval results for the checkpoint.
    with tf.Graph().as_default():
        summaries = []
        for metric in eval_results:
            summaries.append(
                tf.Summary.Value(tag=metric,
                                 simple_value=eval_results[metric]))
        tf_summary = tf.Summary(value=list(summaries))
        summary_writer.add_summary(tf_summary, current_step)
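A minimal usage sketch (assumption: predictions, current_epoch, current_step and summary_writer come from a surrounding training loop, as in the train_and_eval examples later on this page, which run coco_eval on a background thread so the COCO evaluation does not block training):

import threading

eval_thread = threading.Thread(
    target=coco_eval,
    args=(predictions, current_epoch, current_step, summary_writer))
eval_thread.start()
# ... training continues while the evaluation runs ...
eval_thread.join()  # wait for the COCO evaluation before shutting down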
def predict_post_processing(q_in, q_out):
    """Run post-processing on CPU for predictions."""
    coco_gt = coco_metric.create_coco(FLAGS.val_json_file,
                                      use_cpp_extension=True)

    current_step, predictions = q_in.get()
    while current_step != _STOP and q_out is not None:
        tf.logging.info("Start to predict for step %d.", current_step)
        q_out.put((current_step,
                   coco_metric.compute_map(predictions,
                                           coco_gt,
                                           use_cpp_extension=True,
                                           nms_on_tpu=True)))
        current_step, predictions = q_in.get()
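The loop above keeps pulling (step, predictions) tuples from q_in until it sees the stop sentinel. A minimal sketch of the producer side, assuming _STOP is the module-level sentinel checked above, post-processing runs in its own process, and prediction_batches is a hypothetical list of (step, predictions) pairs:

import multiprocessing

q_in = multiprocessing.Queue()   # (step, predictions) from the main process
q_out = multiprocessing.Queue()  # (step, eval metrics) back to the main process

worker = multiprocessing.Process(
    target=predict_post_processing, args=(q_in, q_out))
worker.daemon = True
worker.start()

for step, predictions in prediction_batches:
    q_in.put((step, predictions))

# Drain one result per step fed, then unblock the worker so it can exit.
metrics_by_step = dict(q_out.get() for _ in prediction_batches)
q_in.put((_STOP, None))
worker.join()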
Example #3
  def postprocess(self, results):
    """Postprocess results returned from model."""
    try:
      import coco_metric  # pylint: disable=g-import-not-at-top
    except ImportError:
      raise ImportError('To use the COCO dataset, you must clone the '
                        'repo https://github.com/tensorflow/models and add '
                        'tensorflow/models and tensorflow/models/research to '
                        'the PYTHONPATH, and compile the protobufs by '
                        'following https://github.com/tensorflow/models/blob/'
                        'master/research/object_detection/g3doc/installation.md'
                        '#protobuf-compilation ; To evaluate using COCO '
                        'metric, download and install Python COCO API from '
                        'https://github.com/cocodataset/cocoapi')

    pred_boxes = results[ssd_constants.PRED_BOXES]
    pred_scores = results[ssd_constants.PRED_SCORES]
    # TODO(haoyuzhang): maybe use these values for visualization.
    # gt_boxes = results['gt_boxes']
    # gt_classes = results['gt_classes']
    source_id = results[ssd_constants.SOURCE_ID]
    raw_shape = results[ssd_constants.RAW_SHAPE]

    for i in range(self.get_batch_size()):
      self.predictions[int(source_id[i])] = {
          ssd_constants.PRED_BOXES: pred_boxes[i],
          ssd_constants.PRED_SCORES: pred_scores[i],
          ssd_constants.SOURCE_ID: source_id[i],
          ssd_constants.RAW_SHAPE: raw_shape[i]
      }

    # COCO metric calculates mAP only after a full epoch of evaluation. Return
    # dummy results for top_N_accuracy to be compatible with benchmark_cnn.py.
    if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES:
      annotation_file = os.path.join(self.params.data_dir,
                                     ssd_constants.ANNOTATION_FILE)
      eval_results = coco_metric.compute_map(self.predictions.values(),
                                             annotation_file)
      ret = {'top_1_accuracy': 0., 'top_5_accuracy': 0.}
      for metric_key, metric_value in eval_results.items():
        ret['simple_value:' + metric_key] = metric_value
      return ret
    log_fn('Got {:d} out of {:d} eval examples.'
           ' Waiting for the remaining to calculate mAP...'.format(
               len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES))
    return {'top_1_accuracy': 0., 'top_5_accuracy': 0.}
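For reference, a rough sketch of the per-batch results dictionary this method expects. The ssd_constants.* keys and the call come from the code above; the array shapes, the 200-detection cap and the model object are illustrative assumptions only:

import numpy as np

batch = {
    ssd_constants.PRED_BOXES: np.zeros((2, 200, 4), dtype=np.float32),
    ssd_constants.PRED_SCORES: np.zeros((2, 200), dtype=np.float32),
    ssd_constants.SOURCE_ID: np.array([42, 43], dtype=np.int32),
    ssd_constants.RAW_SHAPE: np.array([[640, 480, 3], [500, 375, 3]],
                                      dtype=np.int32),
}
# Returns dummy accuracies until all COCO_NUM_VAL_IMAGES have been seen.
partial = model.postprocess(batch)  # hypothetical model instance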
Example #4
def coco_eval(predictions,
              current_step,
              summary_writer,
              coco_gt,
              use_cpp_extension=True,
              nms_on_tpu=True):
    """Call the coco library to get the eval metrics."""
    global SUCCESS
    eval_results = coco_metric.compute_map(predictions,
                                           coco_gt,
                                           use_cpp_extension=use_cpp_extension,
                                           nms_on_tpu=nms_on_tpu)
    if eval_results['COCO/AP'] >= ssd_constants.EVAL_TARGET and not SUCCESS:
        SUCCESS = True
    tf.logging.info('Eval results: %s' % eval_results)
    # Write out eval results for the checkpoint.
    with tf.Graph().as_default():
        summaries = []
        for metric in eval_results:
            summaries.append(
                tf.Summary.Value(tag=metric,
                                 simple_value=eval_results[metric]))
        tf_summary = tf.Summary(value=list(summaries))
        summary_writer.add_summary(tf_summary, current_step)
Example #5
    def postprocess(self, results):
        """Postprocess results returned from model."""
        try:
            import coco_metric  # pylint: disable=g-import-not-at-top
        except ImportError:
            raise ImportError(
                'To use the COCO dataset, you must clone the '
                'repo https://github.com/tensorflow/models and add '
                'tensorflow/models and tensorflow/models/research to '
                'the PYTHONPATH, and compile the protobufs by '
                'following https://github.com/tensorflow/models/blob/'
                'master/research/object_detection/g3doc/installation.md'
                '#protobuf-compilation ; To evaluate using COCO '
                'metric, download and install Python COCO API from '
                'https://github.com/cocodataset/cocoapi')

        pred_boxes = results[ssd_constants.PRED_BOXES]
        pred_scores = results[ssd_constants.PRED_SCORES]
        # TODO(haoyuzhang): maybe use these values for visualization.
        # gt_boxes = results['gt_boxes']
        # gt_classes = results['gt_classes']
        source_id = results[ssd_constants.SOURCE_ID]
        raw_shape = results[ssd_constants.RAW_SHAPE]

        # COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once. Due
        # to rounding errors (i.e., COCO_NUM_VAL_IMAGES % batch_size != 0), setting
        # `num_eval_epochs` to 1 is not enough and will often miss some images. We
        # expect the user to set `num_eval_epochs` to >1, leaving some unused
        # images from previous steps in `predictions`. Here we check if we are doing
        # eval at a new global step.
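        # Worked example (hypothetical sizes): with 5000 validation images and
        # an eval batch size of 64, one eval epoch is 5000 / 64 = 78.125
        # batches; 78 full batches cover only 4992 images, so 8 images would
        # never be scored without a second eval epoch topping up `predictions`.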
        if results['global_step'] > self.eval_global_step:
            self.eval_global_step = results['global_step']
            self.predictions.clear()

        for i, sid in enumerate(source_id):
            self.predictions[int(sid)] = {
                ssd_constants.PRED_BOXES: pred_boxes[i],
                ssd_constants.PRED_SCORES: pred_scores[i],
                ssd_constants.SOURCE_ID: source_id[i],
                ssd_constants.RAW_SHAPE: raw_shape[i]
            }

        # COCO metric calculates mAP only after a full epoch of evaluation. Return
        # dummy results for top_N_accuracy to be compatible with benchmark_cnn.py.
        if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES:
            log_fn('Got results for all {:d} eval examples. Calculate mAP...'.
                   format(ssd_constants.COCO_NUM_VAL_IMAGES))
            annotation_file = os.path.join(self.params.data_dir,
                                           ssd_constants.ANNOTATION_FILE)
            eval_results = coco_metric.compute_map(self.predictions.values(),
                                                   annotation_file)
            self.predictions.clear()
            ret = {'top_1_accuracy': 0., 'top_5_accuracy': 0.}
            for metric_key, metric_value in eval_results.items():
                ret[constants.SIMPLE_VALUE_RESULT_PREFIX +
                    metric_key] = metric_value
            return ret
        log_fn('Got {:d} out of {:d} eval examples.'
               ' Waiting for the remaining to calculate mAP...'.format(
                   len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES))
        return {'top_1_accuracy': 0., 'top_5_accuracy': 0.}
Example #6
  def postprocess(self, results):
    """Postprocess results returned from model."""
    try:
      import coco_metric  # pylint: disable=g-import-not-at-top
    except ImportError:
      raise ImportError('To use the COCO dataset, you must clone the '
                        'repo https://github.com/tensorflow/models and add '
                        'tensorflow/models and tensorflow/models/research to '
                        'the PYTHONPATH, and compile the protobufs by '
                        'following https://github.com/tensorflow/models/blob/'
                        'master/research/object_detection/g3doc/installation.md'
                        '#protobuf-compilation ; To evaluate using COCO '
                        'metric, download and install Python COCO API from '
                        'https://github.com/cocodataset/cocoapi')

    pred_boxes = results[ssd_constants.PRED_BOXES]
    pred_scores = results[ssd_constants.PRED_SCORES]
    # TODO(haoyuzhang): maybe use these values for visualization.
    # gt_boxes = results['gt_boxes']
    # gt_classes = results['gt_classes']
    source_id = results[ssd_constants.SOURCE_ID]
    raw_shape = results[ssd_constants.RAW_SHAPE]

    # COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once. Due
    # to rounding errors (i.e., COCO_NUM_VAL_IMAGES % batch_size != 0), setting
    # `num_eval_epochs` to 1 is not enough and will often miss some images. We
    # expect the user to set `num_eval_epochs` to >1, leaving some unused
    # images from previous steps in `predictions`. Here we check if we are doing
    # eval at a new global step.
    if results['global_step'] > self.eval_global_step:
      self.eval_global_step = results['global_step']
      self.predictions.clear()

    for i, sid in enumerate(source_id):
      self.predictions[int(sid)] = {
          ssd_constants.PRED_BOXES: pred_boxes[i],
          ssd_constants.PRED_SCORES: pred_scores[i],
          ssd_constants.SOURCE_ID: source_id[i],
          ssd_constants.RAW_SHAPE: raw_shape[i]
      }

    # COCO metric calculates mAP only after a full epoch of evaluation. Return
    # dummy results for top_N_accuracy to be compatible with benchmark_cnn.py.
    if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES:
      log_fn('Got results for all {:d} eval examples. Calculate mAP...'.format(
          ssd_constants.COCO_NUM_VAL_IMAGES))

      annotation_file = os.path.join(self.params.data_dir,
                                     ssd_constants.ANNOTATION_FILE)
      # Size of predictions before decoding is about 15--30GB, while size after
      # decoding is 100--200MB. When using async eval mode, decoding takes
      # 20--30 seconds of main thread time but is necessary to avoid OOM during
      # inter-process communication.
      decoded_preds = coco_metric.decode_predictions(self.predictions.values())
      self.predictions.clear()

      if self.params.collect_eval_results_async:
        def _eval_results_getter():
          """Iteratively get eval results from async eval process."""
          while True:
            step, eval_results = self.async_eval_results_queue.get()
            self.eval_coco_ap = eval_results['COCO/AP']
            mlperf.logger.log_eval_accuracy(
                self.eval_coco_ap, step, self.batch_size * self.params.num_gpus,
                ssd_constants.COCO_NUM_TRAIN_IMAGES)
            if self.reached_target():
              # Reached target, clear all pending messages in predictions queue
              # and insert poison pill to stop the async eval process.
              while not self.async_eval_predictions_queue.empty():
                self.async_eval_predictions_queue.get()
              self.async_eval_predictions_queue.put('STOP')
              break

        if not self.async_eval_process:
          # Limiting the number of messages in predictions queue to prevent OOM.
          # Each message (predictions data) can potentially consume a lot of
          # memory, and normally there should only be a few messages in the queue.
          # If this often blocks, consider reducing the eval frequency.
          self.async_eval_predictions_queue = multiprocessing.Queue(2)
          self.async_eval_results_queue = multiprocessing.Queue()

          # The reason to use a Process as opposed to a Thread is mainly the
          # computationally intensive eval runner. Python multithreading does
          # not truly run in parallel, so a runner thread would get
          # significantly delayed (or alternatively delay the main thread).
          self.async_eval_process = multiprocessing.Process(
              target=coco_metric.async_eval_runner,
              args=(self.async_eval_predictions_queue,
                    self.async_eval_results_queue,
                    annotation_file))
          self.async_eval_process.daemon = True
          self.async_eval_process.start()

          self.async_eval_results_getter_thread = threading.Thread(
              target=_eval_results_getter, args=())
          self.async_eval_results_getter_thread.daemon = True
          self.async_eval_results_getter_thread.start()

        self.async_eval_predictions_queue.put(
            (self.eval_global_step, decoded_preds))
        return {'top_1_accuracy': 0, 'top_5_accuracy': 0.}

      eval_results = coco_metric.compute_map(decoded_preds, annotation_file)
      self.eval_coco_ap = eval_results['COCO/AP']
      ret = {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
      for metric_key, metric_value in eval_results.items():
        ret[constants.SIMPLE_VALUE_RESULT_PREFIX + metric_key] = metric_value
      mlperf.logger.log_eval_accuracy(self.eval_coco_ap, self.eval_global_step,
                                      self.batch_size * self.params.num_gpus,
                                      ssd_constants.COCO_NUM_TRAIN_IMAGES)
      return ret
    log_fn('Got {:d} out of {:d} eval examples.'
           ' Waiting for the remaining to calculate mAP...'.format(
               len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES))
    return {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
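The async path above assumes coco_metric.async_eval_runner keeps reading (step, decoded predictions) tuples from the predictions queue until it sees the 'STOP' poison pill, and pushes (step, eval_results) back on the results queue. The sketch below only illustrates that queue protocol and is not the real coco_metric implementation; it reuses compute_map as in the synchronous branch:

def illustrative_async_eval_runner(predictions_queue, results_queue,
                                   annotation_file):
  """Illustrative stand-in for coco_metric.async_eval_runner."""
  while True:
    message = predictions_queue.get()
    if message == 'STOP':  # poison pill inserted once the target AP is reached
      break
    step, decoded_preds = message
    eval_results = coco_metric.compute_map(decoded_preds, annotation_file)
    results_queue.put((step, eval_results))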
Example #7
def main(argv):
    del argv  # Unused.
    global SUCCESS
    if FLAGS.model_dir:
        print(FLAGS.model_dir)
    else:
        print(FLAGS.training_file_pattern)
        raise Exception('No model dir')
    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

    run_config, params = construct_run_config(FLAGS.iterations_per_loop)

    if FLAGS.mode != 'eval' and FLAGS.mode != 'eval_once':
        if params['train_with_low_level_api']:
            params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
            trunner = train_low_level_runner.TrainLowLevelRunner(
                iterations=FLAGS.iterations_per_loop)
            input_fn = dataloader.SSDInputReader(
                FLAGS.training_file_pattern,
                params['transpose_input'],
                is_training=True,
                use_fake_data=FLAGS.use_fake_data)
            mlperf_log.ssd_print(key=mlperf_log.RUN_START)
            trunner.initialize(input_fn, ssd_model.ssd_model_fn, params)
        else:
            mlperf_log.ssd_print(key=mlperf_log.RUN_START)

    if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
        if params['eval_with_low_level_api']:
            params['batch_size'] = FLAGS.eval_batch_size // FLAGS.num_shards
            erunner = eval_low_level_runner.EvalLowLevelRunner(
                eval_steps=int(FLAGS.eval_samples / FLAGS.eval_batch_size))
            input_fn = dataloader.SSDInputReader(
                FLAGS.validation_file_pattern,
                is_training=False,
                use_fake_data=FLAGS.use_fake_data)
            erunner.initialize(input_fn, params)
            erunner.build_model(ssd_model.ssd_model_fn, params)

    # TPU Estimator
    if FLAGS.mode == 'train':
        if params['train_with_low_level_api']:
            train_steps = int(
                (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                FLAGS.train_batch_size)
            mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
            mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=0)
            trunner.train(train_steps)
            trunner.shutdown()
        else:
            if FLAGS.device == 'gpu':
                params['dataset_num_shards'] = 1
                params['dataset_index'] = 0
                train_params = dict(params)
                train_params['batch_size'] = FLAGS.train_batch_size
                train_estimator = tf.estimator.Estimator(
                    model_fn=ssd_model.ssd_model_fn,
                    model_dir=FLAGS.model_dir,
                    config=run_config,
                    params=train_params)
            else:
                train_estimator = tpu_estimator.TPUEstimator(
                    model_fn=ssd_model.ssd_model_fn,
                    use_tpu=FLAGS.use_tpu,
                    train_batch_size=FLAGS.train_batch_size,
                    config=run_config,
                    params=params)

            tf.logging.info(params)
            mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
            mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=0)
            hooks = []

            if FLAGS.use_async_checkpoint:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            train_estimator.train(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.training_file_pattern,
                    params['transpose_input'],
                    is_training=True,
                    use_fake_data=FLAGS.use_fake_data),
                steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                          FLAGS.train_batch_size),
                hooks=hooks)

        if FLAGS.eval_after_training:
            eval_estimator = tpu_estimator.TPUEstimator(
                model_fn=ssd_model.ssd_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=params)

            predictions = list(
                eval_estimator.predict(input_fn=dataloader.SSDInputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    use_fake_data=FLAGS.use_fake_data)))

            eval_results = coco_metric.compute_map(predictions,
                                                   FLAGS.val_json_file)

            tf.logging.info('Eval results: %s' % eval_results)

    elif FLAGS.mode == 'train_and_eval':
        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        current_step = 0
        mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
        threads = []
        for eval_step in ssd_constants.EVAL_STEPS:
            # Compute the actual eval steps based on the actual train_batch_size
            steps = int(eval_step * ssd_constants.DEFAULT_BATCH_SIZE /
                        FLAGS.train_batch_size)
            current_epoch = current_step // params['steps_per_epoch']
            # TODO(wangtao): figure out how to log for each epoch.
            mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH,
                                 value=current_epoch)

            tf.logging.info('Starting training cycle for %d steps.' % steps)
            if params['train_with_low_level_api']:
                trunner.train(steps)
            else:
                run_config, params = construct_run_config(steps)
                if FLAGS.device == 'gpu':
                    train_params = dict(params)
                    train_params['batch_size'] = FLAGS.train_batch_size
                    train_estimator = tf.estimator.Estimator(
                        model_fn=ssd_model.ssd_model_fn,
                        model_dir=FLAGS.model_dir,
                        config=run_config,
                        params=train_params)
                else:
                    train_estimator = tpu_estimator.TPUEstimator(
                        model_fn=ssd_model.ssd_model_fn,
                        use_tpu=FLAGS.use_tpu,
                        train_batch_size=FLAGS.train_batch_size,
                        config=run_config,
                        params=params)

                tf.logging.info(params)
                train_estimator.train(
                    input_fn=dataloader.SSDInputReader(
                        FLAGS.training_file_pattern,
                        params['transpose_input'],
                        is_training=True,
                        use_fake_data=FLAGS.use_fake_data),
                    steps=steps)

            if SUCCESS:
                break

            current_step = current_step + steps
            current_epoch = current_step // params['steps_per_epoch']
            tf.logging.info('Starting evaluation cycle at step %d.' %
                            current_step)
            mlperf_log.ssd_print(key=mlperf_log.EVAL_START,
                                 value=current_epoch)
            # Run evaluation at the given step.
            if params['eval_with_low_level_api']:
                predictions = list(erunner.predict())
            else:
                if FLAGS.device == 'gpu':
                    eval_params = dict(params)
                    eval_params['batch_size'] = FLAGS.eval_batch_size
                    eval_estimator = tf.estimator.Estimator(
                        model_fn=ssd_model.ssd_model_fn,
                        model_dir=FLAGS.model_dir,
                        config=run_config,
                        params=eval_params)
                else:
                    eval_estimator = tpu_estimator.TPUEstimator(
                        model_fn=ssd_model.ssd_model_fn,
                        use_tpu=FLAGS.use_tpu,
                        train_batch_size=FLAGS.train_batch_size,
                        predict_batch_size=FLAGS.eval_batch_size,
                        config=run_config,
                        params=params)

                predictions = list(
                    eval_estimator.predict(input_fn=dataloader.SSDInputReader(
                        FLAGS.validation_file_pattern,
                        is_training=False,
                        use_fake_data=FLAGS.use_fake_data)))

            t = threading.Thread(target=coco_eval,
                                 args=(predictions, current_epoch,
                                       current_step, summary_writer))
            threads.append(t)
            t.start()

        trunner.shutdown()

        for t in threads:
            t.join()

        # success is a string right now, as a boolean is not JSON serializable.
        if not SUCCESS:
            mlperf_log.ssd_print(key=mlperf_log.RUN_STOP,
                                 value={'success': 'false'})
            mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
        summary_writer.close()

    elif FLAGS.mode == 'eval':
        if not params['eval_with_low_level_api']:
            if FLAGS.device == 'gpu':
                eval_params = dict(params)
                eval_params['batch_size'] = FLAGS.eval_batch_size
                eval_estimator = tf.estimator.Estimator(
                    model_fn=ssd_model.ssd_model_fn,
                    model_dir=FLAGS.model_dir,
                    config=run_config,
                    params=eval_params)
            else:
                eval_estimator = tpu_estimator.TPUEstimator(
                    model_fn=ssd_model.ssd_model_fn,
                    use_tpu=FLAGS.use_tpu,
                    train_batch_size=FLAGS.train_batch_size,
                    predict_batch_size=FLAGS.eval_batch_size,
                    config=run_config,
                    params=params)

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        eval_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
        eval_epochs = [
            steps * ssd_constants.DEFAULT_BATCH_SIZE /
            FLAGS.train_batch_size // params['steps_per_epoch']
            for steps in eval_steps
        ]
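        # Worked example (hypothetical numbers): a cumulative EVAL_STEPS value
        # of 160000 with DEFAULT_BATCH_SIZE=32 becomes 160000 * 32 / 1024 = 5000
        # steps at train_batch_size=1024; with steps_per_epoch=100 that
        # checkpoint lands at epoch 50.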

        # For 8x8 slices and above.
        if FLAGS.train_batch_size >= 4096:
            eval_epochs = [i * 2 for i in eval_epochs]

        tf.logging.info('Eval epochs: %s' % eval_epochs)
        # Run evaluation when there's a new checkpoint
        threads = []
        count = 1
        for ckpt in next_checkpoint(FLAGS.model_dir):
            print("current count is {}\n".format(count))
            count += 1
            if SUCCESS:
                break
            current_step = int(os.path.basename(ckpt).split('-')[1])
            current_epoch = current_step // params['steps_per_epoch']
            tf.logging.info('current step: %s' % current_step)
            tf.logging.info('current epoch: %s' % current_epoch)
            if not params[
                    'eval_every_checkpoint'] and current_epoch not in eval_epochs:
                continue

            tf.logging.info('Starting to evaluate.')
            try:
                mlperf_log.ssd_print(key=mlperf_log.EVAL_START,
                                     value=current_epoch)

                if params['eval_with_low_level_api']:
                    predictions = list(erunner.predict(checkpoint_path=ckpt))
                else:
                    predictions = list(
                        eval_estimator.predict(
                            checkpoint_path=ckpt,
                            input_fn=dataloader.SSDInputReader(
                                FLAGS.validation_file_pattern,
                                is_training=False,
                                use_fake_data=FLAGS.use_fake_data)))

                t = threading.Thread(target=coco_eval,
                                     args=(predictions, current_epoch,
                                           current_step, summary_writer))
                threads.append(t)
                t.start()

                # Terminate eval job when final checkpoint is reached
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case,
                # the checkpoint file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

        for t in threads:
            t.join()

        if not SUCCESS:
            mlperf_log.ssd_print(key=mlperf_log.RUN_STOP,
                                 value={'success': 'false'})
            mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
        summary_writer.close()
    elif FLAGS.mode == 'eval_once':
        if not params['eval_with_low_level_api']:
            eval_estimator = tpu_estimator.TPUEstimator(
                model_fn=ssd_model.ssd_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=params)

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        # Run evaluation when there's a new checkpoint
        for ckpt in next_checkpoint(FLAGS.model_dir):
            current_step = int(os.path.basename(ckpt).split('-')[1])
            current_epoch = current_step // params['steps_per_epoch']
            print('current epoch: %s' % current_epoch)
            if FLAGS.eval_epoch < current_epoch:
                break
            if FLAGS.eval_epoch > current_epoch:
                continue

            tf.logging.info('Starting to evaluate.')
            try:
                mlperf_log.ssd_print(key=mlperf_log.EVAL_START,
                                     value=current_epoch)

                if params['eval_with_low_level_api']:
                    predictions = list(erunner.predict(checkpoint_path=ckpt))
                else:
                    predictions = list(
                        eval_estimator.predict(
                            checkpoint_path=ckpt,
                            input_fn=dataloader.SSDInputReader(
                                FLAGS.validation_file_pattern,
                                is_training=False,
                                use_fake_data=FLAGS.use_fake_data)))

                coco_eval(predictions, current_epoch, current_step,
                          summary_writer)

                # Terminate eval job when final checkpoint is reached
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    if not SUCCESS:
                        mlperf_log.ssd_print(key=mlperf_log.RUN_STOP,
                                             value={'success': 'false'})
                        mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
                    print('Evaluation finished after training step %d' %
                          current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case,
                # the checkpoint file could have been deleted already.
                print('Checkpoint %s no longer exists, skipping checkpoint' %
                      ckpt)

    print('Finished evaluating epoch %d.' % FLAGS.eval_epoch)
        summary_writer.close()
Example #8
def main(argv):
    del argv  # Unused.
    global SUCCESS

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

    run_config, params = construct_run_config(FLAGS.iterations_per_loop)
    mlp_log.mlperf_print('global_batch_size', FLAGS.train_batch_size)
    mlp_log.mlperf_print('opt_base_learning_rate',
                         params['base_learning_rate'])
    mlp_log.mlperf_print('opt_weight_decay', params['weight_decay'])
    mlp_log.mlperf_print(
        'model_bn_span', FLAGS.train_batch_size // FLAGS.num_shards *
        params['distributed_group_size'])

    if FLAGS.mode in ('eval', 'eval_once'):
        coco_gt = coco_metric.create_coco(
            FLAGS.val_json_file, use_cpp_extension=params['use_cocoeval_cc'])

    if FLAGS.mode == 'train_and_eval' and params[
            'in_memory_eval'] and FLAGS.train_batch_size != FLAGS.eval_batch_size:
        raise RuntimeError(
            'train batch size should be equal to eval batch size for in memory eval.'
        )

    if FLAGS.mode != 'eval' and FLAGS.mode != 'eval_once' and not params[
            'in_memory_eval']:
        if params['train_with_low_level_api'] and not params['in_memory_eval']:
            params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
            input_partition_dims = FLAGS.input_partition_dims
            if input_partition_dims is not None and params['transpose_input']:
                if params['batch_size'] > 8:
                    input_partition_dims = [
                        input_partition_dims[i] for i in [1, 2, 3, 0]
                    ]
                else:
                    input_partition_dims = [
                        input_partition_dims[i] for i in [1, 2, 0, 3]
                    ]
            trunner = train_low_level_runner.TrainLowLevelRunner(
                input_partition_dims=[input_partition_dims, None]
                if FLAGS.input_partition_dims else None,
                num_cores_per_shard=int(np.prod(FLAGS.input_partition_dims))
                if FLAGS.input_partition_dims else 1,
                iterations=FLAGS.iterations_per_loop,
            )
            input_fn = dataloader.SSDInputReader(
                FLAGS.training_file_pattern,
                params['transpose_input'],
                is_training=True,
                use_fake_data=FLAGS.use_fake_data)
            trunner.initialize(input_fn, ssd_model.ssd_model_fn, params)

    if params[
            'eval_with_low_level_api'] and FLAGS.mode != 'train' and not params[
                'in_memory_eval']:
        params['batch_size'] = FLAGS.eval_batch_size // FLAGS.num_shards
        eval_steps = int(math.ceil(FLAGS.eval_samples / FLAGS.eval_batch_size))
        if params['distributed_eval']:
            erunner = dist_eval_low_level_runner.DistEvalLowLevelRunner(
                eval_steps=eval_steps)
        else:
            erunner = eval_low_level_runner.EvalLowLevelRunner(
                eval_steps=eval_steps)
        input_fn = dataloader.SSDInputReader(
            FLAGS.validation_file_pattern,
            is_training=False,
            use_fake_data=FLAGS.use_fake_data,
            distributed_eval=params['distributed_eval'],
            count=eval_steps * FLAGS.eval_batch_size)
        erunner.initialize(input_fn, params)
        erunner.build_model(ssd_model.ssd_model_fn, params)

    # TPU Estimator
    if FLAGS.mode == 'train':
        if params['train_with_low_level_api']:
            train_steps = int(
                (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                FLAGS.train_batch_size)
            trunner.train(train_steps)
            trunner.shutdown()
        else:
            if FLAGS.device == 'gpu':
                train_params = dict(params)
                train_params['batch_size'] = FLAGS.train_batch_size
                train_estimator = tf.estimator.Estimator(
                    model_fn=ssd_model.ssd_model_fn,
                    model_dir=FLAGS.model_dir,
                    config=run_config,
                    params=train_params)
            else:
                train_estimator = tpu_estimator.TPUEstimator(
                    model_fn=ssd_model.ssd_model_fn,
                    use_tpu=FLAGS.use_tpu,
                    train_batch_size=FLAGS.train_batch_size,
                    config=run_config,
                    params=params)

            tf.logging.info(params)
            hooks = []
            if FLAGS.use_async_checkpoint:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            train_estimator.train(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.training_file_pattern,
                    params['transpose_input'],
                    is_training=True,
                    use_fake_data=FLAGS.use_fake_data),
                steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                          FLAGS.train_batch_size),
                hooks=hooks)

        if FLAGS.eval_after_training:
            if params['eval_with_low_level_api']:
                predictions = list(erunner.predict())
            else:
                eval_estimator = tpu_estimator.TPUEstimator(
                    model_fn=ssd_model.ssd_model_fn,
                    use_tpu=FLAGS.use_tpu,
                    train_batch_size=FLAGS.train_batch_size,
                    predict_batch_size=FLAGS.eval_batch_size,
                    config=run_config,
                    params=params)

                predictions = list(
                    eval_estimator.predict(input_fn=dataloader.SSDInputReader(
                        FLAGS.validation_file_pattern,
                        is_training=False,
                        use_fake_data=FLAGS.use_fake_data)))

            eval_results = coco_metric.compute_map(
                predictions,
                coco_gt,
                use_cpp_extension=params['use_cocoeval_cc'],
                nms_on_tpu=params['nms_on_tpu'])

            tf.logging.info('Eval results: %s' % eval_results)

    elif FLAGS.mode == 'train_and_eval':
        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        if params['in_memory_eval']:
            params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
            eval_steps = int(
                math.ceil(FLAGS.eval_samples / FLAGS.eval_batch_size))
            input_partition_dims = FLAGS.input_partition_dims
            if input_partition_dims is not None and params['transpose_input']:
                if params['batch_size'] > 8:
                    input_partition_dims = [
                        input_partition_dims[i] for i in [1, 2, 3, 0]
                    ]
                else:
                    input_partition_dims = [
                        input_partition_dims[i] for i in [1, 2, 0, 3]
                    ]
            runner = train_and_eval_low_level_runner.TrainAndEvalLowLevelRunner(
                iterations=FLAGS.iterations_per_loop,
                eval_steps=eval_steps,
                input_partition_dims=input_partition_dims
                if FLAGS.input_partition_dims else None,
                num_cores_per_shard=int(np.prod(FLAGS.input_partition_dims))
                if FLAGS.input_partition_dims else 1,
            )
            input_fn = dataloader.SSDInputReader(
                FLAGS.training_file_pattern,
                params['transpose_input'],
                is_training=True,
                use_fake_data=FLAGS.use_fake_data)
            # Init for eval.
            eval_input_fn = dataloader.SSDInputReader(
                FLAGS.validation_file_pattern,
                is_training=False,
                use_fake_data=FLAGS.use_fake_data,
                distributed_eval=True,
                count=eval_steps * FLAGS.eval_batch_size)
            runner.initialize(input_fn, eval_input_fn, ssd_model.ssd_model_fn,
                              params)
            train_steps = int(
                (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                FLAGS.train_batch_size)
            runner.train_and_eval(train_steps)
            runner.shutdown()
            return

        current_step = 0
        threads = []
        for eval_step in ssd_constants.EVAL_STEPS:
            # Compute the actual eval steps based on the actual train_batch_size
            steps = int(eval_step * ssd_constants.DEFAULT_BATCH_SIZE /
                        FLAGS.train_batch_size)
            current_epoch = current_step // params['steps_per_epoch']

            tf.logging.info('Starting training cycle for %d steps.' % steps)
            if params['train_with_low_level_api']:
                trunner.train(steps, current_step)
            else:
                run_config, params = construct_run_config(steps)
                if FLAGS.device == 'gpu':
                    train_params = dict(params)
                    train_params['batch_size'] = FLAGS.train_batch_size
                    train_estimator = tf.estimator.Estimator(
                        model_fn=ssd_model.ssd_model_fn,
                        model_dir=FLAGS.model_dir,
                        config=run_config,
                        params=train_params)
                else:
                    train_estimator = tpu_estimator.TPUEstimator(
                        model_fn=ssd_model.ssd_model_fn,
                        use_tpu=FLAGS.use_tpu,
                        train_batch_size=FLAGS.train_batch_size,
                        config=run_config,
                        params=params)

                tf.logging.info(params)
                train_estimator.train(
                    input_fn=dataloader.SSDInputReader(
                        FLAGS.training_file_pattern,
                        params['transpose_input'],
                        is_training=True,
                        use_fake_data=FLAGS.use_fake_data),
                    steps=steps)

            if SUCCESS:
                break

            current_step = current_step + steps
            current_epoch = current_step // params['steps_per_epoch']
            tf.logging.info('Starting evaluation cycle at step %d.' %
                            current_step)
            # Run evaluation at the given step.
            if params['eval_with_low_level_api']:
                # TODO(b/123313070): Fix convergence discrepancy
                # for train and distributed eval on POD with low level API.
                predictions = list(erunner.predict())
            else:
                if FLAGS.device == 'gpu':
                    eval_params = dict(params)
                    eval_params['batch_size'] = FLAGS.eval_batch_size
                    eval_estimator = tf.estimator.Estimator(
                        model_fn=ssd_model.ssd_model_fn,
                        model_dir=FLAGS.model_dir,
                        config=run_config,
                        params=eval_params)
                else:
                    eval_estimator = tpu_estimator.TPUEstimator(
                        model_fn=ssd_model.ssd_model_fn,
                        use_tpu=FLAGS.use_tpu,
                        train_batch_size=FLAGS.train_batch_size,
                        predict_batch_size=FLAGS.eval_batch_size,
                        config=run_config,
                        params=params)

                predictions = list(
                    eval_estimator.predict(input_fn=dataloader.SSDInputReader(
                        FLAGS.validation_file_pattern,
                        is_training=False,
                        use_fake_data=FLAGS.use_fake_data)))

            t = threading.Thread(target=coco_eval,
                                 args=(predictions, current_epoch,
                                       current_step, summary_writer, coco_gt,
                                       params['use_cocoeval_cc'],
                                       params['nms_on_tpu']))
            threads.append(t)
            t.start()

        if params['train_with_low_level_api']:
            trunner.shutdown()

        for t in threads:
            t.join()

        summary_writer.close()

    elif FLAGS.mode == 'eval':
        if not params['eval_with_low_level_api']:
            if FLAGS.device == 'gpu':
                eval_params = dict(params)
                eval_params['batch_size'] = FLAGS.eval_batch_size
                eval_estimator = tf.estimator.Estimator(
                    model_fn=ssd_model.ssd_model_fn,
                    model_dir=FLAGS.model_dir,
                    config=run_config,
                    params=eval_params)
            else:
                eval_estimator = tpu_estimator.TPUEstimator(
                    model_fn=ssd_model.ssd_model_fn,
                    use_tpu=FLAGS.use_tpu,
                    train_batch_size=FLAGS.train_batch_size,
                    predict_batch_size=FLAGS.eval_batch_size,
                    config=run_config,
                    params=params)

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        eval_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
        eval_epochs = [
            steps * ssd_constants.DEFAULT_BATCH_SIZE /
            FLAGS.train_batch_size // params['steps_per_epoch']
            for steps in eval_steps
        ]

        # For 8x8 slices and above.
        if FLAGS.train_batch_size >= 4096:
            eval_epochs = [i * 2 for i in eval_epochs]

        tf.logging.info('Eval epochs: %s' % eval_epochs)
        # Run evaluation when there's a new checkpoint
        threads = []
        for ckpt in next_checkpoint(FLAGS.model_dir):
            if SUCCESS:
                break
            current_step = int(os.path.basename(ckpt).split('-')[1])
            current_epoch = current_step // params['steps_per_epoch']
            tf.logging.info('current epoch: %s' % current_epoch)
            if not params[
                    'eval_every_checkpoint'] and current_epoch not in eval_epochs:
                continue

            tf.logging.info('Starting to evaluate.')
            try:
                if params['eval_with_low_level_api']:
                    predictions = list(erunner.predict(checkpoint_path=ckpt))
                else:
                    predictions = list(
                        eval_estimator.predict(
                            checkpoint_path=ckpt,
                            input_fn=dataloader.SSDInputReader(
                                FLAGS.validation_file_pattern,
                                is_training=False,
                                use_fake_data=FLAGS.use_fake_data)))

                t = threading.Thread(target=coco_eval,
                                     args=(predictions, current_epoch,
                                           current_step, summary_writer,
                                           coco_gt))
                threads.append(t)
                t.start()

                # Terminate eval job when final checkpoint is reached
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case,
                # the checkpoint file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

        for t in threads:
            t.join()

        summary_writer.close()
    elif FLAGS.mode == 'eval_once':
        if not params['eval_with_low_level_api']:
            eval_estimator = tpu_estimator.TPUEstimator(
                model_fn=ssd_model.ssd_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=params)

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        # Run evaluation when there's a new checkpoint
        for ckpt in next_checkpoint(FLAGS.model_dir):
            current_step = int(os.path.basename(ckpt).split('-')[1])
            current_epoch = current_step // params['steps_per_epoch']
            print('current epoch: %s' % current_epoch)
            if FLAGS.eval_epoch < current_epoch:
                break
            if FLAGS.eval_epoch > current_epoch:
                continue

            tf.logging.info('Starting to evaluate.')
            try:
                if params['eval_with_low_level_api']:
                    predictions = list(erunner.predict(checkpoint_path=ckpt))
                else:
                    predictions = list(
                        eval_estimator.predict(
                            checkpoint_path=ckpt,
                            input_fn=dataloader.SSDInputReader(
                                FLAGS.validation_file_pattern,
                                is_training=False,
                                use_fake_data=FLAGS.use_fake_data)))

                coco_eval(predictions, current_step, summary_writer, coco_gt,
                          params['use_cocoeval_cc'], params['nms_on_tpu'])

                # Terminate eval job when final checkpoint is reached
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long
                # after the CPU job tells it to start evaluating. In this case,
                # the checkpoint file could have been deleted already.
                print('Checkpoint %s no longer exists, skipping checkpoint' %
                      ckpt)

    print('Finished evaluating epoch %d.' % FLAGS.eval_epoch)
        summary_writer.close()