def eval_multiprocessing(predictions,
                         eval_metric,
                         eval_worker_count,
                         queue_size=mask_rcnn_params.QUEUE_SIZE):
    """Enables multiprocessing to update eval metrics."""
    # copybara:strip_begin
    q_in, q_out = REDACTEDprocess.get_user_data()
    processes = [
        REDACTEDprocess.Process(target=REDACTED_post_processing)
        for _ in range(eval_worker_count)
    ]
    # copybara:strip_end_and_replace_begin
    # q_in = multiprocessing.Queue(maxsize=queue_size)
    # q_out = multiprocessing.Queue(maxsize=queue_size)
    # processes = [
    #     multiprocessing.Process(target=post_processing, args=(q_in, q_out))
    #     for _ in range(eval_worker_count)
    # ]
    # copybara:replace_end
    for p in processes:
        p.start()

    # TODO(b/129410706): investigate whether threading improves speed.
    # Every predictor.next() gets a batch of predictions (a dictionary).
    exited_process = 0
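    # Shard the predictions evenly across workers; integer division means up
    # to eval_worker_count - 1 trailing samples are never enqueued.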
    samples = len(predictions['detections']) // eval_worker_count
    for i in range(eval_worker_count):
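        # Backpressure: while the input queue is full (or results are piling
        # up), drain q_out so the workers can keep making progress.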
        while q_in.full() or q_out.qsize() > queue_size // 4:
            exited_process = update_eval_metric(q_out, eval_metric,
                                                exited_process)

        q_in.put((predictions['detections'][i * samples:(i + 1) * samples],
                  predictions['mask_outputs'][i * samples:(i + 1) * samples],
                  predictions['image_info'][i * samples:(i + 1) * samples]))

    # Adds empty items to signal the children to quit.
    for _ in processes:
        q_in.put((None, None, None))

    # Cleans up q_out and waits for all the processes to finish work.
    while not q_out.empty() or exited_process < eval_worker_count:
        exited_process = update_eval_metric(q_out, eval_metric, exited_process)

    for p in processes:
        # actively terminate all processes (to work around the multiprocessing
        # deadlock issue in Cloud)
        # copybara:insert p.terminate()
        p.join()
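
Neither post_processing nor update_eval_metric is defined in this snippet. Below is a minimal sketch of the queue protocol they appear to implement; compute_coco_metrics and eval_metric.update are hypothetical stand-ins, not the real API:

def post_processing(q_in, q_out):
    """Worker loop: consumes prediction shards until the (None, None, None) sentinel."""
    detections, mask_outputs, image_info = q_in.get()
    while detections is not None:
        # Hypothetical: turn a raw prediction shard into per-image results.
        q_out.put(compute_coco_metrics(detections, mask_outputs, image_info))
        detections, mask_outputs, image_info = q_in.get()
    q_out.put(None)  # Signals the parent that this worker has exited.


def update_eval_metric(q_out, eval_metric, exited_process):
    """Folds one result from q_out into eval_metric and counts worker exits."""
    result = q_out.get()
    if result is None:
        return exited_process + 1
    eval_metric.update(result)  # Hypothetical accumulator API.
    return exited_process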
Example 2

  def eval_finish_fn(cur_epoch, eval_results, _):
    """Logs eval results and stops the run once the MLPerf targets are met."""
    # NOTE: the scraped snippet began mid-function; this header is a plausible
    # reconstruction based on the runner.train_and_eval call below.
    mlp_log.mlperf_print(
        'eval_accuracy',
        {'BBOX': float(eval_results['AP']),
         'SEGM': float(eval_results['mask_AP'])},
        metadata={'epoch_num': cur_epoch + 1})
    mlp_log.mlperf_print(
        'eval_stop', None, metadata={'epoch_num': cur_epoch + 1})
    if (eval_results['AP'] >= mask_rcnn_params.BOX_EVAL_TARGET and
        eval_results['mask_AP'] >= mask_rcnn_params.MASK_EVAL_TARGET):
      mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
      return True
    return False

  def run_finish_fn(success):
    if not success:
      mlp_log.mlperf_print('run_stop', None, metadata={'status': 'abort'})

  runner.train_and_eval(eval_init_fn, eval_finish_fn, run_finish_fn)


if __name__ == '__main__':
  # copybara:strip_begin
  user_data = (multiprocessing.Queue(maxsize=mask_rcnn_params.QUEUE_SIZE),
               multiprocessing.Queue(maxsize=mask_rcnn_params.QUEUE_SIZE))
  with REDACTEDprocess.main_handler(user_data=user_data):
    tf.logging.set_verbosity(tf.logging.INFO)
    app.run(main)
  # copybara:strip_end
  # copybara:insert tf.logging.set_verbosity(tf.logging.INFO)
  # copybara:insert app.run(main)
def REDACTED_post_processing():
    """REDACTED batch-processes the predictions."""
    q_in, q_out = REDACTEDprocess.get_user_data()
    post_processing(q_in, q_out)
Example 4
def main(argv):
    del argv  # Unused.

    params = construct_run_config(FLAGS.iterations_per_loop)
    mlp_log.mlperf_print(key='cache_clear', value=True)
    mlp_log.mlperf_print(key='init_start', value=None)
    mlp_log.mlperf_print('global_batch_size', FLAGS.train_batch_size)
    mlp_log.mlperf_print('opt_base_learning_rate',
                         params['base_learning_rate'])
    mlp_log.mlperf_print(
        'opt_learning_rate_decay_boundary_epochs',
        [params['first_lr_drop_epoch'], params['second_lr_drop_epoch']])
    mlp_log.mlperf_print('opt_weight_decay', params['weight_decay'])
    mlp_log.mlperf_print(
        'model_bn_span', FLAGS.train_batch_size // FLAGS.num_shards *
        params['distributed_group_size'])
    mlp_log.mlperf_print('max_samples', ssd_constants.NUM_CROP_PASSES)
    mlp_log.mlperf_print('train_samples', FLAGS.num_examples_per_epoch)
    mlp_log.mlperf_print('eval_samples', FLAGS.eval_samples)

    params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
    input_partition_dims = FLAGS.input_partition_dims
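    # Total steps = epochs * examples per epoch / global batch size; integer
    # division drops any partial final step.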
    train_steps = FLAGS.num_epochs * FLAGS.num_examples_per_epoch // FLAGS.train_batch_size
    eval_steps = int(math.ceil(FLAGS.eval_samples / FLAGS.eval_batch_size))
    runner = train_and_eval_runner.TrainAndEvalRunner(
        FLAGS.iterations_per_loop, train_steps, eval_steps, FLAGS.num_shards)

    train_input_fn = dataloader.SSDInputReader(
        FLAGS.training_file_pattern,
        params['transpose_input'],
        is_training=True,
        use_fake_data=FLAGS.use_fake_data,
        params=params)
    eval_input_fn = dataloader.SSDInputReader(
        FLAGS.validation_file_pattern,
        is_training=False,
        use_fake_data=FLAGS.use_fake_data,
        distributed_eval=True,
        count=eval_steps * FLAGS.eval_batch_size,
        params=params)

    def init_fn():
        tf.train.init_from_checkpoint(
            params['resnet_checkpoint'], {
                'resnet/': 'resnet%s/' % ssd_constants.RESNET_DEPTH,
            })

    runner.initialize(train_input_fn, eval_input_fn,
                      functools.partial(ssd_model.ssd_model_fn,
                                        params), FLAGS.train_batch_size,
                      FLAGS.eval_batch_size, input_partition_dims, init_fn)
    mlp_log.mlperf_print('init_stop', None)
    mlp_log.mlperf_print('run_start', None)

    if FLAGS.run_cocoeval:
        # copybara:strip_begin
        q_in, q_out = REDACTEDprocess.get_user_data()
        processes = [
            REDACTEDprocess.Process(target=REDACTED_predict_post_processing)
            for _ in range(4)
        ]
        # copybara:strip_end_and_replace_begin
        # q_in = multiprocessing.Queue(maxsize=ssd_constants.QUEUE_SIZE)
        # q_out = multiprocessing.Queue(maxsize=ssd_constants.QUEUE_SIZE)
        # processes = [
        #     multiprocessing.Process(
        #         target=predict_post_processing, args=(q_in, q_out))
        #     for _ in range(4)
        # ]
        # copybara:replace_end
        for p in processes:
            p.start()

        def log_eval_results_fn():
            """Print out MLPerf log."""
            result = q_out.get()
            success = False
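            # Each queue item is a (step, metrics_dict) pair; the parent puts
            # (_STOP, None) on q_out to shut this thread down.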
            while result[0] != _STOP:
                if not success:
                    steps_per_epoch = (FLAGS.num_examples_per_epoch //
                                       FLAGS.train_batch_size)
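                    # result[0] is the step at the start of the eval block;
                    # the eval itself ran iterations_per_loop steps later.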
                    epoch = (result[0] +
                             FLAGS.iterations_per_loop) // steps_per_epoch
                    mlp_log.mlperf_print('eval_accuracy',
                                         result[1]['COCO/AP'],
                                         metadata={'epoch_num': epoch})
                    mlp_log.mlperf_print('eval_stop',
                                         None,
                                         metadata={'epoch_num': epoch})
                    if result[1]['COCO/AP'] > ssd_constants.EVAL_TARGET:
                        success = True
                        mlp_log.mlperf_print('run_stop',
                                             None,
                                             metadata={'status': 'success'})
                result = q_out.get()
            if not success:
                mlp_log.mlperf_print('run_stop',
                                     None,
                                     metadata={'status': 'abort'})

        log_eval_result_thread = threading.Thread(target=log_eval_results_fn)
        log_eval_result_thread.start()

    def eval_init_fn(cur_step):
        """Executed before every eval."""
        steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.train_batch_size
        epoch = cur_step // steps_per_epoch
        mlp_log.mlperf_print(
            'block_start',
            None,
            metadata={
                'first_epoch_num': epoch,
                'epoch_count': FLAGS.iterations_per_loop // steps_per_epoch,
            })
        mlp_log.mlperf_print(
            'eval_start',
            None,
            metadata={
                'epoch_num': epoch + FLAGS.iterations_per_loop // steps_per_epoch,
            })

    def eval_finish_fn(cur_step, eval_output, _):
        steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.train_batch_size
        epoch = cur_step // steps_per_epoch
        mlp_log.mlperf_print(
            'block_stop',
            None,
            metadata={
                'first_epoch_num': epoch,
                'epoch_count': FLAGS.iterations_per_loop // steps_per_epoch,
            })
        if FLAGS.run_cocoeval:
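            # Hand detections to the async COCO-eval workers; the results are
            # consumed and logged by log_eval_result_thread.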
            q_in.put((cur_step, eval_output['detections']))

    runner.train_and_eval(eval_init_fn, eval_finish_fn)

    if FLAGS.run_cocoeval:
        for _ in processes:
            q_in.put((_STOP, None))

        for p in processes:
            try:
                p.join(timeout=10)
            except Exception:  # pylint: disable=broad-except
                pass

        q_out.put((_STOP, None))
        log_eval_result_thread.join()

        # Clear out all the queues to avoid deadlock.
        while not q_out.empty():
            q_out.get()
        while not q_in.empty():
            q_in.get()
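
predict_post_processing is likewise not shown. Below is a minimal sketch, assuming each input item is a (step, detections) pair, _STOP is the module-level sentinel used above, and coco_eval is a hypothetical helper returning a metrics dict that contains 'COCO/AP':

def predict_post_processing(q_in, q_out):
    """Worker loop: runs COCO eval on detections until the _STOP sentinel."""
    step, detections = q_in.get()
    while step != _STOP:
        q_out.put((step, coco_eval(detections)))  # coco_eval is hypothetical.
        step, detections = q_in.get()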