def eval_multiprocessing(predictions,
                         eval_metric,
                         eval_worker_count,
                         queue_size=mask_rcnn_params.QUEUE_SIZE):
  """Enables multiprocessing to update eval metrics."""
  # copybara:strip_begin
  q_in, q_out = REDACTEDprocess.get_user_data()
  processes = [
      REDACTEDprocess.Process(target=REDACTED_post_processing)
      for _ in range(eval_worker_count)
  ]
  # copybara:strip_end_and_replace_begin
  # q_in = multiprocessing.Queue(maxsize=queue_size)
  # q_out = multiprocessing.Queue(maxsize=queue_size)
  # processes = [
  #     multiprocessing.Process(target=post_processing, args=(q_in, q_out))
  #     for _ in range(eval_worker_count)
  # ]
  # copybara:replace_end
  for p in processes:
    p.start()

  # TODO(b/129410706): investigate whether threading improves speed.
  # Every predictor.next() gets a batch of predictions (a dictionary).
  exited_process = 0
  samples = len(predictions['detections']) // eval_worker_count
  for i in range(eval_worker_count):
    # Drain q_out while the input queue is congested, so workers never
    # block on a full output queue.
    while q_in.full() or q_out.qsize() > queue_size // 4:
      exited_process = update_eval_metric(q_out, eval_metric, exited_process)
    q_in.put((predictions['detections'][i * samples:(i + 1) * samples],
              predictions['mask_outputs'][i * samples:(i + 1) * samples],
              predictions['image_info'][i * samples:(i + 1) * samples]))

  # Adds empty items to signal the children to quit.
  for _ in processes:
    q_in.put((None, None, None))

  # Cleans up q_out and waits for all the processes to finish work.
  while not q_out.empty() or exited_process < eval_worker_count:
    exited_process = update_eval_metric(q_out, eval_metric, exited_process)

  for p in processes:
    # Actively terminate all processes (to work around the multiprocessing
    # deadlock issue in Cloud).
    # copybara:insert p.terminate()
    p.join()
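

# `update_eval_metric` is defined elsewhere in this file. As a rough guide
# to what the draining loop above relies on, here is a minimal hypothetical
# sketch; `eval_metric.update` is an assumed accumulation API, not the
# module's confirmed one. Each q_out item is either a processed result or a
# `None` exit marker posted by a worker that consumed its quit sentinel.
#
# def update_eval_metric(q_out, eval_metric, exited_process):
#   result = q_out.get()
#   if result is None:
#     # A worker saw the (None, None, None) sentinel and exited.
#     return exited_process + 1
#   eval_metric.update(result)  # assumed metric-accumulation API
#   return exited_process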
    mlp_log.mlperf_print(
        'eval_accuracy', {
            'BBOX': float(eval_results['AP']),
            'SEGM': float(eval_results['mask_AP'])
        },
        metadata={'epoch_num': cur_epoch + 1})
    mlp_log.mlperf_print(
        'eval_stop', None, metadata={'epoch_num': cur_epoch + 1})
    if (eval_results['AP'] >= mask_rcnn_params.BOX_EVAL_TARGET and
        eval_results['mask_AP'] >= mask_rcnn_params.MASK_EVAL_TARGET):
      mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
      return True
    return False

  def run_finish_fn(success):
    if not success:
      mlp_log.mlperf_print('run_stop', None, metadata={'status': 'abort'})

  runner.train_and_eval(eval_init_fn, eval_finish_fn, run_finish_fn)


if __name__ == '__main__':
  # copybara:strip_begin
  user_data = (multiprocessing.Queue(maxsize=mask_rcnn_params.QUEUE_SIZE),
               multiprocessing.Queue(maxsize=mask_rcnn_params.QUEUE_SIZE))
  with REDACTEDprocess.main_handler(user_data=user_data):
    tf.logging.set_verbosity(tf.logging.INFO)
    app.run(main)
  # copybara:strip_end
  # copybara:insert tf.logging.set_verbosity(tf.logging.INFO)
  # copybara:insert app.run(main)
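

# For reference, once copybara strips the internal block above and applies
# the `copybara:insert` lines, the open-source entry point reduces to:
#
# if __name__ == '__main__':
#   tf.logging.set_verbosity(tf.logging.INFO)
#   app.run(main)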
def REDACTED_post_processing():
  """REDACTED batch-processes the predictions."""
  q_in, q_out = REDACTEDprocess.get_user_data()
  post_processing(q_in, q_out)
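

# `post_processing` is defined elsewhere in this file; both the internal
# and open-source worker targets funnel into it. A minimal hypothetical
# sketch of its contract, with `compute_coco_results` as an assumed
# stand-in for the module's real COCO evaluation helper: loop until the
# (None, None, None) quit sentinel arrives, then post a final `None` so
# `update_eval_metric` can count the exit.
#
# def post_processing(q_in, q_out):
#   detections, mask_outputs, image_info = q_in.get()
#   while detections is not None:
#     q_out.put(compute_coco_results(detections, mask_outputs, image_info))
#     detections, mask_outputs, image_info = q_in.get()
#   q_out.put(None)  # exit marker counted by update_eval_metric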
def main(argv):
  del argv  # Unused.

  params = construct_run_config(FLAGS.iterations_per_loop)
  mlp_log.mlperf_print(key='cache_clear', value=True)
  mlp_log.mlperf_print(key='init_start', value=None)
  mlp_log.mlperf_print('global_batch_size', FLAGS.train_batch_size)
  mlp_log.mlperf_print('opt_base_learning_rate', params['base_learning_rate'])
  mlp_log.mlperf_print(
      'opt_learning_rate_decay_boundary_epochs',
      [params['first_lr_drop_epoch'], params['second_lr_drop_epoch']])
  mlp_log.mlperf_print('opt_weight_decay', params['weight_decay'])
  mlp_log.mlperf_print(
      'model_bn_span', FLAGS.train_batch_size // FLAGS.num_shards *
      params['distributed_group_size'])
  mlp_log.mlperf_print('max_samples', ssd_constants.NUM_CROP_PASSES)
  mlp_log.mlperf_print('train_samples', FLAGS.num_examples_per_epoch)
  mlp_log.mlperf_print('eval_samples', FLAGS.eval_samples)

  params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
  input_partition_dims = FLAGS.input_partition_dims
  train_steps = (
      FLAGS.num_epochs * FLAGS.num_examples_per_epoch //
      FLAGS.train_batch_size)
  eval_steps = int(math.ceil(FLAGS.eval_samples / FLAGS.eval_batch_size))
  runner = train_and_eval_runner.TrainAndEvalRunner(FLAGS.iterations_per_loop,
                                                    train_steps, eval_steps,
                                                    FLAGS.num_shards)

  train_input_fn = dataloader.SSDInputReader(
      FLAGS.training_file_pattern,
      params['transpose_input'],
      is_training=True,
      use_fake_data=FLAGS.use_fake_data,
      params=params)
  eval_input_fn = dataloader.SSDInputReader(
      FLAGS.validation_file_pattern,
      is_training=False,
      use_fake_data=FLAGS.use_fake_data,
      distributed_eval=True,
      count=eval_steps * FLAGS.eval_batch_size,
      params=params)

  def init_fn():
    tf.train.init_from_checkpoint(params['resnet_checkpoint'], {
        'resnet/': 'resnet%s/' % ssd_constants.RESNET_DEPTH,
    })

  runner.initialize(train_input_fn, eval_input_fn,
                    functools.partial(ssd_model.ssd_model_fn, params),
                    FLAGS.train_batch_size, FLAGS.eval_batch_size,
                    input_partition_dims, init_fn)
  mlp_log.mlperf_print('init_stop', None)
  mlp_log.mlperf_print('run_start', None)

  if FLAGS.run_cocoeval:
    # copybara:strip_begin
    q_in, q_out = REDACTEDprocess.get_user_data()
    processes = [
        REDACTEDprocess.Process(target=REDACTED_predict_post_processing)
        for _ in range(4)
    ]
    # copybara:strip_end_and_replace_begin
    # q_in = multiprocessing.Queue(maxsize=ssd_constants.QUEUE_SIZE)
    # q_out = multiprocessing.Queue(maxsize=ssd_constants.QUEUE_SIZE)
    # processes = [
    #     multiprocessing.Process(
    #         target=predict_post_processing, args=(q_in, q_out))
    #     for _ in range(4)
    # ]
    # copybara:replace_end
    for p in processes:
      p.start()

    def log_eval_results_fn():
      """Print out MLPerf log."""
      result = q_out.get()
      success = False
      while result[0] != _STOP:
        if not success:
          steps_per_epoch = (
              FLAGS.num_examples_per_epoch // FLAGS.train_batch_size)
          epoch = (result[0] + FLAGS.iterations_per_loop) // steps_per_epoch
          mlp_log.mlperf_print(
              'eval_accuracy',
              result[1]['COCO/AP'],
              metadata={'epoch_num': epoch})
          mlp_log.mlperf_print(
              'eval_stop', None, metadata={'epoch_num': epoch})
          if result[1]['COCO/AP'] > ssd_constants.EVAL_TARGET:
            success = True
            mlp_log.mlperf_print(
                'run_stop', None, metadata={'status': 'success'})
        result = q_out.get()
      if not success:
        mlp_log.mlperf_print('run_stop', None, metadata={'status': 'abort'})

    log_eval_result_thread = threading.Thread(target=log_eval_results_fn)
    log_eval_result_thread.start()

  def eval_init_fn(cur_step):
    """Executed before every eval."""
    steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.train_batch_size
    epoch = cur_step // steps_per_epoch
    mlp_log.mlperf_print(
        'block_start',
        None,
        metadata={
            'first_epoch_num': epoch,
            'epoch_count': FLAGS.iterations_per_loop // steps_per_epoch
        })
    mlp_log.mlperf_print(
        'eval_start',
        None,
        metadata={
            'epoch_num': epoch + FLAGS.iterations_per_loop // steps_per_epoch
        })

  def eval_finish_fn(cur_step, eval_output, _):
    steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.train_batch_size
    epoch = cur_step // steps_per_epoch
    mlp_log.mlperf_print(
        'block_stop',
        None,
        metadata={
            'first_epoch_num': epoch,
            'epoch_count': FLAGS.iterations_per_loop // steps_per_epoch
        })
    if FLAGS.run_cocoeval:
      q_in.put((cur_step, eval_output['detections']))

  runner.train_and_eval(eval_init_fn, eval_finish_fn)

  if FLAGS.run_cocoeval:
    for _ in processes:
      q_in.put((_STOP, None))
    for p in processes:
      try:
        p.join(timeout=10)
      except Exception:  # pylint: disable=broad-except
        pass
    q_out.put((_STOP, None))
    log_eval_result_thread.join()

    # Clear out all the queues to avoid deadlock.
    while not q_out.empty():
      q_out.get()
    while not q_in.empty():
      q_in.get()
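

# `predict_post_processing` is defined elsewhere in this file, and its
# contract is visible above: eval_finish_fn enqueues (cur_step, detections)
# pairs on q_in, and log_eval_results_fn expects (step, metrics_dict) pairs
# with a 'COCO/AP' key on q_out. A minimal hypothetical sketch, with
# `compute_coco_metrics` as an assumed stand-in for the module's real COCO
# evaluation helper:
#
# def predict_post_processing(q_in, q_out):
#   current_step, detections = q_in.get()
#   while current_step != _STOP:
#     q_out.put((current_step, compute_coco_metrics(detections)))
#     current_step, detections = q_in.get()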