def mlperf_test_early_exit(iteration, iters_per_epoch, tester, model, distributed, min_bbox_map, min_segm_map):
    # Note: let iters / epoch == 10k; at iter 9999 we've finished epoch 0 and need to test
    if iteration > 0 and (iteration + 1) % iters_per_epoch == 0:
        epoch = iteration // iters_per_epoch

        print_mlperf(key=mlperf_log.EVAL_START, value=epoch)
        bbox_map, segm_map = test_and_exchange_map(tester, model, distributed)
        # necessary for correctness
        model.train()

        print_mlperf(key=mlperf_log.EVAL_TARGET, value={"BBOX": min_bbox_map, "SEGM": min_segm_map})
        logger = logging.getLogger('maskrcnn_benchmark.trainer')
        logger.info('bbox mAP: {}, segm mAP: {}'.format(bbox_map, segm_map))
        print_mlperf(key=mlperf_log.EVAL_ACCURACY,
                     value={"epoch": epoch, "value": {"BBOX": bbox_map, "SEGM": segm_map}})
        print_mlperf(key=mlperf_log.EVAL_STOP)

        # terminating condition
        if bbox_map >= min_bbox_map and segm_map >= min_segm_map:
            logger.info("Target mAP reached, exiting...")
            print_mlperf(key=mlperf_log.RUN_STOP, value={"success": True})
            return True

        # At this point we will start the next epoch, so note this in the log
        # print_mlperf(key=mlperf_log.TRAIN_EPOCH, value=epoch + 1)

    return False
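
# Illustrative sketch (not part of the original code) of how a training loop is expected to
# drive the per-iteration callbacks above. The loop body and the _example_ name are
# assumptions; the real loop lives in maskrcnn_benchmark/engine/trainer.py and receives the
# callbacks via the functools.partial wiring shown in train() below.
def _example_training_loop(max_iter, per_iter_start_callback_fn, per_iter_end_callback_fn):
    for iteration in range(max_iter):
        # logs TRAIN_LOOP / TRAIN_EPOCH markers at epoch boundaries
        per_iter_start_callback_fn(iteration=iteration)
        # ... forward pass, loss, backward pass and optimizer step would go here ...
        if per_iter_end_callback_fn is not None:
            # returns True once both mAP targets are met, signalling early exit
            if per_iter_end_callback_fn(iteration=iteration):
                break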
def make_data_loader(cfg, is_train=True, is_distributed=False):
    if is_train:
        images_per_batch = cfg.DATALOADER.IMAGES_PER_BATCH_TRAIN
        print_mlperf(key=mlperf_log.INPUT_ORDER)
        shuffle = True
    else:
        images_per_batch = cfg.DATALOADER.IMAGES_PER_BATCH_TEST
        shuffle = False if not is_distributed else True

    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    dataset = make_coco_dataset(cfg, is_train)
    sampler = make_data_sampler(dataset, shuffle, is_distributed)
    batch_sampler = make_batch_data_sampler(dataset, sampler, aspect_grouping, images_per_batch)
    collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
    num_workers = cfg.DATALOADER.NUM_WORKERS
    data_loader = torch.utils.data.DataLoader(
        dataset,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        collate_fn=collator,
        pin_memory=True,
    )
    return data_loader
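
# Minimal usage sketch (not in the original file). It assumes cfg has already been merged and
# frozen as in main() below, and that BatchCollator yields (images, targets, image_ids)
# batches, which is how maskrcnn_benchmark's collator is normally consumed.
#
#     train_loader = make_data_loader(cfg, is_train=True, is_distributed=args.distributed)
#     for images, targets, image_ids in train_loader:
#         ...  # forward/backward pass on one batch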
def mlperf_log_epoch_start(iteration, iters_per_epoch):
    # First iteration:
    #   Note we've started training & tag first epoch start
    if iteration == 0:
        print_mlperf(key=mlperf_log.TRAIN_LOOP)
        print_mlperf(key=mlperf_log.TRAIN_EPOCH, value=0)
        return
    if iteration % iters_per_epoch == 0:
        epoch = iteration // iters_per_epoch
        print_mlperf(key=mlperf_log.TRAIN_EPOCH, value=epoch)
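
# Worked example of the epoch-boundary arithmetic used by the two helpers above, following
# the "iters / epoch == 10k" note in mlperf_test_early_exit:
#   iteration 0     -> mlperf_log_epoch_start logs TRAIN_LOOP and TRAIN_EPOCH 0
#   iteration 9999  -> (9999 + 1) % 10000 == 0, so mlperf_test_early_exit runs the epoch-0 eval
#                      (reported as epoch 9999 // 10000 == 0)
#   iteration 10000 -> 10000 % 10000 == 0, so mlperf_log_epoch_start logs TRAIN_EPOCH 1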
def main():
    mlperf_log.ROOT_DIR_MASKRCNN = os.path.dirname(os.path.abspath(__file__))

    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if is_main_process():
        # Setting logging file parameters for compliance logging
        os.environ["COMPLIANCE_FILE"] = './MASKRCNN_complVv0.5.0_' + str(datetime.datetime.now())
        mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
        mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
        mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
        mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()

        print_mlperf(key=mlperf_log.RUN_START)

        # setting seeds - needs to be timed, so after RUN_START
        if is_main_process():
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)
            seed_tensor = torch.tensor(master_seed, dtype=torch.float32, device=torch.device("cuda"))
        else:
            seed_tensor = torch.tensor(0, dtype=torch.float32, device=torch.device("cuda"))

        torch.distributed.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:
        print_mlperf(key=mlperf_log.RUN_START)
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    # actually use the random seed
    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    print_mlperf(key=mlperf_log.RUN_SET_RANDOM_SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(
        random_number_generator,
        torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1)

    # todo sharath what if CPU
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args.local_rank, args.distributed)

    print_mlperf(key=mlperf_log.RUN_FINAL)
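
# generate_seeds() and broadcast_seeds() are imported from elsewhere in the repository; the
# sketches below only illustrate what they plausibly do and are assumptions, not the
# repository's actual implementation (hence the _example_ prefix).
def _example_generate_seeds(rng, size):
    # draw one 32-bit seed per distributed worker from the master random.Random instance
    return [rng.randint(0, 2**32 - 1) for _ in range(size)]

def _example_broadcast_seeds(seeds, device):
    # rank 0 broadcasts its seed list so every worker ends up with identical per-worker seeds
    if torch.distributed.is_initialized():
        seeds_tensor = torch.tensor(seeds, dtype=torch.int64, device=device)
        torch.distributed.broadcast(seeds_tensor, 0)
        seeds = seeds_tensor.tolist()
    return seeds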
def train(cfg, local_rank, distributed):
    # Model logging
    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST, value=cfg.TEST.IMS_PER_BATCH)

    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION, value=cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD, value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD, value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD, value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS, value=cfg.MODEL.RPN.ASPECT_RATIOS)
    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)
    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY, value=cfg.SOLVER.WEIGHT_DECAY)

    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=distributed,
            min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
            min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    else:
        per_iter_callback_fn = None

    start_train_time = time.time()

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        per_iter_start_callback_fn=functools.partial(mlperf_log_epoch_start, iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print("&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format(
        (arguments["iteration"] * 1.0) / total_training_time))

    return model
def train(cfg, random_number_generator, local_rank, distributed, args, fp16=False):

    data_loader = make_data_loader(cfg, is_train=True, is_distributed=distributed)

    # todo sharath - uncomment log below after package is updated
    # print_mlperf(key=mlperf_log.INPUT_SIZE, value=len(data_loader.dataset))

    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE, value=cfg.DATALOADER.IMAGES_PER_BATCH_TRAIN)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST, value=cfg.DATALOADER.IMAGES_PER_BATCH_TEST)

    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION, value=cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD, value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD, value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD, value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS, value=cfg.MODEL.RPN.ASPECT_RATIOS)
    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)
    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)

    model = build_detection_model(cfg)
    load_from_pretrained_checkpoint(cfg, model)

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY, value=cfg.SOLVER.WEIGHT_DECAY)

    scheduler = make_lr_scheduler(cfg, optimizer)
    max_iter = cfg.SOLVER.MAX_ITER

    if use_apex_amp:
        amp_handle = amp.init(enabled=fp16, verbose=False)
        if cfg.SOLVER.ACCUMULATE_GRAD:
            # also specify number of steps to accumulate over
            optimizer = amp_handle.wrap_optimizer(optimizer, num_loss=cfg.SOLVER.ACCUMULATE_STEPS)
        else:
            optimizer = amp_handle.wrap_optimizer(optimizer)

    if distributed:
        if use_apex_ddp:
            model = DDP(model, delay_allreduce=True)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank)

    arguments = {}
    arguments["iteration"] = 0
    arguments["use_amp"] = use_apex_amp

    output_dir = cfg.OUTPUT_DIR

    if cfg.SAVE_CHECKPOINTS:
        checkpoint_file = cfg.CHECKPOINT
        checkpointer = Checkpoint(model, optimizer, scheduler, output_dir, local_rank)
        if checkpoint_file:
            extra_checkpoint_data = checkpointer.load(checkpoint_file)
            arguments.update(extra_checkpoint_data)
    else:
        checkpointer = None

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        max_iter,
        device,
        distributed,
        arguments,
        cfg,
        args,
        random_number_generator,
    )

    return model
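
# The module-level names use_apex_amp, use_apex_ddp, amp and DDP consumed above are not
# defined in this excerpt. A plausible (assumed) definition guards the optional apex imports;
# this is a sketch, not the file's actual import block.
try:
    from apex import amp
    from apex.parallel import DistributedDataParallel as DDP
    use_apex_amp = True
    use_apex_ddp = True
except ImportError:
    use_apex_amp = False
    use_apex_ddp = False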
def main():
    mlperf_log.ROOT_DIR_MASKRCNN = os.path.dirname(os.path.abspath(__file__))
    # mlperf_log.LOGGER.propagate = False

    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="/private/home/fmassa/github/detectron.pytorch/configs/rpn_r50.py",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument('--skip-test',
                        dest='skip_test',
                        help='Do not test the model',
                        action='store_true')
    parser.add_argument("--fp16",
                        action="store_true",
                        help="Enable mixed-precision training")
    parser.add_argument("--min_bbox_map",
                        type=float,
                        default=0.377,
                        help="Target BBOX MAP")
    parser.add_argument("--min_mask_map",
                        type=float,
                        default=0.339,
                        help="Target SEGM/MASK MAP")
    parser.add_argument("--seed", type=int, default=1, help="Seed")
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    args.distributed = (int(os.environ["WORLD_SIZE"]) > 1
                        if "WORLD_SIZE" in os.environ else False)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")

        # to synchronize start of time
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()

        if torch.distributed.get_rank() == 0:
            # Setting logging file parameters for compliance logging
            os.environ["COMPLIANCE_FILE"] = '/MASKRCNN_complVv0.5.0_' + str(datetime.datetime.now())
            mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
            mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
            mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
            mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

        print_mlperf(key=mlperf_log.RUN_START)

        # Setting seed
        seed_tensor = torch.tensor(0, dtype=torch.float32, device=torch.device("cuda"))

        if torch.distributed.get_rank() == 0:
            # seed = int(time.time())
            # random master seed, random.SystemRandom() uses /dev/urandom on Unix
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)
            seed_tensor = torch.tensor(master_seed, dtype=torch.float32, device=torch.device("cuda"))

        torch.distributed.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:
        # Setting logging file parameters for compliance logging
        os.environ["COMPLIANCE_FILE"] = '/MASKRCNN_complVv0.5.0_' + str(datetime.datetime.now())
        mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
        mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
        mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
        mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

        print_mlperf(key=mlperf_log.RUN_START)

        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    print_mlperf(key=mlperf_log.RUN_SET_RANDOM_SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    if args.skip_test:
        cfg.DO_ONLINE_MAP_EVAL = False

    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    # logger = setup_logger("maskrcnn_benchmark", output_dir, args.local_rank)
    logger = setup_logger("maskrcnn_benchmark", None, args.local_rank)
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(
        random_number_generator,
        torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1)

    # todo sharath what if CPU
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, random_number_generator, args.local_rank, args.distributed, args, args.fp16)

    print_mlperf(key=mlperf_log.RUN_FINAL)
def train(cfg, local_rank, distributed):
    # Model logging
    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST, value=cfg.TEST.IMS_PER_BATCH)

    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION, value=cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD, value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD, value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD, value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS, value=cfg.MODEL.RPN.ASPECT_RATIOS)
    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)
    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)

    # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/modeling/detector/detectors.py
    # builds the bare model without doing anything else
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY, value=cfg.SOLVER.WEIGHT_DECAY)

    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    print("output_dir " + str(output_dir))

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    # SSY: this config has no SAVE_CHECKPOINTS key
    # arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS
    arguments["save_checkpoints"] = False

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"]
    )
    print("SSY iters_per_epoch " + str(iters_per_epoch))
    # print("SSY iters_per_epoch change to 100 ")
    # iters_per_epoch = 100

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    # SSY: PER_EPOCH_EVAL, MIN_BBOX_MAP and MIN_SEGM_MAP have been added to
    # ./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml, but the config still cannot find them,
    # so the values are set manually here.
    # if cfg.PER_EPOCH_EVAL:
    #     per_iter_callback_fn = functools.partial(
    #         mlperf_test_early_exit,
    #         iters_per_epoch=iters_per_epoch,
    #         tester=functools.partial(test, cfg=cfg),
    #         model=model,
    #         distributed=distributed,
    #         min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
    #         min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    # else:
    #     per_iter_callback_fn = None
    per_iter_callback_fn = functools.partial(
        mlperf_test_early_exit,
        iters_per_epoch=iters_per_epoch,
        # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/engine/tester.py
        tester=functools.partial(test, cfg=cfg),
        model=model,
        distributed=distributed,
        min_bbox_map=0.377,
        min_segm_map=0.339)

    start_train_time = time.time()

    # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/engine/trainer.py
    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        per_iter_start_callback_fn=functools.partial(mlperf_log_epoch_start, iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print(
        "&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format(
            (arguments["iteration"] * 1.0) / total_training_time
        )
    )

    return model
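
# The SSY note above says PER_EPOCH_EVAL, MIN_BBOX_MAP and MIN_SEGM_MAP were added to the
# YAML config but not picked up. A hypothetical fragment of that config (key names taken
# from this file, values matching the hard-coded MLPerf targets) might look like:
#
#     PER_EPOCH_EVAL: True
#     MLPERF:
#       MIN_BBOX_MAP: 0.377
#       MIN_SEGM_MAP: 0.339
#
# For such keys to be readable they would also have to be registered as defaults in the
# project's config schema before cfg.merge_from_file() is called.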
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    max_iter,
    device,
    use_distributed,
    arguments,
    config,
    args,
    random_number_generator,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")

    start_training_time = time.time()

    print_mlperf(key=mlperf_log.TRAIN_LOOP)
    epoch = 0
    while arguments["iteration"] < max_iter:
        print_mlperf(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        start_epoch_time = time.time()
        iteration = arguments["iteration"]

        if use_distributed:
            # Using random number generator with master seed to generate random seeds for every epoch
            iteration_seed = random_number_generator.randint(0, 2**32 - 1)
            data_loader.batch_sampler.sampler.set_epoch(iteration_seed)

        iteration_end = train_one_epoch(
            model, data_loader, optimizer, scheduler, device, iteration, max_iter,
            config, use_distributed, use_amp=arguments["use_amp"]
        )

        total_epoch_time = time.time() - start_epoch_time
        epoch_time_str = str(datetime.timedelta(seconds=total_epoch_time))
        logger.info(
            "Total epoch time: {} ({:.4f} s / it)".format(
                epoch_time_str, total_epoch_time / (iteration_end - iteration)
            )
        )

        arguments["iteration"] = iteration_end

        if checkpointer:
            checkpointer("model_{}".format(arguments["iteration"]), **arguments)

        if config.DO_ONLINE_MAP_EVAL:
            print_mlperf(key=mlperf_log.EVAL_START, value=epoch)

            results = test(config, model, use_distributed)

            print_mlperf(key=mlperf_log.EVAL_TARGET, value={"BBOX": 0.377, "SEGM": 0.339})

            map_tensor = torch.tensor((0, 0), dtype=torch.float32, device=torch.device("cuda"))
            if results:  # rank 0 process
                bbox_map = results['bbox']
                mask_map = results['segm']
                map_tensor = torch.tensor((bbox_map, mask_map), dtype=torch.float32,
                                          device=torch.device("cuda"))

            if use_distributed:
                torch.distributed.broadcast(map_tensor, 0)
                bbox_map = map_tensor[0].item()
                mask_map = map_tensor[1].item()

            logger.info("bbox map: {} mask map: {}".format(bbox_map, mask_map))

            print_mlperf(key=mlperf_log.EVAL_ACCURACY,
                         value={"epoch": epoch, "value": {"BBOX": bbox_map, "SEGM": mask_map}})
            print_mlperf(key=mlperf_log.EVAL_STOP)

            # Terminating condition
            if bbox_map >= args.min_bbox_map and mask_map >= args.min_mask_map:
                logger.info("Target MAP reached. Exiting...")
                print_mlperf(key=mlperf_log.RUN_STOP, value={"success": True})
                break

        epoch += 1

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (arguments["iteration"])
        )
    )
    logger.info(
        "&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format(
            (arguments["iteration"] * 1.0) / total_training_time
        )
    )
def inference(
    model,
    data_loader,
    iou_types=("bbox",),
    box_only=False,
    device="cuda",
    expected_results=(),
    expected_results_sigma_tol=4,
):
    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = (torch.distributed.get_world_size()
                   if torch.distributed.is_initialized() else 1)
    logger = logging.getLogger("maskrcnn_benchmark.inference")
    dataset = data_loader.dataset

    print_mlperf(key=mlperf_log.EVAL_SIZE, value=len(dataset))

    logger.info("Start evaluation on {} images".format(len(dataset)))
    start_time = time.time()

    predictions = compute_on_dataset(model, data_loader, device)

    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Total inference time: {} ({} s / img per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices))

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not _is_main_process():
        return

    for p in predictions:
        p.add_field('mask1', p.get_field('mask').float())

    if box_only:
        logger.info("Evaluating bbox proposals")
        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
        res = COCOResults("box_proposal")
        for limit in [100, 1000]:
            for area, suffix in areas.items():
                stats = evaluate_box_proposals(predictions, dataset, area=area, limit=limit)
                key = "AR{}@{:d}".format(suffix, limit)
                res.results["box_proposal"][key] = stats["ar"].item()
        # logger.info("AR@1000: {}".format(results["ar"].item()))
        logger.info(res)
        check_expected_results(res, expected_results, expected_results_sigma_tol)
        # return the result
        return res

    logger.info("Preparing results for COCO format")
    coco_results = {}
    if "bbox" in iou_types:
        logger.info("Preparing bbox results")
        coco_results["bbox"] = prepare_for_coco_detection(predictions, dataset)
    if "segm" in iou_types:
        logger.info("Preparing segm results")
        num_proc = 20
        # split the predictions into num_proc chunks; this assumes len(predictions) is
        # divisible by num_proc (true for the 5000-image COCO val set), otherwise the
        # trailing remainder would be dropped
        local_len = len(predictions) // num_proc
        chunks = []
        for i in range(num_proc):
            chunks.append((predictions[i * local_len:(i + 1) * local_len], dataset, i))

        def init(env):
            os.environ = env

        with Pool(num_proc, initializer=init, initargs=(os.environ.copy(),)) as p:
            t = p.starmap(prepare_for_coco_segmentation, chunks)
        tmp = []
        for i in range(num_proc):
            tmp += t[i]
        coco_results["segm"] = tmp
        # coco_results["segm"] = prepare_for_coco_segmentation(predictions, dataset)

    results = COCOResults(*iou_types)
    logger.info("Evaluating predictions")

    # Extracting only required metrics
    map_required = {}

    for iou_type in iou_types:
        with tempfile.NamedTemporaryFile() as f:
            [res, map] = evaluate_predictions_on_coco(dataset.coco, coco_results[iou_type], f.name, iou_type)
            map_required[iou_type] = map
            results.update(res)
    check_expected_results(res, expected_results, expected_results_sigma_tol)

    # returning results
    return map_required
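
# Worked example of the chunking above: with the 5,000-image COCO val set and num_proc == 20,
# local_len == 250, so worker i processes predictions[250 * i : 250 * (i + 1)]. If
# len(predictions) were not divisible by num_proc, the trailing remainder would be dropped,
# which is why the divisibility assumption is noted in the loop.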