Example #1
def mlperf_test_early_exit(iteration, iters_per_epoch, tester, model, distributed, min_bbox_map, min_segm_map):
    # Example: with iters_per_epoch == 10000, iteration 9999 completes epoch 0, so evaluate here
    if iteration > 0 and (iteration + 1) % iters_per_epoch == 0:
        epoch = iteration // iters_per_epoch

        print_mlperf(key=mlperf_log.EVAL_START, value=epoch)
        #print("tester "+str(tester))
        #print("model "+str(model))

        bbox_map, segm_map = test_and_exchange_map(tester, model, distributed)
        # switch back to training mode after evaluation; necessary for correctness
        model.train()

        print_mlperf(key=mlperf_log.EVAL_TARGET, value={"BBOX": min_bbox_map,
                                                        "SEGM": min_segm_map})
        logger = logging.getLogger('maskrcnn_benchmark.trainer')
        logger.info('bbox mAP: {}, segm mAP: {}'.format(bbox_map, segm_map))

        print_mlperf(key=mlperf_log.EVAL_ACCURACY, value={"epoch" : epoch, "value":{"BBOX" : bbox_map, "SEGM" : segm_map}})
        print_mlperf(key=mlperf_log.EVAL_STOP)

        # terminating condition
        if bbox_map >= min_bbox_map and segm_map >= min_segm_map:
            logger.info("Target mAP reached, exiting...")
            print_mlperf(key=mlperf_log.RUN_STOP, value={"success":True})
            return True

        # At this point will start the next epoch, so note this in the log
        # print_mlperf(key=mlperf_log.TRAIN_EPOCH, value=epoch+1)
    return False
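
Example #1 delegates the evaluation itself to a test_and_exchange_map helper that is not shown here. Below is a minimal sketch of what such a helper could look like, modeled on the mAP-broadcast pattern in Example #9; the body (including the assumption that the tester accepts model and distributed keyword arguments) is an illustration, not the repository's implementation.

import torch


def test_and_exchange_map(tester, model, distributed):
    # Run evaluation; typically only the main process gets a results dict back.
    results = tester(model=model, distributed=distributed)

    # Placeholder so non-main ranks can participate in the broadcast.
    map_tensor = torch.zeros(2, dtype=torch.float32, device="cuda")
    if results:  # rank 0 process
        map_tensor = torch.tensor([results["bbox"], results["segm"]],
                                  dtype=torch.float32, device="cuda")

    if distributed:
        torch.distributed.broadcast(map_tensor, 0)

    return map_tensor[0].item(), map_tensor[1].item()
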
Example #2
def make_data_loader(cfg, is_train=True, is_distributed=False):
    if is_train:
        images_per_batch = cfg.DATALOADER.IMAGES_PER_BATCH_TRAIN
        print_mlperf(key=mlperf_log.INPUT_ORDER)
        shuffle = True
    else:
        images_per_batch = cfg.DATALOADER.IMAGES_PER_BATCH_TEST
        shuffle = is_distributed

    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    dataset = make_coco_dataset(cfg, is_train)
    sampler = make_data_sampler(dataset, shuffle, is_distributed)
    batch_sampler = make_batch_data_sampler(dataset, sampler, aspect_grouping,
                                            images_per_batch)
    collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
    num_workers = cfg.DATALOADER.NUM_WORKERS
    data_loader = torch.utils.data.DataLoader(dataset,
                                              num_workers=num_workers,
                                              batch_sampler=batch_sampler,
                                              collate_fn=collator,
                                              pin_memory=True)
    return data_loader
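
make_data_loader above depends on a make_data_sampler factory. A minimal sketch of one way such a factory can be written with torch's built-in samplers follows; this is an assumption for illustration, not the code used in this repository.

import torch.utils.data


def make_data_sampler(dataset, shuffle, distributed):
    # Multi-GPU: DistributedSampler gives each rank its own shard of the data
    # (it reshuffles per epoch via set_epoch by default).
    if distributed:
        return torch.utils.data.distributed.DistributedSampler(dataset)
    # Single process: plain random or sequential sampling.
    if shuffle:
        return torch.utils.data.RandomSampler(dataset)
    return torch.utils.data.SequentialSampler(dataset)
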
def mlperf_log_epoch_start(iteration, iters_per_epoch):
    # First iteration:
    #     Note we've started training & tag first epoch start
    if iteration == 0:
        print_mlperf(key=mlperf_log.TRAIN_LOOP)
        print_mlperf(key=mlperf_log.TRAIN_EPOCH, value=0)
        return
    if iteration % iters_per_epoch == 0:
        epoch = iteration // iters_per_epoch
        print_mlperf(key=mlperf_log.TRAIN_EPOCH, value=epoch)
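
A quick illustration of the boundaries this helper tags, using a hypothetical iters_per_epoch of 3:

# With iters_per_epoch = 3:
#   iteration 0 -> TRAIN_LOOP and TRAIN_EPOCH(0)
#   iteration 3 -> TRAIN_EPOCH(1)
#   iteration 6 -> TRAIN_EPOCH(2)
for it in range(7):
    mlperf_log_epoch_start(it, iters_per_epoch=3)
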
def main():
    mlperf_log.ROOT_DIR_MASKRCNN = os.path.dirname(os.path.abspath(__file__))

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if is_main_process():
        # Setting logging file parameters for compliance logging
        os.environ["COMPLIANCE_FILE"] = './MASKRCNN_complVv0.5.0_' + str(
            datetime.datetime.now())
        mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
        mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
        mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
        mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

        print_mlperf(key=mlperf_log.RUN_START)

        # setting seeds - needs to be timed, so after RUN_START
        if is_main_process():
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)
            seed_tensor = torch.tensor(master_seed,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))
        else:
            seed_tensor = torch.tensor(0,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))

        torch.distributed.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:
        print_mlperf(key=mlperf_log.RUN_START)
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    # actually use the random seed
    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    print_mlperf(key=mlperf_log.RUN_SET_RANDOM_SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(
        random_number_generator,
        torch.distributed.get_world_size()
        if torch.distributed.is_initialized() else 1)

    # TODO(sharath): handle the CPU-only case (seeds are broadcast on CUDA below)
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args.local_rank, args.distributed)

    print_mlperf(key=mlperf_log.RUN_FINAL)
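
main() above calls generate_seeds and broadcast_seeds, which are defined elsewhere in the codebase. A hedged sketch of what that pair might look like, assuming one 32-bit seed per worker derived from the master seed and broadcast from rank 0:

import torch


def generate_seeds(rng, size):
    # One seed per distributed worker, all derived from the master seed.
    return [rng.randint(0, 2**32 - 1) for _ in range(size)]


def broadcast_seeds(seeds, device):
    # Rank 0's seed list wins: other ranks overwrite their local copy so that
    # every process ends up with the same per-worker seed table.
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        seeds_tensor = torch.tensor(seeds, dtype=torch.int64, device=device)
        torch.distributed.broadcast(seeds_tensor, 0)
        seeds = seeds_tensor.tolist()
    return seeds
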
def train(cfg, local_rank, distributed):
    # Model logging
    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE,
                 value=cfg.SOLVER.IMS_PER_BATCH)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST, value=cfg.TEST.IMS_PER_BATCH)

    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION,
                 value=cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD,
                 value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD,
                 value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD,
                 value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN,
                 value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST,
                 value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN,
                 value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST,
                 value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS,
                 value=cfg.MODEL.RPN.ASPECT_RATIOS)
    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)
    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY,
                 value=cfg.SOLVER.WEIGHT_DECAY)

    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=distributed,
            min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
            min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    else:
        per_iter_callback_fn = None

    start_train_time = time.time()

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        per_iter_start_callback_fn=functools.partial(
            mlperf_log_epoch_start, iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print("&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format(
        (arguments["iteration"] * 1.0) / total_training_time))

    return model
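
do_train itself is not shown in these examples. The simplified sketch below illustrates how the two callbacks built above are typically invoked once per iteration; the argument names and the batch layout (images, targets, image ids) are assumptions, and the real trainer also handles logging, loss reduction across ranks and checkpointing.

def do_train_sketch(model, data_loader, optimizer, scheduler, device,
                    per_iter_start_callback_fn=None,
                    per_iter_end_callback_fn=None):
    model.train()
    for iteration, (images, targets, _) in enumerate(data_loader):
        if per_iter_start_callback_fn is not None:
            per_iter_start_callback_fn(iteration=iteration)

        loss_dict = model(images.to(device),
                          [target.to(device) for target in targets])
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        scheduler.step()

        # mlperf_test_early_exit returns True once the target mAP is reached.
        if (per_iter_end_callback_fn is not None
                and per_iter_end_callback_fn(iteration=iteration)):
            break
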
Example #6
def train(cfg,
          random_number_generator,
          local_rank,
          distributed,
          args,
          fp16=False):

    data_loader = make_data_loader(cfg,
                                   is_train=True,
                                   is_distributed=distributed)

    # TODO(sharath): uncomment the log below after the package is updated
    # print_mlperf(key=mlperf_log.INPUT_SIZE, value=len(data_loader.dataset))

    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE,
                 value=cfg.DATALOADER.IMAGES_PER_BATCH_TRAIN)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST,
                 value=cfg.DATALOADER.IMAGES_PER_BATCH_TEST)

    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION,
                 value=cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD,
                 value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD,
                 value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD,
                 value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN,
                 value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST,
                 value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN,
                 value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST,
                 value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS,
                 value=cfg.MODEL.RPN.ASPECT_RATIOS)

    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)

    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)

    model = build_detection_model(cfg)
    load_from_pretrained_checkpoint(cfg, model)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY,
                 value=cfg.SOLVER.WEIGHT_DECAY)

    scheduler = make_lr_scheduler(cfg, optimizer)
    max_iter = cfg.SOLVER.MAX_ITER

    if use_apex_amp:
        amp_handle = amp.init(enabled=fp16, verbose=False)
        if cfg.SOLVER.ACCUMULATE_GRAD:
            # also specify number of steps to accumulate over
            optimizer = amp_handle.wrap_optimizer(
                optimizer, num_loss=cfg.SOLVER.ACCUMULATE_STEPS)
        else:
            optimizer = amp_handle.wrap_optimizer(optimizer)

    if distributed:
        if use_apex_ddp:
            model = DDP(model, delay_allreduce=True)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank)

    arguments = {}
    arguments["iteration"] = 0

    arguments["use_amp"] = use_apex_amp

    output_dir = cfg.OUTPUT_DIR

    if cfg.SAVE_CHECKPOINTS:
        checkpoint_file = cfg.CHECKPOINT
        checkpointer = Checkpoint(model, optimizer, scheduler, output_dir,
                                  local_rank)
        if checkpoint_file:
            extra_checkpoint_data = checkpointer.load(checkpoint_file)
            arguments.update(extra_checkpoint_data)
    else:
        checkpointer = None

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        max_iter,
        device,
        distributed,
        arguments,
        cfg,
        args,
        random_number_generator,
    )

    return model
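
This variant branches on use_apex_amp and use_apex_ddp, which are presumably module-level flags. One common way to set them (an assumption, not shown in the example) is to probe the NVIDIA Apex imports once at module load:

try:
    from apex import amp
    use_apex_amp = True
except ImportError:
    use_apex_amp = False

try:
    from apex.parallel import DistributedDataParallel as DDP
    use_apex_ddp = True
except ImportError:
    use_apex_ddp = False
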
Example #7
def main():

    mlperf_log.ROOT_DIR_MASKRCNN = os.path.dirname(os.path.abspath(__file__))
    # mlperf_log.LOGGER.propagate = False

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default=
        "/private/home/fmassa/github/detectron.pytorch/configs/rpn_r50.py",
        metavar="FILE",
        help="path to config file",
    )

    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument('--skip-test',
                        dest='skip_test',
                        help='Do not test the model',
                        action='store_true')
    parser.add_argument("--fp16",
                        action="store_true",
                        help="Enable mixed-precision training")

    parser.add_argument("--min_bbox_map",
                        type=float,
                        default=0.377,
                        help="Target BBOX MAP")

    parser.add_argument("--min_mask_map",
                        type=float,
                        default=0.339,
                        help="Target SEGM/MASK MAP")

    parser.add_argument("--seed", type=int, default=1, help="Seed")

    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    args.distributed = (int(os.environ["WORLD_SIZE"]) > 1
                        if "WORLD_SIZE" in os.environ else False)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")

        # synchronize the start time across ranks
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()

        if torch.distributed.get_rank() == 0:
            # Setting logging file parameters for compliance logging
            os.environ["COMPLIANCE_FILE"] = '/MASKRCNN_complVv0.5.0_' + str(
                datetime.datetime.now())
            mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
            mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
            mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
            mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

        print_mlperf(key=mlperf_log.RUN_START)

        # Setting seed
        seed_tensor = torch.tensor(0,
                                   dtype=torch.float32,
                                   device=torch.device("cuda"))

        if torch.distributed.get_rank() == 0:
            # seed = int(time.time())
            # random master seed, random.SystemRandom() uses /dev/urandom on Unix
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)

            seed_tensor = torch.tensor(master_seed,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))
        torch.distributed.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:

        # Setting logging file parameters for compliance logging
        os.environ["COMPLIANCE_FILE"] = '/MASKRCNN_complVv0.5.0_' + str(
            datetime.datetime.now())
        mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
        mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
        mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
        mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

        print_mlperf(key=mlperf_log.RUN_START)
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    print_mlperf(key=mlperf_log.RUN_SET_RANDOM_SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    if args.skip_test:
        cfg.DO_ONLINE_MAP_EVAL = False

    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    #logger = setup_logger("maskrcnn_benchmark", output_dir, args.local_rank)
    logger = setup_logger("maskrcnn_benchmark", None, args.local_rank)
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(
        random_number_generator,
        torch.distributed.get_world_size()
        if torch.distributed.is_initialized() else 1)

    # TODO(sharath): handle the CPU-only case (seeds are broadcast on CUDA below)
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, random_number_generator, args.local_rank,
                  args.distributed, args, args.fp16)
    print_mlperf(key=mlperf_log.RUN_FINAL)
Example #8
def train(cfg, local_rank, distributed):
    # Model logging
    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST, value=cfg.TEST.IMS_PER_BATCH)

    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION, value=cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD, value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD, value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD, value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS, value=cfg.MODEL.RPN.ASPECT_RATIOS)
    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)
    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)
    # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/modeling/detector/detectors.py
    # build the bare model without doing anything else
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY, value=cfg.SOLVER.WEIGHT_DECAY)


    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    print("output_dir "+str(output_dir))

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    # this cfg has no SAVE_CHECKPOINTS option, so checkpoint saving is hard-coded to False
    #arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS
    arguments["save_checkpoints"] = False

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"]
    )
    print("SSY iters_per_epoch "+str(iters_per_epoch))
    #print("SSY iters_per_epoch change to 100 ")
    #iters_per_epoch = 100

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    # SSY
    # PER_EPOCH_EVAL, MIN_BBOX_MAP and MIN_SEGM_MAP were already added to ./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml
    # but the config still does not pick them up, so they are set manually here
    #if cfg.PER_EPOCH_EVAL:
    #    per_iter_callback_fn = functools.partial(
    #            mlperf_test_early_exit,
    #            iters_per_epoch=iters_per_epoch,
    #            tester=functools.partial(test, cfg=cfg),
    #            model=model,
    #            distributed=distributed,
    #            min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
    #            min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    #else:
    #    per_iter_callback_fn = None
    per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/engine/tester.py
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=distributed,
            min_bbox_map=0.377,
            min_segm_map=0.339)

    start_train_time = time.time()
    # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/engine/trainer.py
    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        per_iter_start_callback_fn=functools.partial(mlperf_log_epoch_start, iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print(
            "&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format((arguments["iteration"] * 1.0) / total_training_time)
    )

    return model
Example #9
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    max_iter,
    device,
    use_distributed,
    arguments,
    config,
    args,
    random_number_generator,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    start_training_time = time.time()

    print_mlperf(key=mlperf_log.TRAIN_LOOP)

    epoch = 0
    while arguments["iteration"] < max_iter:

        print_mlperf(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        start_epoch_time = time.time()
        iteration = arguments["iteration"]
        if use_distributed:

            # use the RNG seeded with the master seed to draw a fresh shuffling seed for every epoch
            iteration_seed = random_number_generator.randint(0, 2**32 - 1)
            data_loader.batch_sampler.sampler.set_epoch(iteration_seed)

        iteration_end = train_one_epoch(
            model, data_loader, optimizer, scheduler, device, iteration, max_iter, config,
            use_distributed, use_amp=arguments["use_amp"]
        )
        total_epoch_time = time.time() - start_epoch_time

        epoch_time_str = str(datetime.timedelta(seconds=total_epoch_time))
        logger.info(
            "Total epoch time: {} ({:.4f} s / it)".format(
                epoch_time_str, total_epoch_time / (iteration_end - iteration)
            )
        )
        arguments["iteration"] = iteration_end

        if checkpointer:
            checkpointer("model_{}".format(arguments["iteration"]), **arguments)


        if config.DO_ONLINE_MAP_EVAL:

            print_mlperf(key=mlperf_log.EVAL_START, value=epoch)
            results = test(config, model, use_distributed)

            print_mlperf(key=mlperf_log.EVAL_TARGET, value={"BBOX": 0.377,
                                                              "SEGM": 0.339})
            map_tensor = torch.tensor((0, 0), dtype=torch.float32, device=torch.device("cuda"))

            if results: #Rank 0 process
                bbox_map = results['bbox']
                mask_map = results['segm']
                map_tensor = torch.tensor((bbox_map, mask_map), dtype=torch.float32, device=torch.device("cuda"))

            if use_distributed:
                torch.distributed.broadcast(map_tensor, 0)
                bbox_map = map_tensor[0].item()
                mask_map = map_tensor[1].item()


            logger.info("bbox map: {} mask map: {}".format(bbox_map, mask_map))
            print_mlperf(key=mlperf_log.EVAL_ACCURACY, value={"epoch":epoch, "value":{"BBOX":bbox_map, "SEGM":mask_map}})
            print_mlperf(key=mlperf_log.EVAL_STOP)

            # Terminating condition
            if bbox_map >= args.min_bbox_map and mask_map >= args.min_mask_map:
                logger.info("Target MAP reached. Exiting...")
                print_mlperf(key=mlperf_log.RUN_STOP, value={"success":True})
                break
        epoch += 1

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (arguments["iteration"])
        )
    )
    logger.info(
        "&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format((arguments["iteration"] * 1.0) / total_training_time)
        )
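
The per-epoch reseeding in do_train works through the sampler's set_epoch hook. A small self-contained illustration (hypothetical values, assuming a torch DistributedSampler underneath) of why calling it each epoch matters:

import torch.utils.data as tud

dataset = list(range(8))
sampler = tud.distributed.DistributedSampler(dataset, num_replicas=1, rank=0)

sampler.set_epoch(0)
order_epoch0 = list(sampler)
sampler.set_epoch(1)
order_epoch1 = list(sampler)
# Different epoch values seed different shuffles of the same index set.
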
def inference(
        model,
        data_loader,
        iou_types=("bbox", ),
        box_only=False,
        device="cuda",
        expected_results=(),
        expected_results_sigma_tol=4,
):

    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = (torch.distributed.get_world_size()
                   if torch.distributed.is_initialized() else 1)
    logger = logging.getLogger("maskrcnn_benchmark.inference")
    dataset = data_loader.dataset

    print_mlperf(key=mlperf_log.EVAL_SIZE, value=len(dataset))

    logger.info("Start evaluation on {} images".format(len(dataset)))
    start_time = time.time()
    predictions = compute_on_dataset(model, data_loader, device)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Total inference time: {} ({} s / img per device, on {} devices)".
        format(total_time_str, total_time * num_devices / len(dataset),
               num_devices))

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not _is_main_process():
        return

    for p in predictions:
        p.add_field('mask1', p.get_field('mask').float())

    if box_only:
        logger.info("Evaluating bbox proposals")
        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
        res = COCOResults("box_proposal")
        for limit in [100, 1000]:
            for area, suffix in areas.items():
                stats = evaluate_box_proposals(predictions,
                                               dataset,
                                               area=area,
                                               limit=limit)
                key = "AR{}@{:d}".format(suffix, limit)
                res.results["box_proposal"][key] = stats["ar"].item()
        # logger.info("AR@1000: {}".format(results["ar"].item()))
        logger.info(res)
        check_expected_results(res, expected_results,
                               expected_results_sigma_tol)

        # return the result
        return res
    logger.info("Preparing results for COCO format")
    coco_results = {}
    if "bbox" in iou_types:
        logger.info("Preparing bbox results")
        coco_results["bbox"] = prepare_for_coco_detection(predictions, dataset)
    if "segm" in iou_types:
        logger.info("Preparing segm results")
        num_proc = 20
        local_len = len(predictions) // num_proc
        chunks = []
        for i in range(num_proc):
            chunks.append(
                (predictions[i * local_len:(i + 1) * local_len], dataset, i))

        def init(env):
            os.environ = env

        with Pool(num_proc, initializer=init,
                  initargs=(os.environ.copy(), )) as p:
            t = p.starmap(prepare_for_coco_segmentation, chunks)

        tmp = []
        for i in range(num_proc):
            tmp += t[i]
        coco_results["segm"] = tmp

        #coco_results["segm"] = prepare_for_coco_segmentation(predictions, dataset)

    results = COCOResults(*iou_types)
    logger.info("Evaluating predictions")

    # Extracting only required metrics
    map_required = {}
    for iou_type in iou_types:
        with tempfile.NamedTemporaryFile() as f:
            res, mean_ap = evaluate_predictions_on_coco(
                dataset.coco, coco_results[iou_type], f.name, iou_type)
            map_required[iou_type] = mean_ap
            results.update(res)
    check_expected_results(res, expected_results, expected_results_sigma_tol)
    # returning results
    return map_required
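
For completeness, a hypothetical usage of the inference() helper above; cfg, model, distributed and make_data_loader are assumed to exist as in the earlier examples, and only the main process receives the mAP dict:

test_loader = make_data_loader(cfg, is_train=False, is_distributed=distributed)
map_required = inference(
    model,
    test_loader,
    iou_types=("bbox", "segm"),
    box_only=False,
    device=cfg.MODEL.DEVICE,
)
if map_required is not None:  # rank 0 only
    print("bbox mAP:", map_required["bbox"], "segm mAP:", map_required["segm"])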